Extract Tagged Content from PDF
In this article you will learn how to extract tagged content from a PDF document using C#.
The following code snippet also works with the Aspose.PDF.Drawing library.
Getting Tagged PDF Content
In order to get content of PDF Document with Tagged Text, Aspose.PDF offers TaggedContent property of Document class.
Following code snippet shows how to get content of a PDF document with Tagged Text:
Getting Root Structure
In order to get the root structure of a Tagged PDF Document, Aspose.PDF offers the StructTreeRootElement property of the ITaggedContent interface and StructureElement. The following code snippet shows how to get the root structure of a Tagged PDF Document:
Accessing Child Elements
In order to access child elements of a Tagged PDF Document, Aspose.PDF offers ElementList class. Following code snippet shows how to access child elements of a Tagged PDF Document:
Tagging Images in Existing PDF
In order to tag images in existing PDF document, Aspose.PDF offers FindElements method of StructureElement class. You can add alternative text for figures using AlternativeText property of FigureElement class.
Following code snippet shows how to tag images in existing PDF document:
.NET Core 3.1
// For complete examples and data files, visit https://github.com/aspose-pdf/Aspose.PDF-for-.NET
/// <summary>
/// Tags every figure found in "TH.pdf" with alternative text and a BBox layout
/// attribute, re-parents the first span of the first table under the first paragraph
/// of that table's first TD, saves the result as "TH_out.pdf", and finally validates
/// the saved file against PDF/UA-1, printing the outcome to the console.
/// </summary>
private static void TagImages()
{
    // Working directory plus the three file paths derived from it
    string workingDir = RunExamples.GetDataDir_AsposePdf_WorkingDocuments();
    string sourcePath = workingDir + "TH.pdf";
    string resultPath = workingDir + "TH_out.pdf";
    string reportPath = workingDir + "TH_out.xml";

    // Phase 1: edit the logical structure of the source document and save it
    using (var sourceDocument = new Aspose.Pdf.Document(sourcePath))
    {
        Aspose.Pdf.Tagged.ITaggedContent content = sourceDocument.TaggedContent;
        Aspose.Pdf.LogicalStructure.StructureElement root = content.RootElement;

        // Title of the tagged PDF document
        content.SetTitle("Document with images");

        // Visit every figure in the tree (recursive search)
        foreach (Aspose.Pdf.LogicalStructure.FigureElement figure in root.FindElements<Aspose.Pdf.LogicalStructure.FigureElement>(true))
        {
            DescribeFigure(figure);
        }

        // Locate the misplaced span and its intended paragraph: both are looked up
        // inside the first table, the paragraph specifically inside the first TD
        Aspose.Pdf.LogicalStructure.TableElement firstTable = root.FindElements<Aspose.Pdf.LogicalStructure.TableElement>(true)[0];
        Aspose.Pdf.LogicalStructure.SpanElement misplacedSpan = firstTable.FindElements<Aspose.Pdf.LogicalStructure.SpanElement>(true)[0];
        Aspose.Pdf.LogicalStructure.TableTDElement firstCell = firstTable.FindElements<Aspose.Pdf.LogicalStructure.TableTDElement>(true)[0];
        Aspose.Pdf.LogicalStructure.ParagraphElement targetParagraph = firstCell.FindElements<Aspose.Pdf.LogicalStructure.ParagraphElement>(true)[0];

        // Re-parent the span under the paragraph
        misplacedSpan.ChangeParentElement(targetParagraph);

        // Persist the modified document
        sourceDocument.Save(resultPath);
    }

    // Phase 2: reopen the saved file and check it against PDF/UA-1
    using (var savedDocument = new Aspose.Pdf.Document(resultPath))
    {
        bool compliant = savedDocument.Validate(reportPath, Aspose.Pdf.PdfFormat.PDF_UA_1);
        string message = String.Format("PDF/UA compliance: {0}", compliant);
        Console.WriteLine(message);
    }

    // Sets the alternative text of a figure and stores a (0, 0, 100, 100) rectangle
    // as its BBox attribute in the Layout attribute set
    void DescribeFigure(Aspose.Pdf.LogicalStructure.FigureElement figure)
    {
        figure.AlternativeText = "Figure alternative text (technique 2)";

        var boundingBox = new Aspose.Pdf.LogicalStructure.StructureAttribute(Aspose.Pdf.LogicalStructure.AttributeKey.BBox);
        boundingBox.SetRectangleValue(new Aspose.Pdf.Rectangle(0.0, 0.0, 100.0, 100.0));

        figure.Attributes
            .GetAttributes(Aspose.Pdf.LogicalStructure.AttributeOwnerStandard.Layout)
            .SetAttribute(boundingBox);
    }
}
.NET 8
// For complete examples and data files, visit https://github.com/aspose-pdf/Aspose.PDF-for-.NET
/// <summary>
/// Opens "TH.pdf", assigns alternative text and a BBox layout attribute to each
/// figure, moves the first span of the first table into the first paragraph of that
/// table's first TD, writes the result to "TH_out.pdf", then validates the written
/// file against PDF/UA-1 and prints the result.
/// </summary>
private static void TagImages()
{
    // Build the input, output and validation-log paths from the working directory
    string folder = RunExamples.GetDataDir_AsposePdf_WorkingDocuments();
    string inputPdf = string.Concat(folder, "TH.pdf");
    string outputPdf = string.Concat(folder, "TH_out.pdf");
    string validationLog = string.Concat(folder, "TH_out.xml");

    // Open the input document; it stays open until the method returns
    using var input = new Aspose.Pdf.Document(inputPdf);

    // Tagged content and the root of its structure tree
    var tagged = input.TaggedContent;
    var root = tagged.RootElement;

    // Document title
    tagged.SetTitle("Document with images");

    // Process every figure in the structure tree (recursive search)
    var figures = root.FindElements<Aspose.Pdf.LogicalStructure.FigureElement>(true);
    foreach (Aspose.Pdf.LogicalStructure.FigureElement figure in figures)
    {
        // Alternative text for the figure
        figure.AlternativeText = "Figure alternative text (technique 2)";

        // BBox attribute holding a (0, 0, 100, 100) rectangle
        var bbox = new Aspose.Pdf.LogicalStructure.StructureAttribute(Aspose.Pdf.LogicalStructure.AttributeKey.BBox);
        var bounds = new Aspose.Pdf.Rectangle(0.0, 0.0, 100.0, 100.0);
        bbox.SetRectangleValue(bounds);

        // Store the attribute in the figure's Layout attribute set
        var layout = figure.Attributes.GetAttributes(Aspose.Pdf.LogicalStructure.AttributeOwnerStandard.Layout);
        layout.SetAttribute(bbox);
    }

    // First table in the document
    var tables = root.FindElements<Aspose.Pdf.LogicalStructure.TableElement>(true);
    var table = tables[0];

    // First span anywhere inside that table (the one to be moved)
    var spans = table.FindElements<Aspose.Pdf.LogicalStructure.SpanElement>(true);
    var span = spans[0];

    // First TD of the table, and the first paragraph inside that TD
    var cells = table.FindElements<Aspose.Pdf.LogicalStructure.TableTDElement>(true);
    var firstCell = cells[0];
    var paragraphs = firstCell.FindElements<Aspose.Pdf.LogicalStructure.ParagraphElement>(true);
    var destination = paragraphs[0];

    // Move the span under the paragraph
    span.ChangeParentElement(destination);

    // Write the modified document
    input.Save(outputPdf);

    // Reopen the written file and validate it against PDF/UA-1
    using var output = new Aspose.Pdf.Document(outputPdf);
    bool passed = output.Validate(validationLog, Aspose.Pdf.PdfFormat.PDF_UA_1);
    Console.WriteLine("PDF/UA compliance: {0}", passed);
}