Extrair Conteúdo Marcado de PDF
Neste artigo, você aprenderá como extrair conteúdo marcado de documentos PDF usando C#.
O seguinte trecho de código também funciona com a biblioteca Aspose.PDF.Drawing.
Obtendo Conteúdo PDF Marcado
Para obter o conteúdo de um Documento PDF com Texto Marcado, a Aspose.PDF oferece a propriedade TaggedContent da classe Document.
O seguinte trecho de código mostra como obter o conteúdo de um documento PDF com Texto Marcado:
Obtendo Estrutura Raiz
Para obter a estrutura raiz do Documento PDF Marcado, a Aspose.PDF oferece a propriedade StructTreeRootElement da interface ITaggedContent e a classe StructureElement. O seguinte trecho de código mostra como obter a estrutura raiz do Documento PDF Marcado:
Acessando Elementos Filhos
Para acessar elementos filhos de um Documento PDF Marcado, a Aspose.PDF oferece a classe ElementList. O seguinte trecho de código mostra como acessar elementos filhos de um Documento PDF Marcado:
Marcando Imagens em PDF Existente
Para marcar imagens em um documento PDF existente, a Aspose.PDF oferece o método FindElements da classe StructureElement. Você pode adicionar texto alternativo para figuras usando a propriedade AlternativeText da classe FigureElement.
O seguinte trecho de código mostra como marcar imagens em um documento PDF existente:
.NET Core 3.1
Copy
/// <summary>
/// Tags the figures of the sample document "TH.pdf", moves the first span of the
/// first table under the first paragraph of the table's first cell, saves the
/// result as "TH_out.pdf", and then validates that file against PDF/UA-1.
/// </summary>
/// <remarks>
/// The validation result is printed to the console; the validation log is
/// requested at "TH_out.xml" in the same data directory.
/// </remarks>
private static void TagImages()
{
    var workingDir = RunExamples.GetDataDir_AsposePdf_WorkingDocuments();
    var taggedPdfPath = workingDir + "TH_out.pdf";

    // Gives one figure its alternative text and a BBox entry in its Layout attributes.
    static void DescribeFigure(Aspose.Pdf.LogicalStructure.FigureElement figure)
    {
        figure.AlternativeText = "Figure alternative text (technique 2)";

        var boundingBox = new Aspose.Pdf.LogicalStructure.StructureAttribute(
            Aspose.Pdf.LogicalStructure.AttributeKey.BBox);
        boundingBox.SetRectangleValue(new Aspose.Pdf.Rectangle(0.0, 0.0, 100.0, 100.0));

        var layoutAttributes = figure.Attributes.GetAttributes(
            Aspose.Pdf.LogicalStructure.AttributeOwnerStandard.Layout);
        layoutAttributes.SetAttribute(boundingBox);
    }

    // The source document is scoped so it is disposed before the output is reopened.
    using (var sourceDocument = new Aspose.Pdf.Document(workingDir + "TH.pdf"))
    {
        Aspose.Pdf.Tagged.ITaggedContent tagged = sourceDocument.TaggedContent;
        Aspose.Pdf.LogicalStructure.StructureElement root = tagged.RootElement;
        tagged.SetTitle("Document with images");

        // 'true' is forwarded to FindElements unchanged for every lookup below.
        foreach (var figure in root.FindElements<Aspose.Pdf.LogicalStructure.FigureElement>(true))
        {
            DescribeFigure(figure);
        }

        // Each [0] takes the first match; the sample assumes every lookup finds one.
        var firstTable = root.FindElements<Aspose.Pdf.LogicalStructure.TableElement>(true)[0];
        var firstSpan = firstTable.FindElements<Aspose.Pdf.LogicalStructure.SpanElement>(true)[0];
        var firstCell = firstTable.FindElements<Aspose.Pdf.LogicalStructure.TableTDElement>(true)[0];
        var cellParagraph = firstCell.FindElements<Aspose.Pdf.LogicalStructure.ParagraphElement>(true)[0];
        firstSpan.ChangeParentElement(cellParagraph);

        sourceDocument.Save(taggedPdfPath);
    }

    using (var taggedDocument = new Aspose.Pdf.Document(taggedPdfPath))
    {
        bool compliant = taggedDocument.Validate(workingDir + "TH_out.xml", Aspose.Pdf.PdfFormat.PDF_UA_1);
        Console.WriteLine("PDF/UA compliance: {0}", compliant);
    }
}
.NET 8
Copy
/// <summary>
/// Tags the figures of the sample document "TH.pdf", moves the first span of the
/// first table under the first paragraph of the table's first cell, saves the
/// result as "TH_out.pdf", and then validates that file against PDF/UA-1.
/// </summary>
/// <remarks>
/// The validation log is written to "TH_out.xml" so that it does not target the
/// PDF file that is open for validation. The result is printed to the console.
/// </remarks>
private static void TagImages()
{
    var dataDir = RunExamples.GetDataDir_AsposePdf_WorkingDocuments();

    using var document1 = new Aspose.Pdf.Document(dataDir + "TH.pdf");

    Aspose.Pdf.Tagged.ITaggedContent taggedContent = document1.TaggedContent;
    Aspose.Pdf.LogicalStructure.StructureElement rootElement = taggedContent.RootElement;
    taggedContent.SetTitle("Document with images");

    // Give every figure an alternative text and a BBox entry in its Layout attributes.
    foreach (Aspose.Pdf.LogicalStructure.FigureElement figureElement in rootElement.FindElements<Aspose.Pdf.LogicalStructure.FigureElement>(true))
    {
        figureElement.AlternativeText = "Figure alternative text (technique 2)";

        var bboxAttribute = new Aspose.Pdf.LogicalStructure.StructureAttribute(Aspose.Pdf.LogicalStructure.AttributeKey.BBox);
        bboxAttribute.SetRectangleValue(new Aspose.Pdf.Rectangle(0.0, 0.0, 100.0, 100.0));

        Aspose.Pdf.LogicalStructure.StructureAttributes figureLayoutAttributes = figureElement.Attributes.GetAttributes(Aspose.Pdf.LogicalStructure.AttributeOwnerStandard.Layout);
        figureLayoutAttributes.SetAttribute(bboxAttribute);
    }

    // Each [0] takes the first match; the sample assumes "TH.pdf" contains at least
    // one table holding a span and a cell with a paragraph.
    Aspose.Pdf.LogicalStructure.TableElement tableElement = rootElement.FindElements<Aspose.Pdf.LogicalStructure.TableElement>(true)[0];
    Aspose.Pdf.LogicalStructure.SpanElement spanElement = tableElement.FindElements<Aspose.Pdf.LogicalStructure.SpanElement>(true)[0];
    Aspose.Pdf.LogicalStructure.TableTDElement firstTdElement = tableElement.FindElements<Aspose.Pdf.LogicalStructure.TableTDElement>(true)[0];
    Aspose.Pdf.LogicalStructure.ParagraphElement paragraph = firstTdElement.FindElements<Aspose.Pdf.LogicalStructure.ParagraphElement>(true)[0];

    // Re-parent the span under the paragraph of the first table cell.
    spanElement.ChangeParentElement(paragraph);

    document1.Save(dataDir + "TH_out.pdf");

    // Reopen the saved file and check it against PDF/UA-1. The log goes to a
    // separate .xml file; pointing it at "TH_out.pdf" would target the very
    // document being validated.
    using var document2 = new Aspose.Pdf.Document(dataDir + "TH_out.pdf");
    bool isPdfUaCompliance = document2.Validate(dataDir + "TH_out.xml", Aspose.Pdf.PdfFormat.PDF_UA_1);
    Console.WriteLine($"PDF/UA compliance: {isPdfUaCompliance}");
}