Extrair Tabela de Documento PDF
Contents
[
Hide
]
O seguinte trecho de código também funciona com a biblioteca Aspose.PDF.Drawing.
Extrair Tabela de PDF
public static void Extract_Table()
{
// Carregar o documento PDF de origem
Aspose.Pdf.Document pdfDocument = new Aspose.Pdf.Document(@"c:\tmp\the_worlds_cities_in_2018_data_booklet 7.pdf");
foreach (var page in pdfDocument.Pages)
{
Aspose.Pdf.Text.TableAbsorber absorber = new Aspose.Pdf.Text.TableAbsorber();
absorber.Visit(page);
foreach (AbsorbedTable table in absorber.TableList)
{
foreach (AbsorbedRow row in table.RowList)
{
foreach (AbsorbedCell cell in row.CellList)
{
TextFragment textfragment = new TextFragment();
TextFragmentCollection textFragmentCollection = cell.TextFragments;
foreach (TextFragment fragment in textFragmentCollection)
{
string txt = "";
foreach (TextSegment seg in fragment.Segments)
{
txt += seg.Text;
}
Console.WriteLine(txt);
}
}
}
}
}
}
Extrair borda da tabela como imagem
As bordas da página são operações de desenho de caminho. Portanto, a lógica de processamento de Pdf para Html apenas executa instruções de desenho e coloca o plano de fundo atrás do texto. Assim, para repetir a lógica, você tem que processar operadores de conteúdo manualmente e desenhar os gráficos você mesmo. Observe também que o seguinte trecho de código pode não funcionar com precisão para vários arquivos PDF, mas se você encontrar algum problema, sinta-se à vontade para entrar em contato. Este código foi desenvolvido para arquivos PDF específicos. O seguinte trecho de código mostra os passos para extrair a borda da tabela como imagem de um documento PDF.
// Para exemplos completos e arquivos de dados, por favor, vá para https://github.com/aspose-pdf/Aspose.PDF-for-.NET
// O caminho para o diretório de documentos.
string dataDir = RunExamples.GetDataDir_AsposePdf_Tables();
Document doc = new Document(dataDir + "input.pdf");
Stack graphicsState = new Stack();
System.Drawing.Bitmap bitmap = new System.Drawing.Bitmap((int)doc.Pages[1].PageInfo.Width, (int)doc.Pages[1].PageInfo.Height);
System.Drawing.Drawing2D.GraphicsPath graphicsPath = new System.Drawing.Drawing2D.GraphicsPath();
// Valor padrão da matriz ctm é 1,0,0,1,0,0
System.Drawing.Drawing2D.Matrix lastCTM = new System.Drawing.Drawing2D.Matrix(1, 0, 0, -1, 0, 0);
// O sistema de coordenadas do System.Drawing é baseado no canto superior esquerdo, enquanto o sistema de coordenadas do pdf é baseado no canto inferior esquerdo, então temos que aplicar a matriz de inversão
System.Drawing.Drawing2D.Matrix inversionMatrix = new System.Drawing.Drawing2D.Matrix(1, 0, 0, -1, 0, (float)doc.Pages[1].PageInfo.Height);
System.Drawing.PointF lastPoint = new System.Drawing.PointF(0, 0);
System.Drawing.Color fillColor = System.Drawing.Color.FromArgb(0, 0, 0);
System.Drawing.Color strokeColor = System.Drawing.Color.FromArgb(0, 0, 0);
using (System.Drawing.Graphics gr = System.Drawing.Graphics.FromImage(bitmap))
{
gr.SmoothingMode = SmoothingMode.HighQuality;
graphicsState.Push(new System.Drawing.Drawing2D.Matrix(1, 0, 0, 1, 0, 0));
// Processar todos os comandos de conteúdo
foreach (Operator op in doc.Pages[1].Contents)
{
Aspose.Pdf.Operators.GSave opSaveState = op as Aspose.Pdf.Operators.GSave;
Aspose.Pdf.Operators.GRestore opRestoreState = op as Aspose.Pdf.Operators.GRestore;
Aspose.Pdf.Operators.ConcatenateMatrix opCtm = op as Aspose.Pdf.Operators.ConcatenateMatrix;
Aspose.Pdf.Operators.MoveTo opMoveTo = op as Aspose.Pdf.Operators.MoveTo;
Aspose.Pdf.Operators.LineTo opLineTo = op as Aspose.Pdf.Operators.LineTo;
Aspose.Pdf.Operators.Re opRe = op as Aspose.Pdf.Operators.Re;
Aspose.Pdf.Operators.EndPath opEndPath = op as Aspose.Pdf.Operators.EndPath;
Aspose.Pdf.Operators.Stroke opStroke = op as Aspose.Pdf.Operators.Stroke;
Aspose.Pdf.Operators.Fill opFill = op as Aspose.Pdf.Operators.Fill;
Aspose.Pdf.Operators.EOFill opEOFill = op as Aspose.Pdf.Operators.EOFill;
Aspose.Pdf.Operators.SetRGBColor opRGBFillColor = op as Aspose.Pdf.Operators.SetRGBColor;
Aspose.Pdf.Operators.SetRGBColorStroke opRGBStrokeColor = op as Aspose.Pdf.Operators.SetRGBColorStroke;
if (opSaveState != null)
{
// Salvar estado anterior e empurrar estado atual para o topo da pilha
graphicsState.Push(((System.Drawing.Drawing2D.Matrix)graphicsState.Peek()).Clone());
lastCTM = (System.Drawing.Drawing2D.Matrix)graphicsState.Peek();
}
else if (opRestoreState != null)
{
// Descartar estado atual e restaurar o anterior
graphicsState.Pop();
lastCTM = (System.Drawing.Drawing2D.Matrix)graphicsState.Peek();
}
else if (opCtm != null)
{
System.Drawing.Drawing2D.Matrix cm = new System.Drawing.Drawing2D.Matrix(
(float)opCtm.Matrix.A,
(float)opCtm.Matrix.B,
(float)opCtm.Matrix.C,
(float)opCtm.Matrix.D,
(float)opCtm.Matrix.E,
(float)opCtm.Matrix.F);
// Multiplicar matriz atual com a matriz de estado
((System.Drawing.Drawing2D.Matrix)graphicsState.Peek()).Multiply(cm);
lastCTM = (System.Drawing.Drawing2D.Matrix)graphicsState.Peek();
}
else if (opMoveTo != null)
{
lastPoint = new System.Drawing.PointF((float)opMoveTo.X, (float)opMoveTo.Y);
}
else if (opLineTo != null)
{
System.Drawing.PointF linePoint = new System.Drawing.PointF((float)opLineTo.X, (float)opLineTo.Y);
graphicsPath.AddLine(lastPoint, linePoint);
lastPoint = linePoint;
}
else if (opRe != null)
{
System.Drawing.RectangleF re = new System.Drawing.RectangleF((float)opRe.X, (float)opRe.Y, (float)opRe.Width, (float)opRe.Height);
graphicsPath.AddRectangle(re);
}
else if (opEndPath != null)
{
graphicsPath = new System.Drawing.Drawing2D.GraphicsPath();
}
else if (opRGBFillColor != null)
{
fillColor = opRGBFillColor.getColor();
}
else if (opRGBStrokeColor != null)
{
strokeColor = opRGBStrokeColor.getColor();
}
else if (opStroke != null)
{
graphicsPath.Transform(lastCTM);
graphicsPath.Transform(inversionMatrix);
gr.DrawPath(new System.Drawing.Pen(strokeColor), graphicsPath);
graphicsPath = new System.Drawing.Drawing2D.GraphicsPath();
}
else if (opFill != null)
{
graphicsPath.FillMode = FillMode.Winding;
graphicsPath.Transform(lastCTM);
graphicsPath.Transform(inversionMatrix);
gr.FillPath(new System.Drawing.SolidBrush(fillColor), graphicsPath);
graphicsPath = new System.Drawing.Drawing2D.GraphicsPath();
}
else if (opEOFill != null)
{
graphicsPath.FillMode = FillMode.Alternate;
graphicsPath.Transform(lastCTM);
graphicsPath.Transform(inversionMatrix);
gr.FillPath(new System.Drawing.SolidBrush(fillColor), graphicsPath);
graphicsPath = new System.Drawing.Drawing2D.GraphicsPath();
}
}
}
dataDir = dataDir + "ExtractBorder_out.png";
bitmap.Save(dataDir, ImageFormat.Png);