Extract Table from PDF Document
Contents
[
Hide
]
The following code snippet also work with Aspose.PDF.Drawing library.
Extract Table from PDF
public static void Extract_Table()
{
// Load source PDF document
Document document = new Document(@"c:\tmp\the_worlds_cities_in_2018_data_booklet 7.pdf");
foreach (var page in document.Pages)
{
Aspose.Pdf.Text.TableAbsorber absorber = new Aspose.Pdf.Text.TableAbsorber();
absorber.Visit(page);
foreach (AbsorbedTable table in absorber.TableList)
{
foreach (AbsorbedRow row in table.RowList)
{
foreach (AbsorbedCell cell in row.CellList)
{
TextFragment textfragment = new TextFragment();
TextFragmentCollection textFragmentCollection = cell.TextFragments;
foreach (TextFragment fragment in textFragmentCollection)
{
string txt = "";
foreach (TextSegment seg in fragment.Segments)
{
txt += seg.Text;
}
Console.WriteLine(txt);
}
}
}
}
}
}
Extract table border as Image
The page borders are path drawing operations. Therefore the Pdf->Html processing logic just performs drawing instructions and places the background behind the text. So, to repeat the logic, you have to process contents operators manually and draw the graphics yourself. Also please note that following code snippet might not work accurately for various PDF files but if you encounter any issue, please feel free to contact. This code was developed for specific PDF files. The following code snippet shows the steps to extract the table border as Image from PDF document.
// For complete examples and data files, please go to https://github.com/aspose-pdf/Aspose.PDF-for-.NET
// The path to the documents directory.
string dataDir = RunExamples.GetDataDir_AsposePdf_Tables();
Document document = new Document(dataDir + "input.pdf");
Stack graphicsState = new Stack();
System.Drawing.Bitmap bitmap = new System.Drawing.Bitmap((int)document.Pages[1].PageInfo.Width, (int)document.Pages[1].PageInfo.Height);
System.Drawing.Drawing2D.GraphicsPath graphicsPath = new System.Drawing.Drawing2D.GraphicsPath();
// Default ctm matrix value is 1,0,0,1,0,0
System.Drawing.Drawing2D.Matrix lastCTM = new System.Drawing.Drawing2D.Matrix(1, 0, 0, -1, 0, 0);
// System.Drawing coordinate system is top left based, while pdf coordinate system is low left based, so we have to apply the inversion matrix
System.Drawing.Drawing2D.Matrix inversionMatrix = new System.Drawing.Drawing2D.Matrix(1, 0, 0, -1, 0, (float)document.Pages[1].PageInfo.Height);
System.Drawing.PointF lastPoint = new System.Drawing.PointF(0, 0);
System.Drawing.Color fillColor = System.Drawing.Color.FromArgb(0, 0, 0);
System.Drawing.Color strokeColor = System.Drawing.Color.FromArgb(0, 0, 0);
using (System.Drawing.Graphics gr = System.Drawing.Graphics.FromImage(bitmap))
{
gr.SmoothingMode = SmoothingMode.HighQuality;
graphicsState.Push(new System.Drawing.Drawing2D.Matrix(1, 0, 0, 1, 0, 0));
// Process all the contents commands
foreach (Operator op in document.Pages[1].Contents)
{
Operators.GSave opSaveState = op as Operators.GSave;
Operators.GRestore opRestoreState = op as Operators.GRestore;
Operators.ConcatenateMatrix opCtm = op as Operators.ConcatenateMatrix;
Operators.MoveTo opMoveTo = op as Operators.MoveTo;
Operators.LineTo opLineTo = op as Operators.LineTo;
Operators.Re opRe = op as Operators.Re;
Operators.EndPath opEndPath = op as Operators.EndPath;
Operators.Stroke opStroke = op as Operators.Stroke;
Operators.Fill opFill = op as Operators.Fill;
Operators.EOFill opEOFill = op as Operators.EOFill;
Operators.SetRGBColor opRGBFillColor = op as Operators.SetRGBColor;
Operators.SetRGBColorStroke opRGBStrokeColor = op as Operators.SetRGBColorStroke;
if (opSaveState != null)
{
// Save previous state and push current state to the top of the stack
graphicsState.Push(((System.Drawing.Drawing2D.Matrix)graphicsState.Peek()).Clone());
lastCTM = (System.Drawing.Drawing2D.Matrix)graphicsState.Peek();
}
else if (opRestoreState != null)
{
// Throw away current state and restore previous one
graphicsState.Pop();
lastCTM = (System.Drawing.Drawing2D.Matrix)graphicsState.Peek();
}
else if (opCtm != null)
{
System.Drawing.Drawing2D.Matrix cm = new System.Drawing.Drawing2D.Matrix(
(float)opCtm.Matrix.A,
(float)opCtm.Matrix.B,
(float)opCtm.Matrix.C,
(float)opCtm.Matrix.D,
(float)opCtm.Matrix.E,
(float)opCtm.Matrix.F);
// Multiply current matrix with the state matrix
((System.Drawing.Drawing2D.Matrix)graphicsState.Peek()).Multiply(cm);
lastCTM = (System.Drawing.Drawing2D.Matrix)graphicsState.Peek();
}
else if (opMoveTo != null)
{
lastPoint = new System.Drawing.PointF((float)opMoveTo.X, (float)opMoveTo.Y);
}
else if (opLineTo != null)
{
System.Drawing.PointF linePoint = new System.Drawing.PointF((float)opLineTo.X, (float)opLineTo.Y);
graphicsPath.AddLine(lastPoint, linePoint);
lastPoint = linePoint;
}
else if (opRe != null)
{
System.Drawing.RectangleF re = new System.Drawing.RectangleF((float)opRe.X, (float)opRe.Y, (float)opRe.Width, (float)opRe.Height);
graphicsPath.AddRectangle(re);
}
else if (opEndPath != null)
{
graphicsPath = new System.Drawing.Drawing2D.GraphicsPath();
}
else if (opRGBFillColor != null)
{
fillColor = opRGBFillColor.getColor();
}
else if (opRGBStrokeColor != null)
{
strokeColor = opRGBStrokeColor.getColor();
}
else if (opStroke != null)
{
graphicsPath.Transform(lastCTM);
graphicsPath.Transform(inversionMatrix);
gr.DrawPath(new System.Drawing.Pen(strokeColor), graphicsPath);
graphicsPath = new System.Drawing.Drawing2D.GraphicsPath();
}
else if (opFill != null)
{
graphicsPath.FillMode = FillMode.Winding;
graphicsPath.Transform(lastCTM);
graphicsPath.Transform(inversionMatrix);
gr.FillPath(new System.Drawing.SolidBrush(fillColor), graphicsPath);
graphicsPath = new System.Drawing.Drawing2D.GraphicsPath();
}
else if (opEOFill != null)
{
graphicsPath.FillMode = FillMode.Alternate;
graphicsPath.Transform(lastCTM);
graphicsPath.Transform(inversionMatrix);
gr.FillPath(new System.Drawing.SolidBrush(fillColor), graphicsPath);
graphicsPath = new System.Drawing.Drawing2D.GraphicsPath();
}
}
}
bitmap.Save(dataDir + "ExtractBorder_out.png", ImageFormat.Png);