استخراج فقرة من PDF
Contents
[
Hide
]
استخراج النص من مستند PDF في شكل فقرات
يمكننا الحصول على النص من مستند PDF عن طريق البحث عن نص معيّن (باستخدام «النص العادي» أو «التعبيرات النمطية») في صفحة واحدة أو في المستند بأكمله، كما يمكننا الحصول على النص الكامل لصفحة واحدة أو لنطاق من الصفحات أو للمستند بأكمله.
```cpp
// Forward declarations of the two drawing helpers used by
// Parsing::ExtractParagraph(); both are defined later in this file.
// Declaring them `static` here gives them internal linkage.
static void DrawRectangleOnPage(System::SmartPtr<Rectangle> rectangle, System::SmartPtr<Page> page);
static void DrawPolygonOnPage(System::ArrayPtr<System::SmartPtr<Point>> polygon, System::SmartPtr<Page> page);
// Runs ParagraphAbsorber over one page of sample.pdf, outlines every section
// it reports with a rectangle and every paragraph with a polygon, and saves
// the annotated document as output_out.pdf in the same directory.
void Parsing::ExtractParagraph()
{
    std::clog << __func__ << ": Start" << std::endl;

    // The path to the documents directory.
    String _dataDir("C:\\Samples\\Parsing\\");

    auto doc = MakeObject<Document>(_dataDir + u"sample.pdf");

    // NOTE(review): Aspose.PDF page collections are presumably 1-based, so
    // index 1 should be the first page - confirm against the API reference.
    auto page = doc->get_Pages()->idx_get(1);

    auto absorber = MakeObject<ParagraphAbsorber>();
    absorber->Visit(page);

    // Only one page was visited, so the first markup entry is used.
    auto markup = absorber->get_PageMarkups()->idx_get(0);

    // `const auto&` binds whether the iterator yields a reference or a
    // temporary smart pointer, and avoids an extra ref-count copy.
    for (const auto& section : markup->get_Sections())
    {
        DrawRectangleOnPage(section->get_Rectangle(), page);

        for (const auto& paragraph : section->get_Paragraphs())
        {
            DrawPolygonOnPage(paragraph->get_Points(), page);
        }
    }

    doc->Save(_dataDir + u"output_out.pdf");
}
// Appends operators to the page's content stream that stroke the outline of
// `rectangle` in green (RGB 0,1,0) with a line width of 2. The drawing is
// wrapped in GSave/GRestore so the graphics state changes stay local.
void DrawRectangleOnPage(System::SmartPtr<Rectangle> rectangle, System::SmartPtr<Page> page)
{
    namespace Ops = Aspose::Pdf::Operators;

    page->get_Contents()->Add(MakeObject<Ops::GSave>());

    // Identity transform: coordinates are used exactly as supplied.
    page->get_Contents()->Add(MakeObject<Ops::ConcatenateMatrix>(1, 0, 0, 1, 0, 0));
    page->get_Contents()->Add(MakeObject<Ops::SetRGBColorStroke>(0, 1, 0));
    page->get_Contents()->Add(MakeObject<Ops::SetLineWidth>(2));

    // Rectangle path: lower-left corner plus width and height.
    auto outline = MakeObject<Ops::Re>(
        rectangle->get_LLX(),
        rectangle->get_LLY(),
        rectangle->get_Width(),
        rectangle->get_Height());
    page->get_Contents()->Add(outline);

    page->get_Contents()->Add(MakeObject<Ops::ClosePathStroke>());
    page->get_Contents()->Add(MakeObject<Ops::GRestore>());
}
// Appends operators to the page's content stream that stroke the closed
// outline of `polygon` in blue (RGB 0,0,1) with a line width of 1.
//
// polygon - vertices of the outline, in drawing order; a null or empty array
//           draws nothing.
// page    - page whose content stream receives the drawing operators.
void DrawPolygonOnPage(System::ArrayPtr<System::SmartPtr<Point>> polygon, System::SmartPtr<Page> page)
{
    namespace Ops = Aspose::Pdf::Operators;

    // Nothing to outline; also avoids indexing element 0 of an empty array.
    if (polygon == nullptr || polygon->get_Length() == 0)
    {
        return;
    }

    page->get_Contents()->Add(MakeObject<Ops::GSave>());

    // Identity transform, matching DrawRectangleOnPage. The previous matrix
    // (1, 0, 1, 0, 0, 0) is singular: it maps (x, y) to (x + y, 0), which
    // flattens every polygon onto the x-axis.
    page->get_Contents()->Add(MakeObject<Ops::ConcatenateMatrix>(1, 0, 0, 1, 0, 0));
    page->get_Contents()->Add(MakeObject<Ops::SetRGBColorStroke>(0, 0, 1));
    page->get_Contents()->Add(MakeObject<Ops::SetLineWidth>(1));

    // Start the path at the first vertex, then connect the remaining ones.
    page->get_Contents()->Add(
        MakeObject<Ops::MoveTo>(polygon->idx_get(0)->get_X(), polygon->idx_get(0)->get_Y()));
    for (int i = 1; i < polygon->get_Length(); i++)
    {
        page->get_Contents()->Add(
            MakeObject<Ops::LineTo>(polygon->idx_get(i)->get_X(), polygon->idx_get(i)->get_Y()));
    }

    // Explicit segment back to the first vertex before closing and stroking.
    page->get_Contents()->Add(
        MakeObject<Ops::LineTo>(polygon->idx_get(0)->get_X(), polygon->idx_get(0)->get_Y()));
    page->get_Contents()->Add(MakeObject<Ops::ClosePathStroke>());
    page->get_Contents()->Add(MakeObject<Ops::GRestore>());
}