Extract Text from PDF using Python
Contents
[
Hide
]
Extract Text from PDF Document
This example converts PDF content into plain text, which can be used for further text analysis, search indexing, or data extraction.
- Load the PDF Document
- Initialize a Text Absorber
- Extract Text from All Pages
- Write the Extracted Text to a File
import aspose.pdf as apdf
from io import FileIO
from os import path
import json
from aspose.pycore import cast, is_assignable
# Build the input and output paths relative to self.dataDir.
path_infile = path.join(self.dataDir, infile)
path_outfile = path.join(self.dataDir, outfile)

# Load the PDF document.
document = apdf.Document(path_infile)

# The absorber accumulates text from every page it is applied to;
# accepting it on the page collection processes all pages.
text_absorber = apdf.text.TextAbsorber()
document.pages.accept(text_absorber)

# Write the extracted text as UTF-8 so non-ASCII characters are preserved.
with open(path_outfile, "w", encoding="utf-8") as output_file:
    output_file.write(text_absorber.text)
Extract Highlighted Text from PDF Document
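This code snippet extracts the text marked by highlight annotations on the first page of a PDF document (the example reads `document.pages[1]`; extend the loop over `document.pages` to cover every page), which can help review key points or summarize content: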
import aspose.pdf as apdf
from io import FileIO
from os import path
import json
from aspose.pycore import cast, is_assignable
# Build the input path relative to self.dataDir and load the PDF document.
path_infile = path.join(self.dataDir, infile)
document = apdf.Document(path_infile)

# Aspose.PDF page collections are 1-based, so index 1 is the first page.
page = document.pages[1]

for annotation in page.annotations:
    # The collection holds annotations of every kind; skip anything that is
    # not a highlight.
    if not is_assignable(annotation, apdf.annotations.HighlightAnnotation):
        continue

    # Downcast to the concrete type to reach the highlight-specific API.
    highlight_annotation = cast(apdf.annotations.HighlightAnnotation, annotation)
    print(highlight_annotation.get_marked_text())