Spaces:
Sleeping
Sleeping
Update extract_text_from_pdf.py
Browse files - extract_text_from_pdf.py +4 -3
extract_text_from_pdf.py
CHANGED
|
@@ -49,7 +49,7 @@ class PDFTextExtractor:
|
|
| 49 |
Be smart and aggressive with removing details; you're only cleaning up the text without summarizing.
|
| 50 |
Here is the text:
|
| 51 |
"""
|
| 52 |
-
|
| 53 |
def validate_pdf(self):
|
| 54 |
"""Check if the file exists and is a valid PDF."""
|
| 55 |
if not os.path.exists(self.pdf_path):
|
|
@@ -60,6 +60,7 @@ class PDFTextExtractor:
|
|
| 60 |
return False
|
| 61 |
return True
|
| 62 |
|
|
|
|
| 63 |
def extract_text(self):
|
| 64 |
"""Extract text from the PDF, limited by max_chars."""
|
| 65 |
if not self.validate_pdf():
|
|
@@ -90,7 +91,7 @@ class PDFTextExtractor:
|
|
| 90 |
final_text = '\n'.join(extracted_text)
|
| 91 |
print(f"Extraction complete! Total characters: {len(final_text)}")
|
| 92 |
return final_text
|
| 93 |
-
|
| 94 |
def create_word_bounded_chunks(self, text):
|
| 95 |
"""Split text into chunks around the target size."""
|
| 96 |
words = text.split()
|
|
@@ -129,7 +130,7 @@ class PDFTextExtractor:
|
|
| 129 |
|
| 130 |
processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
|
| 131 |
return processed_text
|
| 132 |
-
|
| 133 |
def clean_and_save_text(self):
|
| 134 |
"""Extract, clean, and save processed text to a file."""
|
| 135 |
extracted_text = self.extract_text()
|
|
|
|
| 49 |
Be smart and aggressive with removing details; you're only cleaning up the text without summarizing.
|
| 50 |
Here is the text:
|
| 51 |
"""
|
| 52 |
+
@spaces.GPU
|
| 53 |
def validate_pdf(self):
|
| 54 |
"""Check if the file exists and is a valid PDF."""
|
| 55 |
if not os.path.exists(self.pdf_path):
|
|
|
|
| 60 |
return False
|
| 61 |
return True
|
| 62 |
|
| 63 |
+
@spaces.GPU
|
| 64 |
def extract_text(self):
|
| 65 |
"""Extract text from the PDF, limited by max_chars."""
|
| 66 |
if not self.validate_pdf():
|
|
|
|
| 91 |
final_text = '\n'.join(extracted_text)
|
| 92 |
print(f"Extraction complete! Total characters: {len(final_text)}")
|
| 93 |
return final_text
|
| 94 |
+
@spaces.GPU
|
| 95 |
def create_word_bounded_chunks(self, text):
|
| 96 |
"""Split text into chunks around the target size."""
|
| 97 |
words = text.split()
|
|
|
|
| 130 |
|
| 131 |
processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
|
| 132 |
return processed_text
|
| 133 |
+
@spaces.GPU
|
| 134 |
def clean_and_save_text(self):
|
| 135 |
"""Extract, clean, and save processed text to a file."""
|
| 136 |
extracted_text = self.extract_text()
|