Spaces:

yasserrmd
/

NotebookLlama

Running

yasserrmd committed on Oct 30, 2024

Commit

ea4f634

verified ·

1 Parent(s): 65a0de7

Update extract_text_from_pdf.py

Files changed (1) hide show

extract_text_from_pdf.py CHANGED Viewed

@@ -49,7 +49,7 @@ class PDFTextExtractor:
         Be smart and aggressive with removing details; you're only cleaning up the text without summarizing.
         Here is the text:
         """
     def validate_pdf(self):
         """Check if the file exists and is a valid PDF."""
         if not os.path.exists(self.pdf_path):
@@ -60,6 +60,7 @@ class PDFTextExtractor:
             return False
         return True
     def extract_text(self):
         """Extract text from the PDF, limited by max_chars."""
         if not self.validate_pdf():
@@ -90,7 +91,7 @@ class PDFTextExtractor:
             final_text = '\n'.join(extracted_text)
             print(f"Extraction complete! Total characters: {len(final_text)}")
             return final_text
     def create_word_bounded_chunks(self, text):
         """Split text into chunks around the target size."""
         words = text.split()
@@ -129,7 +130,7 @@ class PDFTextExtractor:
         processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
         return processed_text
     def clean_and_save_text(self):
         """Extract, clean, and save processed text to a file."""
         extracted_text = self.extract_text()

         Be smart and aggressive with removing details; you're only cleaning up the text without summarizing.
         Here is the text:
         """
+    @spaces.GPU
     def validate_pdf(self):
         """Check if the file exists and is a valid PDF."""
         if not os.path.exists(self.pdf_path):
             return False
         return True
+    @spaces.GPU
     def extract_text(self):
         """Extract text from the PDF, limited by max_chars."""
         if not self.validate_pdf():
             final_text = '\n'.join(extracted_text)
             print(f"Extraction complete! Total characters: {len(final_text)}")
             return final_text
+    @spaces.GPU
     def create_word_bounded_chunks(self, text):
         """Split text into chunks around the target size."""
         words = text.split()
         processed_text = self.tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
         return processed_text
+    @spaces.GPU
     def clean_and_save_text(self):
         """Extract, clean, and save processed text to a file."""
         extracted_text = self.extract_text()