Spaces:

drewThomasson
/

PDF-to-TXT-OCR

Running

drewThomasson committed on Oct 9, 2024

Commit

51a779d

verified ·

1 Parent(s): fa22678

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,8 +3,10 @@ import pytesseract
 from pdf2image import convert_from_path
 import tempfile
 import os
 def ocr_pdf(file_path):
     with tempfile.TemporaryDirectory() as temp_dir:
         # Convert PDF to images
         images = convert_from_path(file_path, output_folder=temp_dir)
@@ -15,12 +17,16 @@ def ocr_pdf(file_path):
             text = pytesseract.image_to_string(image)
             extracted_text += f"--- Page {i+1} ---\n{text}\n\n"
-        # Save the extracted text to a .txt file
         output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
         with open(output_txt_path, "w") as f:
             f.write(extracted_text)
-        return output_txt_path
 # Gradio Interface
 iface = gr.Interface(

 from pdf2image import convert_from_path
 import tempfile
 import os
+import shutil
 def ocr_pdf(file_path):
+    # Temporary directory for processing
     with tempfile.TemporaryDirectory() as temp_dir:
         # Convert PDF to images
         images = convert_from_path(file_path, output_folder=temp_dir)
             text = pytesseract.image_to_string(image)
             extracted_text += f"--- Page {i+1} ---\n{text}\n\n"
+        # Save the extracted text to a .txt file in a persistent location
         output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
         with open(output_txt_path, "w") as f:
             f.write(extracted_text)
+        # Create a persistent file to serve for download
+        final_output_path = "/tmp/extracted_text.txt"
+        shutil.copy(output_txt_path, final_output_path)  # Copy to a persistent location
+        return final_output_path
 # Gradio Interface
 iface = gr.Interface(