drewThomasson committed on
Commit
51a779d
·
verified ·
1 Parent(s): fa22678

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -2
app.py CHANGED
@@ -3,8 +3,10 @@ import pytesseract
3
  from pdf2image import convert_from_path
4
  import tempfile
5
  import os
 
6
 
7
  def ocr_pdf(file_path):
 
8
  with tempfile.TemporaryDirectory() as temp_dir:
9
  # Convert PDF to images
10
  images = convert_from_path(file_path, output_folder=temp_dir)
@@ -15,12 +17,16 @@ def ocr_pdf(file_path):
15
  text = pytesseract.image_to_string(image)
16
  extracted_text += f"--- Page {i+1} ---\n{text}\n\n"
17
 
18
- # Save the extracted text to a .txt file
19
  output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
20
  with open(output_txt_path, "w") as f:
21
  f.write(extracted_text)
22
 
23
- return output_txt_path
 
 
 
 
24
 
25
  # Gradio Interface
26
  iface = gr.Interface(
 
3
  from pdf2image import convert_from_path
4
  import tempfile
5
  import os
6
+ import shutil
7
 
8
  def ocr_pdf(file_path):
9
+ # Temporary directory for processing
10
  with tempfile.TemporaryDirectory() as temp_dir:
11
  # Convert PDF to images
12
  images = convert_from_path(file_path, output_folder=temp_dir)
 
17
  text = pytesseract.image_to_string(image)
18
  extracted_text += f"--- Page {i+1} ---\n{text}\n\n"
19
 
20
+ # Save the extracted text to a .txt file in a persistent location
21
  output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
22
  with open(output_txt_path, "w") as f:
23
  f.write(extracted_text)
24
 
25
+ # Create a persistent file to serve for download
26
+ final_output_path = "/tmp/extracted_text.txt"
27
+ shutil.copy(output_txt_path, final_output_path) # Copy to a persistent location
28
+
29
+ return final_output_path
30
 
31
  # Gradio Interface
32
  iface = gr.Interface(