PDF-to-TXT-OCR / app.py
Habibahmadgillani's picture
Update app.py
788ec9b verified
raw
history blame
1.6 kB
import gradio as gr
import pytesseract
from pdf2image import convert_from_path
import tempfile
import os
import shutil
def ocr_pdf(file_path):
    """Run OCR over every page of a PDF and return the recognised text.

    The PDF is rasterised to page images with ``convert_from_path`` and each
    page image is passed to ``pytesseract.image_to_string``. The combined text
    is also written to a UTF-8 ``extracted_text.txt`` file so that it can be
    offered as a download.

    Args:
        file_path: Path of the PDF file to process.

    Returns:
        A ``(extracted_text, output_path)`` tuple: the text of all pages, and
        the path of the ``.txt`` file that contains that same text.
    """
    # Page images are rendered into a scratch directory that is removed as
    # soon as OCR has finished; the OCR must therefore run inside this block.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Convert PDF to images
        images = convert_from_path(file_path, output_folder=temp_dir)
        # Extract text from each page image, keeping the per-page framing:
        # a leading newline, the page text, then a blank line.
        extracted_text = "".join(
            f"\n{pytesseract.image_to_string(image)}\n\n" for image in images
        )

    # Write the download file into a directory created for this call only.
    # A single fixed path (previously "/tmp/extracted_text.txt") is shared by
    # every request, so overlapping calls could overwrite each other's output.
    # NOTE: this directory is intentionally not deleted here because the
    # returned path must outlive the call; cleanup is left to the caller/OS.
    output_dir = tempfile.mkdtemp(prefix="ocr_output_")
    final_output_path = os.path.join(output_dir, "extracted_text.txt")
    with open(final_output_path, "w", encoding="utf-8") as f:
        f.write(extracted_text)

    # Return both: actual text and path (for download)
    return extracted_text, final_output_path
def _run_ocr(file):
    """Adapt the value of the Gradio file input to a call to ``ocr_pdf``.

    Args:
        file: Value supplied by the ``gr.File`` input. ``None`` when the form
            is submitted without an upload.

    Returns:
        The ``(extracted_text, output_path)`` tuple produced by ``ocr_pdf``.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file is None:
        # Without this guard the attribute access below fails with an
        # AttributeError and the user only sees a generic error.
        raise gr.Error("Please upload a PDF file first.")
    # The upload is expected to expose its path as ``.name`` (what the
    # original lambda relied on); fall back to the value itself in case it is
    # already a plain path string.
    file_path = getattr(file, "name", file)
    return ocr_pdf(file_path)


# Gradio Interface
iface = gr.Interface(
    fn=_run_ocr,  # Pass file path instead of file object
    inputs=gr.File(label="Upload PDF File"),
    outputs=[
        gr.Textbox(label="Extracted Text"),  # Shows text directly
        gr.File(label="Download Extracted Text (.txt)"),  # Optional download
    ],
    title="PDF to Text OCR",
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()