Spaces:
Running
Running
import gradio as gr | |
import pytesseract | |
from pdf2image import convert_from_path | |
import tempfile | |
import os | |
import shutil | |
def ocr_pdf(file_path):
    """Run OCR over every page of a PDF and return the recognised text.

    Args:
        file_path: Path of the PDF file to process.

    Returns:
        A ``(text, txt_path)`` tuple: the text extracted from all pages, and
        the path of a UTF-8 ``extracted_text.txt`` file containing that same
        text (used for the download output).
    """
    # Scratch directory for the rendered page images. OCR must run while it
    # still exists, because the images are backed by files written there.
    with tempfile.TemporaryDirectory() as temp_dir:
        images = convert_from_path(file_path, output_folder=temp_dir)
        # Each page's text keeps the original "\n<text>\n\n" framing; join
        # builds the result in one pass instead of repeated string +=.
        extracted_text = "".join(
            f"\n{pytesseract.image_to_string(image)}\n\n" for image in images
        )

    # A fresh directory per call: a single fixed path would let concurrent
    # requests overwrite each other's result. The directory is deliberately
    # not removed here because the caller still has to serve the file.
    output_dir = tempfile.mkdtemp(prefix="ocr_pdf_")
    final_output_path = os.path.join(output_dir, "extracted_text.txt")
    with open(final_output_path, "w", encoding="utf-8") as f:
        f.write(extracted_text)

    # Return both: actual text and path (for download)
    return extracted_text, final_output_path
# Gradio Interface
def _ocr_uploaded_file(file):
    """Adapt the value of the upload component to ``ocr_pdf``.

    Args:
        file: Value supplied by the ``gr.File`` input; ``None`` when the form
            is submitted without an upload.

    Returns:
        The ``(text, txt_path)`` pair from ``ocr_pdf``, or ``("", None)`` when
        nothing was uploaded.
    """
    if file is None:
        # Nothing to process: leave both outputs empty instead of failing
        # with an AttributeError on ``None.name``.
        return "", None
    # Use the object's ``.name`` path when it has one; otherwise treat the
    # value itself as the path.
    return ocr_pdf(getattr(file, "name", file))


iface = gr.Interface(
    fn=_ocr_uploaded_file,
    inputs=gr.File(label="Upload PDF File"),
    outputs=[
        gr.Textbox(label="Extracted Text"),  # Shows text directly
        gr.File(label="Download Extracted Text (.txt)"),  # Optional download
    ],
    title="PDF to Text OCR",
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()