"""Gradio app that OCRs an uploaded PDF and offers the text for download."""

import os
import shutil
import tempfile

import gradio as gr
import pytesseract
from pdf2image import convert_from_path

# File name given to the downloadable text file.
_OUTPUT_FILENAME = "extracted_text.txt"


def ocr_pdf(file_path):
    """Run OCR over every page of a PDF and save the result as a text file.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        A ``(extracted_text, output_path)`` tuple: the concatenated OCR text
        of all pages, and the path of a UTF-8 ``.txt`` file holding that text.
    """
    # Scratch directory handed to pdf2image as its output folder. OCR runs
    # inside the ``with`` block because the page images may still reference
    # files in this directory, which is deleted on exit.
    with tempfile.TemporaryDirectory() as temp_dir:
        images = convert_from_path(file_path, output_folder=temp_dir)
        # Each page's text is wrapped as "\n<text>\n\n"; join avoids
        # repeated string concatenation.
        extracted_text = "".join(
            f"\n{pytesseract.image_to_string(image)}\n\n" for image in images
        )

    # Write the download into a directory unique to this call. A single fixed
    # path would be overwritten by concurrent or later requests, and a
    # hard-coded "/tmp" does not exist on every platform.
    # NOTE: this directory outlives the call and is not cleaned up here.
    output_dir = tempfile.mkdtemp(prefix="ocr_pdf_")
    final_output_path = os.path.join(output_dir, _OUTPUT_FILENAME)
    with open(final_output_path, "w", encoding="utf-8") as f:
        f.write(extracted_text)

    # Return both: actual text and path (for download)
    return extracted_text, final_output_path


def _handle_upload(file):
    """Adapt the value of the Gradio file input to ``ocr_pdf``.

    Args:
        file: ``None`` when nothing was uploaded, a path string, or an object
            exposing the path as ``.name``.

    Returns:
        The ``(text, path)`` tuple from ``ocr_pdf``, or ``("", None)`` when no
        file was uploaded.
    """
    if file is None:
        return "", None
    # Pass file path instead of file object
    file_path = file if isinstance(file, (str, os.PathLike)) else file.name
    return ocr_pdf(file_path)


# Gradio Interface
iface = gr.Interface(
    fn=_handle_upload,
    inputs=gr.File(label="Upload PDF File"),
    outputs=[
        gr.Textbox(label="Extracted Text"),  # Shows text directly
        gr.File(label="Download Extracted Text (.txt)"),  # Optional download
    ],
    title="PDF to Text OCR",
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()