Spaces:

drewThomasson
/

PDF-to-TXT-OCR

Running

File size: 1,596 Bytes

fda9a03
 
 
 
 
51a779d
fda9a03
 
51a779d
fda9a03
 
 
 
 
 
 
 
225af8d
fda9a03
51a779d
fda9a03
788ec9b
fda9a03
 
51a779d
 
 
 
788ec9b
 
fda9a03
 
 
 
 
788ec9b
 
 
 
 
 
fda9a03

import gradio as gr
import pytesseract
from pdf2image import convert_from_path
import tempfile
import os
import shutil

def ocr_pdf(file_path):
    """Run OCR over every page of a PDF and return the text plus a download file.

    Args:
        file_path: Path of the PDF file to process.

    Returns:
        A tuple ``(extracted_text, output_path)``. ``extracted_text`` is the
        concatenated OCR output of all pages, where each page's text is
        preceded by one newline and followed by two. ``output_path`` is the
        path of a UTF-8 ``.txt`` file holding the same text.
    """
    # Scratch directory for the rendered page images; removed when the block exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        images = convert_from_path(file_path, output_folder=temp_dir)

        # OCR stays inside the `with` block: convert_from_path was asked to put
        # its page files in temp_dir, so they must still exist while we read them.
        extracted_text = "".join(
            f"\n{pytesseract.image_to_string(image)}\n\n" for image in images
        )

    # Write the result to a uniquely named file outside the scratch directory.
    # A fixed path (e.g. /tmp/extracted_text.txt) would let overlapping requests
    # overwrite each other's download; delete=False keeps the file available
    # after this function returns so it can be served.
    with tempfile.NamedTemporaryFile(
        "w",
        encoding="utf-8",
        prefix="extracted_text_",
        suffix=".txt",
        delete=False,
    ) as out_file:
        out_file.write(extracted_text)

    # Return both: the text itself and the path of the file (for download).
    return extracted_text, out_file.name

# Gradio Interface
def _handle_upload(file):
    """Validate the uploaded file and pass its path to ``ocr_pdf``.

    Args:
        file: Value delivered by the ``gr.File`` input. May be ``None`` when
            nothing was uploaded; otherwise either an object exposing the file
            path as ``.name`` or the path itself.

    Returns:
        Whatever ``ocr_pdf`` returns for the uploaded file's path.

    Raises:
        gr.Error: If no file was provided.
    """
    if file is None:
        # Show a readable message instead of an AttributeError on `None.name`.
        raise gr.Error("Please upload a PDF file before submitting.")
    # Pass a file path rather than a file object; fall back to the value itself
    # when it has no `.name` attribute (i.e. it is already a path).
    return ocr_pdf(getattr(file, "name", file))


iface = gr.Interface(
    fn=_handle_upload,
    inputs=gr.File(label="Upload PDF File"),
    outputs=[
        gr.Textbox(label="Extracted Text"),               # Shows text directly
        gr.File(label="Download Extracted Text (.txt)"),  # Optional download
    ],
    title="PDF to Text OCR",
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()