# Spaces: drewThomasson / PDF-to-TXT-OCR
# Status: Running
# File size: 1,120 Bytes
# Revision: fda9a03

import gradio as gr
import pytesseract
from pdf2image import convert_from_path
import tempfile
import os

def ocr_pdf(file_path):
    """Run OCR over every page of a PDF and save the text to a .txt file.

    Each page is rendered to an image with ``convert_from_path`` and passed to
    ``pytesseract.image_to_string``. The per-page results are concatenated,
    each preceded by a ``--- Page N ---`` header (N starts at 1).

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Path of the written ``extracted_text.txt`` file. The file lives in
        its own directory created with ``tempfile.mkdtemp()``; that directory
        is not removed by this function, so the path stays valid after the
        call returns. The caller is responsible for deleting it if needed.
    """
    # The rendered page images are only needed while OCR runs, so they go in
    # a scratch directory that is removed as soon as the block exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        images = convert_from_path(file_path, output_folder=temp_dir)
        page_chunks = [
            f"--- Page {page_number} ---\n"
            f"{pytesseract.image_to_string(image)}\n\n"
            for page_number, image in enumerate(images, start=1)
        ]

    # The result must outlive the scratch directory: writing it inside
    # ``temp_dir`` would hand the caller a path that has already been deleted.
    output_dir = tempfile.mkdtemp()
    output_txt_path = os.path.join(output_dir, "extracted_text.txt")
    # OCR output frequently contains non-ASCII characters; an explicit
    # encoding avoids depending on the platform's default locale.
    with open(output_txt_path, "w", encoding="utf-8") as f:
        f.write("".join(page_chunks))

    return output_txt_path

# Gradio Interface
def _uploaded_file_path(file):
    """Return the filesystem path for the value supplied by the File input.

    Args:
        file: Either a path string or an object exposing the path as
            ``.name``; which one is passed depends on the Gradio version and
            the File component's ``type`` setting. ``None`` means nothing
            was uploaded.

    Returns:
        The path of the uploaded file.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file is None:
        raise gr.Error("Please upload a PDF file.")
    # A plain string has no ``.name`` attribute and is already the path.
    return getattr(file, "name", file)


iface = gr.Interface(
    fn=lambda file: ocr_pdf(_uploaded_file_path(file)),  # Pass file path instead of file object
    inputs=gr.File(label="Upload PDF File"),
    outputs=gr.File(label="Download Extracted Text (.txt)"),  # Outputs a downloadable .txt file
    title="PDF to Text OCR"
)

# Only start the web server when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()