Spaces:

drewThomasson
/

PDF-to-TXT-OCR

Running

PDF-to-TXT-OCR / app.py

Update app.py

788ec9b verified 25 days ago

1.6 kB

	import gradio as gr
	import pytesseract
	from pdf2image import convert_from_path
	import tempfile
	import os
	import shutil

	def ocr_pdf(file_path):
	    """Run OCR over every page of a PDF and save the result as a text file.

	    The PDF is rendered to page images with ``convert_from_path`` and each
	    image is passed to ``pytesseract.image_to_string``. The text of each
	    page is framed as ``"\\n<page text>\\n\\n"`` and the pages are
	    concatenated in order.

	    Args:
	        file_path: Path of the PDF file, passed to ``convert_from_path``.

	    Returns:
	        A ``(extracted_text, output_path)`` tuple: the concatenated OCR
	        text, and the path of a UTF-8 file named ``extracted_text.txt``
	        containing the same text. The file is placed in a directory
	        created for this call, so simultaneous calls never overwrite each
	        other's output. That directory is not removed by this function.
	    """
	    # Page images are rendered into a scratch directory that disappears as
	    # soon as OCR is finished; only the recognised text is kept.
	    with tempfile.TemporaryDirectory() as temp_dir:
	        images = convert_from_path(file_path, output_folder=temp_dir)

	        # OCR must happen while the scratch directory still exists, because
	        # the page images are backed by files inside it.
	        page_chunks = [
	            f"\n{pytesseract.image_to_string(image)}\n\n" for image in images
	        ]

	    extracted_text = "".join(page_chunks)

	    # A fresh directory per call (instead of one fixed /tmp path) keeps
	    # concurrent requests from clobbering each other's download, while the
	    # file itself keeps the friendly "extracted_text.txt" name.
	    output_dir = tempfile.mkdtemp(prefix="pdf_ocr_")
	    final_output_path = os.path.join(output_dir, "extracted_text.txt")
	    with open(final_output_path, "w", encoding="utf-8") as f:
	        f.write(extracted_text)

	    # Return both: actual text and path (for download)
	    return extracted_text, final_output_path

	# Gradio Interface
	def _ocr_uploaded_file(file):
	    """Adapt the value of the upload component to ``ocr_pdf``.

	    Args:
	        file: Value supplied by the ``gr.File`` input. ``None`` when the
	            form is submitted without an upload.

	    Returns:
	        The ``(text, path)`` tuple produced by ``ocr_pdf``.

	    Raises:
	        gr.Error: If no file was uploaded, so the user sees a readable
	            message instead of an AttributeError on ``None``.
	    """
	    if file is None:
	        raise gr.Error("Please upload a PDF file before submitting.")
	    # Pass the file path instead of the file object. Objects exposing
	    # ``.name`` are unwrapped; anything else is assumed to already be a
	    # path (NOTE(review): depends on the installed Gradio version - confirm).
	    return ocr_pdf(getattr(file, "name", file))


	iface = gr.Interface(
	    fn=_ocr_uploaded_file,
	    inputs=gr.File(label="Upload PDF File"),
	    outputs=[
	        gr.Textbox(label="Extracted Text"),  # Shows text directly
	        gr.File(label="Download Extracted Text (.txt)"),  # Optional download
	    ],
	    title="PDF to Text OCR",
	    allow_flagging="never",
	)

	# Start the web UI only when this file is run as a script.
	if __name__ == "__main__":
	    iface.launch()