Spaces:

Nymbo
/

data-boards

Sleeping

App Files Files Community

data-boards / app.py

prithivMLmods

Update app.py

b645cb6 verified 8 months ago

raw

history blame

4.23 kB

	import gradio as gr
	from pdf2docx import Converter
	from docx import Document
	from fpdf import FPDF
	import os
	import tempfile
	import pytesseract
	from pdf2image import convert_from_path
	from reportlab.lib.pagesizes import A4
	from reportlab.pdfgen import canvas
	from io import BytesIO

	# Tell pytesseract where the Tesseract executable lives. Tesseract must be
	# installed separately; this path is hard-coded and has to match the host.
	pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Update this path to your Tesseract installation

	# Markdown text for the page header; it is passed to gr.Markdown when the
	# UI is built further down in this file.
	title_and_description = """
	# PDF to Word and Word to PDF Converter

	This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
	Note: Scanned PDFs (image-based PDFs) are supported using OCR.
	"""

	def _strip_xml_control_chars(text):
	    """Drop control characters that cannot be stored in a .docx (XML) text node."""
	    # Tab, newline and carriage return are legal in XML; other characters
	    # below U+0020 (e.g. the form feed OCR output may end with) are not.
	    return "".join(ch for ch in text if ch in "\t\n\r" or ch >= " ")


	def pdf_to_word(pdf_file):
	    """
	    Convert a PDF file to a Word document via OCR.

	    Every page is rendered to an image with pdf2image (which requires
	    Poppler) and the text is extracted with Tesseract, so text-based and
	    scanned PDFs go through the same path. Only the recognised text is
	    written out, one paragraph per page.

	    Args:
	        pdf_file: The uploaded file, either a path string or an object
	            whose ``name`` attribute is the path on disk.

	    Returns:
	        Path to the generated ``.docx`` file.

	    Raises:
	        gr.Error: If no file was supplied or the conversion fails. The
	            output component is a file, so an error message returned as a
	            string would be treated as a (missing) file path.
	    """
	    if pdf_file is None:
	        raise gr.Error("Error: No PDF file was uploaded.")

	    # Accept both a plain path and a file-like wrapper exposing ``.name``.
	    pdf_path = getattr(pdf_file, "name", pdf_file)

	    try:
	        pages = convert_from_path(pdf_path, 500)  # Render each page at 500 DPI
	    except Exception as e:
	        raise gr.Error(
	            f"Error: Unable to process PDF. Is Poppler installed and in PATH? ({e})"
	        ) from e

	    try:
	        doc = Document()
	        for page in pages:
	            ocr_text = pytesseract.image_to_string(page)
	            doc.add_paragraph(_strip_xml_control_chars(ocr_text))

	        # The directory must outlive this call: a TemporaryDirectory context
	        # would delete the file before the caller could read it.
	        out_dir = tempfile.mkdtemp()
	        # splitext swaps only the real extension, whatever its letter case.
	        stem = os.path.splitext(os.path.basename(pdf_path))[0]
	        docx_filename = os.path.join(out_dir, stem + ".docx")
	        doc.save(docx_filename)
	    except Exception as e:
	        raise gr.Error(f"Error: {e}") from e

	    return docx_filename

	def word_to_pdf(docx_file):
	    """
	    Convert a Word document to a plain-text PDF.

	    Only the text of the document's paragraphs is rendered (Helvetica 12 pt
	    on A4). Empty paragraphs are skipped, and images, tables and formatting
	    are not carried over.

	    Args:
	        docx_file: The uploaded file, either a path string or an object
	            whose ``name`` attribute is the path on disk.

	    Returns:
	        Path to the generated PDF file.

	    Raises:
	        gr.Error: If no file was supplied or the conversion fails. The
	            output component is a file, so an error message returned as a
	            string would be treated as a (missing) file path.
	    """
	    # Imported locally so the file's top-level import block stays untouched.
	    from reportlab.lib.utils import simpleSplit

	    if docx_file is None:
	        raise gr.Error("Error: No Word file was uploaded.")

	    # Accept both a plain path and a file-like wrapper exposing ``.name``.
	    docx_path = getattr(docx_file, "name", docx_file)

	    font_name, font_size = "Helvetica", 12
	    left_x, top_y, bottom_y, line_step = 100, 800, 50, 15
	    # Mirror the left margin on the right so long lines wrap on the page.
	    max_width = A4[0] - 2 * left_x

	    try:
	        doc = Document(docx_path)

	        # The directory must outlive this call: a TemporaryDirectory context
	        # would delete the file before the caller could read it.
	        out_dir = tempfile.mkdtemp()
	        pdf_filename = os.path.join(out_dir, "output.pdf")

	        can = canvas.Canvas(pdf_filename, pagesize=A4)
	        can.setFont(font_name, font_size)
	        y = top_y  # Starting y-coordinate for text

	        for para in doc.paragraphs:
	            text = para.text.strip()
	            if not text:
	                continue

	            # drawString never wraps, so split the paragraph into lines
	            # that fit between the margins first.
	            for line in simpleSplit(text, font_name, font_size, max_width):
	                can.drawString(left_x, y, line)
	                y -= line_step

	                if y < bottom_y:
	                    can.showPage()
	                    # A new page starts with a fresh graphics state.
	                    can.setFont(font_name, font_size)
	                    y = top_y

	        can.save()
	    except Exception as e:
	        raise gr.Error(f"Error: {e}") from e

	    return pdf_filename

	# Build the page: a header followed by two side-by-side panels, one per
	# conversion direction. Each panel has an upload box, a button and a
	# download box, and the button runs the matching converter.
	with gr.Blocks() as app:
	    gr.Markdown(title_and_description)

	    with gr.Row():
	        # Left panel: PDF -> Word
	        with gr.Column(), gr.Accordion("PDF to Word"):
	            pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
	            to_word_button = gr.Button("Convert to Word")
	            docx_download = gr.File(
	                label="Download Word file",
	                type="filepath",
	                file_types=[".docx"],
	            )
	            to_word_button.click(
	                fn=pdf_to_word,
	                inputs=[pdf_upload],
	                outputs=[docx_download],
	            )

	        # Right panel: Word -> PDF
	        with gr.Column(), gr.Accordion("Word to PDF"):
	            docx_upload = gr.File(label="Upload Word", file_types=[".docx"])
	            to_pdf_button = gr.Button("Convert to PDF")
	            pdf_download = gr.File(
	                label="Download PDF file",
	                type="filepath",
	                file_types=[".pdf"],
	            )
	            to_pdf_button.click(
	                fn=word_to_pdf,
	                inputs=[docx_upload],
	                outputs=[pdf_download],
	            )

	# Start the web server for the interface.
	app.launch()