import os
import tempfile
from io import BytesIO

import gradio as gr
import pytesseract
from docx import Document
from fpdf import FPDF
from pdf2docx import Converter
from pdf2image import convert_from_path
from reportlab.lib.pagesizes import A4
from reportlab.lib.utils import simpleSplit
from reportlab.pdfgen import canvas

# Ensure Tesseract is installed and its path is set
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update this path to your Tesseract installation

# Resolution (dots per inch) used when rasterizing PDF pages for OCR.
OCR_DPI = 500

# Page-layout constants for the generated PDF (units: PDF points).
PDF_FONT_NAME = "Helvetica"
PDF_FONT_SIZE = 12
PDF_LEFT_MARGIN = 100
PDF_RIGHT_MARGIN = 50
PDF_TOP_Y = 800       # y-coordinate of the first text line on each page
PDF_BOTTOM_Y = 50     # start a new page once y drops below this value
PDF_LINE_HEIGHT = 15

title_and_description = """
# PDF to Word and Word to PDF Converter
This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
Note: Scanned PDFs (image-based PDFs) are supported using OCR.
"""


def _uploaded_path(file_obj):
    """Return the filesystem path of an uploaded file.

    Accepts either an object exposing the path as ``.name`` (which is what
    the original handlers relied on) or a plain path string.
    """
    return str(getattr(file_obj, "name", file_obj))


def _new_output_dir():
    """Create and return a fresh directory for a converted file.

    ``tempfile.mkdtemp`` is used instead of ``TemporaryDirectory`` on purpose:
    the returned path must still exist after the handler returns so that the
    file can be served for download. The directory is not removed here.
    """
    return tempfile.mkdtemp(prefix="converter_")


def pdf_to_word(pdf_file):
    """Convert a PDF file to a Word document using OCR.

    Every page is rasterized with ``pdf2image`` and passed through Tesseract;
    the recognized text of each page is added as one paragraph. This path is
    used for all PDFs, whether or not they already contain a text layer.

    Args:
        pdf_file: The uploaded PDF, either a path string or an object whose
            ``.name`` attribute is the path.

    Returns:
        Path of the generated ``.docx`` file.

    Raises:
        gr.Error: If no file was supplied, the PDF cannot be rasterized, or
            OCR / saving the document fails.
    """
    if pdf_file is None:
        raise gr.Error("Error: No PDF file was uploaded.")

    pdf_path = _uploaded_path(pdf_file)

    # Rasterizing needs Poppler; report that failure with a targeted hint.
    try:
        pages = convert_from_path(pdf_path, OCR_DPI)  # Convert PDF to images
    except Exception as e:
        raise gr.Error(
            f"Error: Unable to process PDF. Is Poppler installed and in PATH? ({e})"
        ) from e

    try:
        # Swap only the extension; str.replace would also rewrite '.pdf'
        # occurring elsewhere in the name and would miss '.PDF'.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        docx_filename = os.path.join(_new_output_dir(), stem + ".docx")

        doc = Document()
        for page in pages:
            # Extract text from the page image and add it to the document.
            doc.add_paragraph(pytesseract.image_to_string(page))

        doc.save(docx_filename)
    except Exception as e:
        raise gr.Error(f"Error: {e}") from e

    return docx_filename


def word_to_pdf(docx_file):
    """Convert the paragraph text of a Word document to a PDF file.

    Only the text of ``doc.paragraphs`` is rendered; empty paragraphs are
    skipped and no other document content is read. Paragraphs wider than the
    printable area are wrapped onto several lines.

    Args:
        docx_file: The uploaded document, either a path string or an object
            whose ``.name`` attribute is the path.

    Returns:
        Path of the generated PDF file.

    Raises:
        gr.Error: If no file was supplied or the conversion fails.
    """
    if docx_file is None:
        raise gr.Error("Error: No Word file was uploaded.")

    try:
        pdf_filename = os.path.join(_new_output_dir(), "output.pdf")

        # Read the Word document
        doc = Document(_uploaded_path(docx_file))

        can = canvas.Canvas(pdf_filename, pagesize=A4)
        can.setFont(PDF_FONT_NAME, PDF_FONT_SIZE)
        max_width = A4[0] - PDF_LEFT_MARGIN - PDF_RIGHT_MARGIN

        y = PDF_TOP_Y  # Starting y-coordinate for text
        for para in doc.paragraphs:
            text = para.text.strip()
            if not text:
                continue

            # Wrap to the printable width so long paragraphs are not drawn
            # past the right edge of the page.
            for line in simpleSplit(text, PDF_FONT_NAME, PDF_FONT_SIZE, max_width):
                can.drawString(PDF_LEFT_MARGIN, y, line)
                y -= PDF_LINE_HEIGHT  # Move down for the next line

                # Handle page breaks
                if y < PDF_BOTTOM_Y:
                    can.showPage()
                    # Re-apply the font explicitly for the new page.
                    can.setFont(PDF_FONT_NAME, PDF_FONT_SIZE)
                    y = PDF_TOP_Y

        # Save the PDF
        can.save()
    except Exception as e:
        raise gr.Error(f"Error: {e}") from e

    return pdf_filename


with gr.Blocks() as app:
    gr.Markdown(title_and_description)

    with gr.Row():
        with gr.Column():
            with gr.Accordion("PDF to Word"):
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
                convert_pdf_to_word = gr.Button("Convert to Word")
                word_output = gr.File(label="Download Word file", type="filepath", file_types=[".docx"])
                convert_pdf_to_word.click(pdf_to_word, inputs=[pdf_input], outputs=[word_output])

        with gr.Column():
            with gr.Accordion("Word to PDF"):
                word_input = gr.File(label="Upload Word", file_types=[".docx"])
                convert_word_to_pdf = gr.Button("Convert to PDF")
                pdf_output = gr.File(label="Download PDF file", type="filepath", file_types=[".pdf"])
                convert_word_to_pdf.click(word_to_pdf, inputs=[word_input], outputs=[pdf_output])

app.launch()