Spaces:

Nymbo
/

data-boards

Sleeping

File size: 3,842 Bytes

81dba5a
a1eb9dc
 
 
81dba5a
a9bbdc9
5556171
61cdb02
 
 
 
a1eb9dc
a9bbdc9
 
 
5556171
a1eb9dc
81dba5a
a1eb9dc
a9bbdc9
5556171
 
a9bbdc9
 
5556171
 
 
 
b645cb6
5556171
a9bbdc9
 
 
5556171
61cdb02
5556171
 
61cdb02
a9bbdc9
 
 
 
 
81dba5a
a1eb9dc
a9bbdc9
 
5556171
a9bbdc9
 
 
 
 
 
61cdb02
 
 
 
 
 
a9bbdc9
61cdb02
81dba5a
a9bbdc9
 
 
 
8be381c
61cdb02
 
 
 
 
 
 
 
 
 
 
 
 
 
81dba5a
a9bbdc9
 
 
 
81dba5a
b7e2217
a1eb9dc
 
 
 
 
a9bbdc9
a1eb9dc
a9bbdc9
a1eb9dc
 
 
 
 
a9bbdc9
a1eb9dc
a9bbdc9
a1eb9dc
 
81dba5a
5556171

import gradio as gr
from pdf2docx import Converter
from docx import Document
from fpdf import FPDF
import os
import tempfile
from pdfminer.high_level import extract_text
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from io import BytesIO

# Markdown rendered at the top of the page by gr.Markdown below: the app
# heading plus a short usage note, including the scanned-PDF limitation.
title_and_description = """
# PDF to Word and Word to PDF Converter

This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
Note: Scanned PDFs (image-based PDFs) are not supported.
"""

def pdf_to_word(pdf_file):
    """
    Converts a text-based PDF file to a Word document.
    Scanned PDFs (image-based PDFs) are not supported.

    Only the extracted text is carried over: each line of text becomes one
    paragraph in the Word document. Layout, fonts and images are not kept.

    Args:
        pdf_file: The uploaded PDF. Either an object exposing the file path
            as ``.name`` or a plain path string.

    Returns:
        The path of the generated ``.docx`` file on success, otherwise a
        string starting with ``"Error:"`` that describes the failure.
    """
    # Clicking "Convert" without uploading anything passes None.
    if pdf_file is None:
        return "Error: No PDF file was uploaded."

    try:
        # Accept both file-like wrappers (with .name) and plain path strings.
        pdf_path = getattr(pdf_file, "name", pdf_file)

        # Extract text from the PDF using pdfminer
        text = extract_text(pdf_path)
        if not text.strip():
            return "Error: The PDF appears to be image-based (scanned). Scanned PDFs are not supported."

        # Use mkdtemp() rather than TemporaryDirectory(): the latter deletes
        # the directory (and the file we are about to return) as soon as its
        # `with` block exits, so the caller would receive a dangling path.
        # The directory is deliberately left in place for the caller to read.
        output_dir = tempfile.mkdtemp()

        # Swap only the real extension (any case), not every ".pdf" substring.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        docx_filename = os.path.join(output_dir, stem + ".docx")

        # Create a Word document and add the extracted text
        doc = Document()
        for line in text.splitlines():
            doc.add_paragraph(line)
        doc.save(docx_filename)

        # Return the path to the converted file
        return docx_filename
    except Exception as e:
        # NOTE(review): the error string is returned, not raised, to keep the
        # original contract; confirm the UI output component displays it.
        return f"Error: {e}"

def word_to_pdf(docx_file):
    """
    Converts a Word document to a PDF file.

    Only paragraph text is rendered (Helvetica 12 on A4); empty paragraphs
    are skipped and Word formatting, tables and images are not reproduced.
    Paragraphs wider than the printable area are wrapped onto several lines.

    Args:
        docx_file: The uploaded Word document. Either an object exposing the
            file path as ``.name`` or a plain path string.

    Returns:
        The path of the generated PDF on success, otherwise a string
        starting with ``"Error:"`` that describes the failure.
    """
    # Clicking "Convert" without uploading anything passes None.
    if docx_file is None:
        return "Error: No Word file was uploaded."

    try:
        # Local import: only this function needs the line-wrapping helper.
        from reportlab.lib.utils import simpleSplit

        font_name, font_size = "Helvetica", 12
        left_margin, right_margin = 100, 50
        top_y, bottom_y, line_step = 800, 50, 15
        max_width = A4[0] - left_margin - right_margin

        # Read the Word document first so a bad upload fails before any
        # output directory is created.
        docx_path = getattr(docx_file, "name", docx_file)
        doc = Document(docx_path)

        # Use mkdtemp() rather than TemporaryDirectory(): the latter deletes
        # the directory (and the PDF we are about to return) as soon as its
        # `with` block exits, so the caller would receive a dangling path.
        output_dir = tempfile.mkdtemp()
        pdf_filename = os.path.join(output_dir, "output.pdf")

        # Create the PDF directly on disk using reportlab
        can = canvas.Canvas(pdf_filename, pagesize=A4)
        can.setFont(font_name, font_size)
        y = top_y  # Starting y-coordinate for text

        for para in doc.paragraphs:
            text = para.text.strip()
            if not text:
                continue

            # drawString() does not wrap, so split the paragraph into lines
            # that fit between the margins instead of running off the page.
            for line in simpleSplit(text, font_name, font_size, max_width):
                # Handle page breaks
                if y < bottom_y:
                    can.showPage()
                    # Re-apply the font explicitly on each new page rather
                    # than relying on the canvas defaults.
                    can.setFont(font_name, font_size)
                    y = top_y

                can.drawString(left_margin, y, line)
                y -= line_step  # Move down for the next line

        # Save the PDF
        can.save()

        # Return the path to the converted file
        return pdf_filename
    except Exception as e:
        # NOTE(review): the error string is returned, not raised, to keep the
        # original contract; confirm the UI output component displays it.
        return f"Error: {e}"

# Assemble the page: a heading followed by one row with two columns, one per
# conversion direction. Each column has an upload slot, a trigger button and
# a download slot wired to the matching converter function.
with gr.Blocks() as app:
    gr.Markdown(value=title_and_description)

    with gr.Row():
        # Left column: PDF -> Word
        with gr.Column(), gr.Accordion(label="PDF to Word"):
            pdf_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
            )
            convert_pdf_to_word = gr.Button(value="Convert to Word")
            word_output = gr.File(
                label="Download Word file",
                type="filepath",
                file_types=[".docx"],
            )

            convert_pdf_to_word.click(
                fn=pdf_to_word,
                inputs=[pdf_input],
                outputs=[word_output],
            )

        # Right column: Word -> PDF
        with gr.Column(), gr.Accordion(label="Word to PDF"):
            word_input = gr.File(
                label="Upload Word",
                file_types=[".docx"],
            )
            convert_word_to_pdf = gr.Button(value="Convert to PDF")
            pdf_output = gr.File(
                label="Download PDF file",
                type="filepath",
                file_types=[".pdf"],
            )

            convert_word_to_pdf.click(
                fn=word_to_pdf,
                inputs=[word_input],
                outputs=[pdf_output],
            )

# Start the web server and request a public share link.
app.launch(share=True)