Spaces:
Sleeping
Sleeping
File size: 3,842 Bytes
81dba5a a1eb9dc 81dba5a a9bbdc9 5556171 61cdb02 a1eb9dc a9bbdc9 5556171 a1eb9dc 81dba5a a1eb9dc a9bbdc9 5556171 a9bbdc9 5556171 b645cb6 5556171 a9bbdc9 5556171 61cdb02 5556171 61cdb02 a9bbdc9 81dba5a a1eb9dc a9bbdc9 5556171 a9bbdc9 61cdb02 a9bbdc9 61cdb02 81dba5a a9bbdc9 8be381c 61cdb02 81dba5a a9bbdc9 81dba5a b7e2217 a1eb9dc a9bbdc9 a1eb9dc a9bbdc9 a1eb9dc a9bbdc9 a1eb9dc a9bbdc9 a1eb9dc 81dba5a 5556171 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import gradio as gr
from pdf2docx import Converter
from docx import Document
from fpdf import FPDF
import os
import tempfile
from pdfminer.high_level import extract_text
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from io import BytesIO
# Markdown header for the page; passed to gr.Markdown when the UI is built.
title_and_description = """
# PDF to Word and Word to PDF Converter
This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
Note: Scanned PDFs (image-based PDFs) are not supported.
"""
def pdf_to_word(pdf_file):
    """Convert a text-based PDF into a Word document.

    The PDF's text is extracted with pdfminer and written to a new ``.docx``
    file, one paragraph per extracted line. Only text is carried over.
    PDFs that yield no text (e.g. scanned, image-only PDFs) are rejected.

    Args:
        pdf_file: The uploaded PDF, given either as a filesystem path or as
            an object whose ``name`` attribute holds the path.

    Returns:
        The path of the generated ``.docx`` file on success; otherwise a
        message string starting with ``"Error:"``.
    """
    if pdf_file is None:
        return "Error: No PDF file was uploaded."

    try:
        # Accept both a plain path and an upload object exposing ``.name``.
        source_path = getattr(pdf_file, "name", pdf_file)

        text = extract_text(source_path)
        if not text.strip():
            return "Error: The PDF appears to be image-based (scanned). Scanned PDFs are not supported."

        # mkdtemp() rather than TemporaryDirectory(): the latter deletes the
        # directory (and the file inside it) as soon as its ``with`` block
        # exits, so the returned path would point at nothing.
        output_dir = tempfile.mkdtemp()

        # splitext swaps only the real extension and is case-insensitive to
        # it, unlike str.replace('.pdf', ...), which misses "X.PDF" and
        # mangles names such as "my.pdf.notes.pdf".
        stem, _ = os.path.splitext(os.path.basename(source_path))
        docx_filename = os.path.join(output_dir, stem + ".docx")

        doc = Document()
        for line in text.splitlines():
            doc.add_paragraph(line)
        doc.save(docx_filename)

        return docx_filename
    except Exception as e:
        # NOTE(review): the message is returned as a plain string to keep the
        # original contract; the caller must not treat it as a file path.
        return f"Error: {e}"
def word_to_pdf(docx_file):
    """Convert a Word document into a plain-text PDF.

    Every non-empty paragraph of the document is drawn in 12pt Helvetica on
    A4 pages. Paragraphs wider than the printable area are wrapped onto
    several lines, and a new page is started when the current one is full.
    Only paragraph text is carried over.

    Args:
        docx_file: The uploaded ``.docx``, given either as a filesystem path
            or as an object whose ``name`` attribute holds the path.

    Returns:
        The path of the generated PDF on success; otherwise a message string
        starting with ``"Error:"``.
    """
    if docx_file is None:
        return "Error: No Word file was uploaded."

    try:
        # Imported here so this function is self-contained with respect to
        # the file's top-level import list.
        from reportlab.lib.utils import simpleSplit

        # Accept both a plain path and an upload object exposing ``.name``.
        source_path = getattr(docx_file, "name", docx_file)
        doc = Document(source_path)

        # mkdtemp() rather than TemporaryDirectory(): the latter deletes the
        # directory (and the PDF inside it) as soon as its ``with`` block
        # exits, so the returned path would point at nothing.
        output_dir = tempfile.mkdtemp()
        pdf_filename = os.path.join(output_dir, "output.pdf")

        font_name, font_size = "Helvetica", 12
        left_margin, right_margin = 100, 50
        top_y, bottom_y, line_height = 800, 50, 15
        page_width = A4[0]
        max_text_width = page_width - left_margin - right_margin

        can = canvas.Canvas(pdf_filename, pagesize=A4)
        can.setFont(font_name, font_size)

        y = top_y
        for para in doc.paragraphs:
            text = para.text.strip()
            if not text:
                continue
            # Wrap to the printable width; drawString alone would let long
            # paragraphs run off the right edge of the page.
            for line in simpleSplit(text, font_name, font_size, max_text_width):
                can.drawString(left_margin, y, line)
                y -= line_height
                if y < bottom_y:
                    can.showPage()
                    # Re-apply the font explicitly for the new page.
                    can.setFont(font_name, font_size)
                    y = top_y

        can.save()
        return pdf_filename
    except Exception as e:
        # NOTE(review): the message is returned as a plain string to keep the
        # original contract; the caller must not treat it as a file path.
        return f"Error: {e}"
# Build the page: the Markdown header followed by one row with two columns,
# one per conversion direction. Each column holds an accordion with an upload
# field, a convert button and a download field.
# NOTE(review): the nesting below is reconstructed from the order of the
# components; confirm it against the intended layout.
with gr.Blocks() as app:
    gr.Markdown(title_and_description)
    with gr.Row():
        with gr.Column():
            with gr.Accordion("PDF to Word"):
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
                convert_pdf_to_word = gr.Button("Convert to Word")
                word_output = gr.File(label="Download Word file", type="filepath", file_types=[".docx"])
                # Run the conversion on click and hand the result to the download field.
                convert_pdf_to_word.click(pdf_to_word, inputs=[pdf_input], outputs=[word_output])
        with gr.Column():
            with gr.Accordion("Word to PDF"):
                word_input = gr.File(label="Upload Word", file_types=[".docx"])
                convert_word_to_pdf = gr.Button("Convert to PDF")
                pdf_output = gr.File(label="Download PDF file", type="filepath", file_types=[".pdf"])
                # Run the conversion on click and hand the result to the download field.
                convert_word_to_pdf.click(word_to_pdf, inputs=[word_input], outputs=[pdf_output])

# Start the app; the stray trailing "|" that followed this call was removed
# because a dangling binary operator is a syntax error.
app.launch(share=True)