Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,33 +4,45 @@ from docx import Document
|
|
4 |
from fpdf import FPDF
|
5 |
import os
|
6 |
import tempfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
title_and_description = """
|
9 |
# PDF to Word and Word to PDF Converter
|
10 |
|
11 |
This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
|
12 |
-
Note: Scanned PDFs (image-based PDFs) are not supported.
|
13 |
"""
|
14 |
|
15 |
def pdf_to_word(pdf_file):
|
16 |
"""
|
17 |
Converts a PDF file to a Word document.
|
|
|
18 |
"""
|
19 |
try:
|
20 |
-
#
|
21 |
-
with open(pdf_file.name, 'rb') as f:
|
22 |
-
first_page = f.read(1024) # Read the first 1024 bytes of the PDF
|
23 |
-
if b"/Image" in first_page or b"/XObject" in first_page:
|
24 |
-
return "Error: Scanned PDFs (image-based PDFs) are not supported."
|
25 |
-
|
26 |
-
# Create a temporary directory to store the output file
|
27 |
with tempfile.TemporaryDirectory() as temp_dir:
|
28 |
docx_filename = os.path.join(temp_dir, os.path.basename(pdf_file.name).replace('.pdf', '.docx'))
|
29 |
|
30 |
-
#
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
# Return the path to the converted file
|
36 |
return docx_filename
|
@@ -40,38 +52,42 @@ def pdf_to_word(pdf_file):
|
|
40 |
def word_to_pdf(docx_file):
|
41 |
"""
|
42 |
Converts a Word document to a PDF file.
|
|
|
43 |
"""
|
44 |
try:
|
45 |
# Create a temporary directory to store the output file
|
46 |
with tempfile.TemporaryDirectory() as temp_dir:
|
47 |
pdf_filename = os.path.join(temp_dir, "output.pdf")
|
48 |
|
49 |
-
#
|
|
|
|
|
|
|
|
|
|
|
50 |
doc = Document(docx_file.name)
|
51 |
-
|
52 |
-
pdf.set_auto_page_break(auto=True, margin=15)
|
53 |
-
pdf.add_page()
|
54 |
-
pdf.add_font('Arial', '', 'Arial.ttf', uni=True)
|
55 |
-
pdf.set_font('Arial', size=12)
|
56 |
|
57 |
for para in doc.paragraphs:
|
58 |
text = para.text.strip()
|
59 |
if not text:
|
60 |
continue
|
61 |
|
62 |
-
|
63 |
-
|
64 |
-
for
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
|
|
|
|
|
|
|
|
72 |
|
73 |
-
pdf.output(pdf_filename)
|
74 |
-
|
75 |
# Return the path to the converted file
|
76 |
return pdf_filename
|
77 |
except Exception as e:
|
|
|
4 |
from fpdf import FPDF
|
5 |
import os
|
6 |
import tempfile
|
7 |
+
import pytesseract
|
8 |
+
from pdf2image import convert_from_path
|
9 |
+
from reportlab.lib.pagesizes import A4
|
10 |
+
from reportlab.pdfgen import canvas
|
11 |
+
from io import BytesIO
|
12 |
+
|
13 |
+
# Ensure Tesseract is installed and its path is set
|
14 |
+
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Update this path to your Tesseract installation
|
15 |
|
16 |
title_and_description = """
|
17 |
# PDF to Word and Word to PDF Converter
|
18 |
|
19 |
This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
|
20 |
+
Note: Scanned PDFs (image-based PDFs) are supported using OCR.
|
21 |
"""
|
22 |
|
23 |
def pdf_to_word(pdf_file):
|
24 |
"""
|
25 |
Converts a PDF file to a Word document.
|
26 |
+
Handles both text-based and image-based (scanned) PDFs.
|
27 |
"""
|
28 |
try:
|
29 |
+
# Create a temporary directory to store intermediate files
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
with tempfile.TemporaryDirectory() as temp_dir:
|
31 |
docx_filename = os.path.join(temp_dir, os.path.basename(pdf_file.name).replace('.pdf', '.docx'))
|
32 |
|
33 |
+
# Rasterize every page to an image; text-based and scanned PDFs alike go through OCR
|
34 |
+
pages = convert_from_path(pdf_file.name, 500) # Convert PDF to images
|
35 |
+
if len(pages) == 0:
|
36 |
+
return "Error: Unable to process the PDF file."
|
37 |
+
|
38 |
+
# Run OCR on every page image and collect the extracted text into a new Word document
|
39 |
+
doc = Document()
|
40 |
+
for page in pages:
|
41 |
+
text = pytesseract.image_to_string(page) # Extract text from the image
|
42 |
+
doc.add_paragraph(text) # Add the extracted text to the Word document
|
43 |
+
|
44 |
+
# Save the Word document
|
45 |
+
doc.save(docx_filename)
|
46 |
|
47 |
# Return the path to the converted file
|
48 |
return docx_filename
|
|
|
52 |
def word_to_pdf(docx_file):
|
53 |
"""
|
54 |
Converts a Word document to a PDF file.
|
55 |
+
Renders paragraph text only, using reportlab; images in the document are not carried over.
|
56 |
"""
|
57 |
try:
|
58 |
# Create a temporary directory to store the output file
|
59 |
with tempfile.TemporaryDirectory() as temp_dir:
|
60 |
pdf_filename = os.path.join(temp_dir, "output.pdf")
|
61 |
|
62 |
+
# Create a PDF using reportlab
|
63 |
+
packet = BytesIO()
|
64 |
+
can = canvas.Canvas(packet, pagesize=A4)
|
65 |
+
can.setFont("Helvetica", 12)
|
66 |
+
|
67 |
+
# Read the Word document
|
68 |
doc = Document(docx_file.name)
|
69 |
+
y = 800 # Starting y-coordinate for text
|
|
|
|
|
|
|
|
|
70 |
|
71 |
for para in doc.paragraphs:
|
72 |
text = para.text.strip()
|
73 |
if not text:
|
74 |
continue
|
75 |
|
76 |
+
# Add text to the PDF
|
77 |
+
can.drawString(100, y, text)
|
78 |
+
y -= 15 # Move down for the next line
|
79 |
+
|
80 |
+
# Handle page breaks
|
81 |
+
if y < 50:
|
82 |
+
can.showPage()
|
83 |
+
y = 800
|
84 |
+
|
85 |
+
# Save the PDF
|
86 |
+
can.save()
|
87 |
+
packet.seek(0)
|
88 |
+
with open(pdf_filename, "wb") as f:
|
89 |
+
f.write(packet.read())
|
90 |
|
|
|
|
|
91 |
# Return the path to the converted file
|
92 |
return pdf_filename
|
93 |
except Exception as e:
|