Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,45 +4,37 @@ from docx import Document
|
|
4 |
from fpdf import FPDF
|
5 |
import os
|
6 |
import tempfile
|
7 |
-
import
|
8 |
-
from pdf2image import convert_from_path
|
9 |
from reportlab.lib.pagesizes import A4
|
10 |
from reportlab.pdfgen import canvas
|
11 |
from io import BytesIO
|
12 |
|
13 |
-
# Ensure Tesseract is installed and its path is set
|
14 |
-
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Update this path to your Tesseract installation
|
15 |
-
|
16 |
title_and_description = """
|
17 |
# PDF to Word and Word to PDF Converter
|
18 |
|
19 |
This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
|
20 |
-
Note: Scanned PDFs (image-based PDFs) are supported
|
21 |
"""
|
22 |
|
23 |
def pdf_to_word(pdf_file):
|
24 |
"""
|
25 |
-
Converts a PDF file to a Word document.
|
26 |
-
|
27 |
"""
|
28 |
try:
|
29 |
-
#
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
return f"Error: Unable to process PDF. Is Poppler installed and in PATH? ({e})"
|
34 |
|
35 |
-
# Create a temporary directory to store
|
36 |
with tempfile.TemporaryDirectory() as temp_dir:
|
37 |
docx_filename = os.path.join(temp_dir, os.path.basename(pdf_file.name).replace('.pdf', '.docx'))
|
38 |
|
39 |
-
#
|
40 |
doc = Document()
|
41 |
-
for
|
42 |
-
|
43 |
-
doc.add_paragraph(text) # Add the extracted text to the Word document
|
44 |
-
|
45 |
-
# Save the Word document
|
46 |
doc.save(docx_filename)
|
47 |
|
48 |
# Return the path to the converted file
|
@@ -53,7 +45,7 @@ def pdf_to_word(pdf_file):
|
|
53 |
def word_to_pdf(docx_file):
|
54 |
"""
|
55 |
Converts a Word document to a PDF file.
|
56 |
-
Handles
|
57 |
"""
|
58 |
try:
|
59 |
# Create a temporary directory to store the output file
|
@@ -114,4 +106,4 @@ with gr.Blocks() as app:
|
|
114 |
|
115 |
convert_word_to_pdf.click(word_to_pdf, inputs=[word_input], outputs=[pdf_output])
|
116 |
|
117 |
-
app.launch()
|
|
|
4 |
from fpdf import FPDF
|
5 |
import os
|
6 |
import tempfile
|
7 |
+
from pdfminer.high_level import extract_text
|
|
|
8 |
from reportlab.lib.pagesizes import A4
|
9 |
from reportlab.pdfgen import canvas
|
10 |
from io import BytesIO
|
11 |
|
|
|
|
|
|
|
12 |
title_and_description = """
|
13 |
# PDF to Word and Word to PDF Converter
|
14 |
|
15 |
This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
|
16 |
+
Note: Scanned PDFs (image-based PDFs) are not supported.
|
17 |
"""
|
18 |
|
19 |
def pdf_to_word(pdf_file):
|
20 |
"""
|
21 |
+
Converts a text-based PDF file to a Word document.
|
22 |
+
Scanned PDFs (image-based PDFs) are not supported.
|
23 |
"""
|
24 |
try:
|
25 |
+
# Extract text from the PDF using pdfminer
|
26 |
+
text = extract_text(pdf_file.name)
|
27 |
+
if not text.strip():
|
28 |
+
return "Error: The PDF appears to be image-based (scanned). Scanned PDFs are not supported."
|
|
|
29 |
|
30 |
+
# Create a temporary directory to store the output file
|
31 |
with tempfile.TemporaryDirectory() as temp_dir:
|
32 |
docx_filename = os.path.join(temp_dir, os.path.basename(pdf_file.name).replace('.pdf', '.docx'))
|
33 |
|
34 |
+
# Create a Word document and add the extracted text
|
35 |
doc = Document()
|
36 |
+
for line in text.splitlines():
|
37 |
+
doc.add_paragraph(line)
|
|
|
|
|
|
|
38 |
doc.save(docx_filename)
|
39 |
|
40 |
# Return the path to the converted file
|
|
|
45 |
def word_to_pdf(docx_file):
|
46 |
"""
|
47 |
Converts a Word document to a PDF file.
|
48 |
+
Handles text and basic formatting.
|
49 |
"""
|
50 |
try:
|
51 |
# Create a temporary directory to store the output file
|
|
|
106 |
|
107 |
convert_word_to_pdf.click(word_to_pdf, inputs=[word_input], outputs=[pdf_output])
|
108 |
|
109 |
+
app.launch(share=True)
|