Spaces:

Nymbo
/

data-boards

Sleeping

App Files Files Community

prithivMLmods committed on Dec 20, 2024

Commit

61cdb02

verified ·

1 Parent(s): a5f8478

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -30

app.py CHANGED Viewed

@@ -4,33 +4,45 @@ from docx import Document
 from fpdf import FPDF
 import os
 import tempfile
 title_and_description = """
 # PDF to Word and Word to PDF Converter
 This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
-Note: Scanned PDFs (image-based PDFs) are not supported.
 """
 def pdf_to_word(pdf_file):
     """
     Converts a PDF file to a Word document.
     """
     try:
-        # Check if the PDF is scanned (image-based)
-        with open(pdf_file.name, 'rb') as f:
-            first_page = f.read(1024)  # Read the first 1024 bytes of the PDF
-            if b"/Image" in first_page or b"/XObject" in first_page:
-                return "Error: Scanned PDFs (image-based PDFs) are not supported."
-        # Create a temporary directory to store the output file
         with tempfile.TemporaryDirectory() as temp_dir:
             docx_filename = os.path.join(temp_dir, os.path.basename(pdf_file.name).replace('.pdf', '.docx'))
-            # Convert PDF to Word
-            cv = Converter(pdf_file.name)
-            cv.convert(docx_filename, multi_processing=True, start=0, end=None)
-            cv.close()
             # Return the path to the converted file
             return docx_filename
@@ -40,38 +52,42 @@ def pdf_to_word(pdf_file):
 def word_to_pdf(docx_file):
     """
     Converts a Word document to a PDF file.
     """
     try:
         # Create a temporary directory to store the output file
         with tempfile.TemporaryDirectory() as temp_dir:
             pdf_filename = os.path.join(temp_dir, "output.pdf")
-            # Convert Word to PDF
             doc = Document(docx_file.name)
-            pdf = FPDF(format='A4')
-            pdf.set_auto_page_break(auto=True, margin=15)
-            pdf.add_page()
-            pdf.add_font('Arial', '', 'Arial.ttf', uni=True)
-            pdf.set_font('Arial', size=12)
             for para in doc.paragraphs:
                 text = para.text.strip()
                 if not text:
                     continue
-                words = text.split()
-                line = ''
-                for word in words:
-                    if pdf.get_string_width(line + word) < (pdf.w - 2 * pdf.l_margin):
-                        line += word + ' '
-                    else:
-                        pdf.cell(0, 10, line, ln=True)
-                        line = word + ' '
-                if line:
-                    pdf.cell(0, 10, line, ln=True)
-            pdf.output(pdf_filename)
             # Return the path to the converted file
             return pdf_filename
     except Exception as e:

 from fpdf import FPDF
 import os
 import tempfile
+import pytesseract
+from pdf2image import convert_from_path
+from reportlab.lib.pagesizes import A4
+from reportlab.pdfgen import canvas
+from io import BytesIO
+# Ensure Tesseract is installed and its path is set
+pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update this path to your Tesseract installation
 title_and_description = """
 # PDF to Word and Word to PDF Converter
 This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
+Note: Scanned PDFs (image-based PDFs) are supported using OCR.
 """
 def pdf_to_word(pdf_file):
     """
     Converts a PDF file to a Word document.
+    Handles both text-based and image-based (scanned) PDFs.
     """
     try:
+        # Create a temporary directory to store intermediate files
         with tempfile.TemporaryDirectory() as temp_dir:
             docx_filename = os.path.join(temp_dir, os.path.basename(pdf_file.name).replace('.pdf', '.docx'))
+            # Render every page of the PDF to an image (500 DPI); no text-layer detection is performed
+            pages = convert_from_path(pdf_file.name, 500)  # Convert PDF to images
+            if len(pages) == 0:
+                return "Error: Unable to process the PDF file."
+            # Run OCR on every page image and add the recognized text to a new Word document
+            doc = Document()
+            for page in pages:
+                text = pytesseract.image_to_string(page)  # Extract text from the image
+                doc.add_paragraph(text)  # Add the extracted text to the Word document
+            # Save the Word document
+            doc.save(docx_filename)
             # Return the path to the converted file
             return docx_filename
 def word_to_pdf(docx_file):
     """
     Converts a Word document to a PDF file.
+    Renders paragraph text only, using reportlab; images in the document are not carried over.
     """
     try:
         # Create a temporary directory to store the output file
         with tempfile.TemporaryDirectory() as temp_dir:
             pdf_filename = os.path.join(temp_dir, "output.pdf")
+            # Create a PDF using reportlab
+            packet = BytesIO()
+            can = canvas.Canvas(packet, pagesize=A4)
+            can.setFont("Helvetica", 12)
+            # Read the Word document
             doc = Document(docx_file.name)
+            y = 800  # Starting y-coordinate for text
             for para in doc.paragraphs:
                 text = para.text.strip()
                 if not text:
                     continue
+                # Add text to the PDF
+                can.drawString(100, y, text)
+                y -= 15  # Move down for the next line
+                # Handle page breaks
+                if y < 50:
+                    can.showPage()
+                    y = 800
+            # Save the PDF
+            can.save()
+            packet.seek(0)
+            with open(pdf_filename, "wb") as f:
+                f.write(packet.read())
             # Return the path to the converted file
             return pdf_filename
     except Exception as e: