Spaces:

Nymbo
/

data-boards

Sleeping

App Files Community

prithivMLmods committed on Dec 20, 2024

Commit

5556171

verified ·

1 Parent(s): cf4a4da

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -22

app.py CHANGED Viewed

@@ -4,45 +4,37 @@ from docx import Document
 from fpdf import FPDF
 import os
 import tempfile
-import pytesseract
-from pdf2image import convert_from_path
 from reportlab.lib.pagesizes import A4
 from reportlab.pdfgen import canvas
 from io import BytesIO
-# Ensure Tesseract is installed and its path is set
-pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update this path to your Tesseract installation
 title_and_description = """
 # PDF to Word and Word to PDF Converter
 This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
-Note: Scanned PDFs (image-based PDFs) are supported using OCR.
 """
 def pdf_to_word(pdf_file):
     """
-    Converts a PDF file to a Word document.
-    Handles both text-based and image-based (scanned) PDFs.
     """
     try:
-        # Check if Poppler is installed
-        try:
-            pages = convert_from_path(pdf_file.name, 500)  # Convert PDF to images
-        except Exception as e:
-            return f"Error: Unable to process PDF. Is Poppler installed and in PATH? ({e})"
-        # Create a temporary directory to store intermediate files
         with tempfile.TemporaryDirectory() as temp_dir:
             docx_filename = os.path.join(temp_dir, os.path.basename(pdf_file.name).replace('.pdf', '.docx'))
-            # If the PDF is image-based, use OCR to extract text
             doc = Document()
-            for page in pages:
-                text = pytesseract.image_to_string(page)  # Extract text from the image
-                doc.add_paragraph(text)  # Add the extracted text to the Word document
-            # Save the Word document
             doc.save(docx_filename)
             # Return the path to the converted file
@@ -53,7 +45,7 @@ def pdf_to_word(pdf_file):
 def word_to_pdf(docx_file):
     """
     Converts a Word document to a PDF file.
-    Handles images carefully using reportlab.
     """
     try:
         # Create a temporary directory to store the output file
@@ -114,4 +106,4 @@ with gr.Blocks() as app:
                 convert_word_to_pdf.click(word_to_pdf, inputs=[word_input], outputs=[pdf_output])
-app.launch()

 from fpdf import FPDF
 import os
 import tempfile
+from pdfminer.high_level import extract_text
 from reportlab.lib.pagesizes import A4
 from reportlab.pdfgen import canvas
 from io import BytesIO
 title_and_description = """
 # PDF to Word and Word to PDF Converter
 This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
+Note: Scanned PDFs (image-based PDFs) are not supported.
 """
 def pdf_to_word(pdf_file):
     """
+    Converts a text-based PDF file to a Word document.
+    Scanned PDFs (image-based PDFs) are not supported.
     """
     try:
+        # Extract text from the PDF using pdfminer
+        text = extract_text(pdf_file.name)
+        if not text.strip():
+            return "Error: The PDF appears to be image-based (scanned). Scanned PDFs are not supported."
+        # Create a temporary directory to store the output file
         with tempfile.TemporaryDirectory() as temp_dir:
             docx_filename = os.path.join(temp_dir, os.path.basename(pdf_file.name).replace('.pdf', '.docx'))
+            # Create a Word document and add the extracted text
             doc = Document()
+            for line in text.splitlines():
+                doc.add_paragraph(line)
             doc.save(docx_filename)
             # Return the path to the converted file
 def word_to_pdf(docx_file):
     """
     Converts a Word document to a PDF file.
+    Handles text and basic formatting.
     """
     try:
         # Create a temporary directory to store the output file
                 convert_word_to_pdf.click(word_to_pdf, inputs=[word_input], outputs=[pdf_output])
+app.launch(share=True)