prithivMLmods committed on
Commit
5556171
·
verified ·
1 Parent(s): cf4a4da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -22
app.py CHANGED
@@ -4,45 +4,37 @@ from docx import Document
4
  from fpdf import FPDF
5
  import os
6
  import tempfile
7
- import pytesseract
8
- from pdf2image import convert_from_path
9
  from reportlab.lib.pagesizes import A4
10
  from reportlab.pdfgen import canvas
11
  from io import BytesIO
12
 
13
- # Ensure Tesseract is installed and its path is set
14
- pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Update this path to your Tesseract installation
15
-
16
  title_and_description = """
17
  # PDF to Word and Word to PDF Converter
18
 
19
  This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
20
- Note: Scanned PDFs (image-based PDFs) are supported using OCR.
21
  """
22
 
23
  def pdf_to_word(pdf_file):
24
  """
25
- Converts a PDF file to a Word document.
26
- Handles both text-based and image-based (scanned) PDFs.
27
  """
28
  try:
29
- # Check if Poppler is installed
30
- try:
31
- pages = convert_from_path(pdf_file.name, 500) # Convert PDF to images
32
- except Exception as e:
33
- return f"Error: Unable to process PDF. Is Poppler installed and in PATH? ({e})"
34
 
35
- # Create a temporary directory to store intermediate files
36
  with tempfile.TemporaryDirectory() as temp_dir:
37
  docx_filename = os.path.join(temp_dir, os.path.basename(pdf_file.name).replace('.pdf', '.docx'))
38
 
39
- # If the PDF is image-based, use OCR to extract text
40
  doc = Document()
41
- for page in pages:
42
- text = pytesseract.image_to_string(page) # Extract text from the image
43
- doc.add_paragraph(text) # Add the extracted text to the Word document
44
-
45
- # Save the Word document
46
  doc.save(docx_filename)
47
 
48
  # Return the path to the converted file
@@ -53,7 +45,7 @@ def pdf_to_word(pdf_file):
53
  def word_to_pdf(docx_file):
54
  """
55
  Converts a Word document to a PDF file.
56
- Handles images carefully using reportlab.
57
  """
58
  try:
59
  # Create a temporary directory to store the output file
@@ -114,4 +106,4 @@ with gr.Blocks() as app:
114
 
115
  convert_word_to_pdf.click(word_to_pdf, inputs=[word_input], outputs=[pdf_output])
116
 
117
- app.launch()
 
4
  from fpdf import FPDF
5
  import os
6
  import tempfile
7
+ from pdfminer.high_level import extract_text
 
8
  from reportlab.lib.pagesizes import A4
9
  from reportlab.pdfgen import canvas
10
  from io import BytesIO
11
 
 
 
 
12
  title_and_description = """
13
  # PDF to Word and Word to PDF Converter
14
 
15
  This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
16
+ Note: Scanned PDFs (image-based PDFs) are not supported.
17
  """
18
 
19
  def pdf_to_word(pdf_file):
20
  """
21
+ Converts a text-based PDF file to a Word document.
22
+ Scanned PDFs (image-based PDFs) are not supported.
23
  """
24
  try:
25
+ # Extract text from the PDF using pdfminer
26
+ text = extract_text(pdf_file.name)
27
+ if not text.strip():
28
+ return "Error: The PDF appears to be image-based (scanned). Scanned PDFs are not supported."
 
29
 
30
+ # Create a temporary directory to store the output file
31
  with tempfile.TemporaryDirectory() as temp_dir:
32
  docx_filename = os.path.join(temp_dir, os.path.basename(pdf_file.name).replace('.pdf', '.docx'))
33
 
34
+ # Create a Word document and add the extracted text
35
  doc = Document()
36
+ for line in text.splitlines():
37
+ doc.add_paragraph(line)
 
 
 
38
  doc.save(docx_filename)
39
 
40
  # Return the path to the converted file
 
45
  def word_to_pdf(docx_file):
46
  """
47
  Converts a Word document to a PDF file.
48
+ Handles text and basic formatting.
49
  """
50
  try:
51
  # Create a temporary directory to store the output file
 
106
 
107
  convert_word_to_pdf.click(word_to_pdf, inputs=[word_input], outputs=[pdf_output])
108
 
109
+ app.launch(share=True)