prithivMLmods committed on
Commit
61cdb02
·
verified ·
1 Parent(s): a5f8478

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -30
app.py CHANGED
@@ -4,33 +4,45 @@ from docx import Document
4
  from fpdf import FPDF
5
  import os
6
  import tempfile
 
 
 
 
 
 
 
 
7
 
8
  title_and_description = """
9
  # PDF to Word and Word to PDF Converter
10
 
11
  This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
12
- Note: Scanned PDFs (image-based PDFs) are not supported.
13
  """
14
 
15
  def pdf_to_word(pdf_file):
16
  """
17
  Converts a PDF file to a Word document.
 
18
  """
19
  try:
20
- # Check if the PDF is scanned (image-based)
21
- with open(pdf_file.name, 'rb') as f:
22
- first_page = f.read(1024) # Read the first 1024 bytes of the PDF
23
- if b"/Image" in first_page or b"/XObject" in first_page:
24
- return "Error: Scanned PDFs (image-based PDFs) are not supported."
25
-
26
- # Create a temporary directory to store the output file
27
  with tempfile.TemporaryDirectory() as temp_dir:
28
  docx_filename = os.path.join(temp_dir, os.path.basename(pdf_file.name).replace('.pdf', '.docx'))
29
 
30
- # Convert PDF to Word
31
- cv = Converter(pdf_file.name)
32
- cv.convert(docx_filename, multi_processing=True, start=0, end=None)
33
- cv.close()
 
 
 
 
 
 
 
 
 
34
 
35
  # Return the path to the converted file
36
  return docx_filename
@@ -40,38 +52,42 @@ def pdf_to_word(pdf_file):
40
  def word_to_pdf(docx_file):
41
  """
42
  Converts a Word document to a PDF file.
 
43
  """
44
  try:
45
  # Create a temporary directory to store the output file
46
  with tempfile.TemporaryDirectory() as temp_dir:
47
  pdf_filename = os.path.join(temp_dir, "output.pdf")
48
 
49
- # Convert Word to PDF
 
 
 
 
 
50
  doc = Document(docx_file.name)
51
- pdf = FPDF(format='A4')
52
- pdf.set_auto_page_break(auto=True, margin=15)
53
- pdf.add_page()
54
- pdf.add_font('Arial', '', 'Arial.ttf', uni=True)
55
- pdf.set_font('Arial', size=12)
56
 
57
  for para in doc.paragraphs:
58
  text = para.text.strip()
59
  if not text:
60
  continue
61
 
62
- words = text.split()
63
- line = ''
64
- for word in words:
65
- if pdf.get_string_width(line + word) < (pdf.w - 2 * pdf.l_margin):
66
- line += word + ' '
67
- else:
68
- pdf.cell(0, 10, line, ln=True)
69
- line = word + ' '
70
- if line:
71
- pdf.cell(0, 10, line, ln=True)
 
 
 
 
72
 
73
- pdf.output(pdf_filename)
74
-
75
  # Return the path to the converted file
76
  return pdf_filename
77
  except Exception as e:
 
4
  from fpdf import FPDF
5
  import os
6
  import tempfile
7
+ import pytesseract
8
+ from pdf2image import convert_from_path
9
+ from reportlab.lib.pagesizes import A4
10
+ from reportlab.pdfgen import canvas
11
+ from io import BytesIO
12
+
13
+ # Ensure Tesseract is installed and its path is set
14
+ pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Update this path to your Tesseract installation
15
 
16
  title_and_description = """
17
  # PDF to Word and Word to PDF Converter
18
 
19
  This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
20
+ Note: Scanned PDFs (image-based PDFs) are supported using OCR.
21
  """
22
 
23
  def pdf_to_word(pdf_file):
24
  """
25
  Converts a PDF file to a Word document.
26
+ Handles both text-based and image-based (scanned) PDFs.
27
  """
28
  try:
29
+ # Create a temporary directory to store intermediate files
 
 
 
 
 
 
30
  with tempfile.TemporaryDirectory() as temp_dir:
31
  docx_filename = os.path.join(temp_dir, os.path.basename(pdf_file.name).replace('.pdf', '.docx'))
32
 
33
+ # Check if the PDF is text-based or image-based
34
+ pages = convert_from_path(pdf_file.name, 500) # Convert PDF to images
35
+ if len(pages) == 0:
36
+ return "Error: Unable to process the PDF file."
37
+
38
+ # If the PDF is image-based, use OCR to extract text
39
+ doc = Document()
40
+ for page in pages:
41
+ text = pytesseract.image_to_string(page) # Extract text from the image
42
+ doc.add_paragraph(text) # Add the extracted text to the Word document
43
+
44
+ # Save the Word document
45
+ doc.save(docx_filename)
46
 
47
  # Return the path to the converted file
48
  return docx_filename
 
52
  def word_to_pdf(docx_file):
53
  """
54
  Converts a Word document to a PDF file.
55
+ Handles images carefully using reportlab.
56
  """
57
  try:
58
  # Create a temporary directory to store the output file
59
  with tempfile.TemporaryDirectory() as temp_dir:
60
  pdf_filename = os.path.join(temp_dir, "output.pdf")
61
 
62
+ # Create a PDF using reportlab
63
+ packet = BytesIO()
64
+ can = canvas.Canvas(packet, pagesize=A4)
65
+ can.setFont("Helvetica", 12)
66
+
67
+ # Read the Word document
68
  doc = Document(docx_file.name)
69
+ y = 800 # Starting y-coordinate for text
 
 
 
 
70
 
71
  for para in doc.paragraphs:
72
  text = para.text.strip()
73
  if not text:
74
  continue
75
 
76
+ # Add text to the PDF
77
+ can.drawString(100, y, text)
78
+ y -= 15 # Move down for the next line
79
+
80
+ # Handle page breaks
81
+ if y < 50:
82
+ can.showPage()
83
+ y = 800
84
+
85
+ # Save the PDF
86
+ can.save()
87
+ packet.seek(0)
88
+ with open(pdf_filename, "wb") as f:
89
+ f.write(packet.read())
90
 
 
 
91
  # Return the path to the converted file
92
  return pdf_filename
93
  except Exception as e: