prithivMLmods committed on
Commit
4194e87
·
verified ·
1 Parent(s): 59a53f8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -99
app.py CHANGED
@@ -1,109 +1,66 @@
1
  import gradio as gr
2
- from pdf2docx import Converter
3
  from docx import Document
4
- from fpdf import FPDF
5
- import os
6
- import tempfile
7
- from pdfminer.high_level import extract_text
8
- from reportlab.lib.pagesizes import A4
9
  from reportlab.pdfgen import canvas
10
- from io import BytesIO
11
-
12
- title_and_description = """
13
- # PDF to Word and Word to PDF Converter
14
-
15
- This tool allows you to convert PDF files to Word documents and Word documents to PDF files.
16
- Note: Scanned PDFs (image-based PDFs) are not supported.
17
- """
18
 
 
19
  def pdf_to_word(pdf_file):
20
- """
21
- Converts a text-based PDF file to a Word document.
22
- Scanned PDFs (image-based PDFs) are not supported.
23
- """
24
- try:
25
- # Extract text from the PDF using pdfminer
26
- text = extract_text(pdf_file.name)
27
- if not text.strip():
28
- return "Error: The PDF appears to be image-based (scanned). Scanned PDFs are not supported."
29
-
30
- # Create a temporary directory to store the output file
31
- with tempfile.TemporaryDirectory() as temp_dir:
32
- docx_filename = os.path.join(temp_dir, os.path.basename(pdf_file.name).replace('.pdf', '.docx'))
33
-
34
- # Create a Word document and add the extracted text
35
- doc = Document()
36
- for line in text.splitlines():
37
- doc.add_paragraph(line)
38
- doc.save(docx_filename)
39
-
40
- # Return the path to the converted file
41
- return docx_filename
42
- except Exception as e:
43
- return f"Error: {e}"
44
 
 
45
  def word_to_pdf(docx_file):
46
- """
47
- Converts a Word document to a PDF file.
48
- Handles text and basic formatting.
49
- """
50
- try:
51
- # Create a temporary directory to store the output file
52
- with tempfile.TemporaryDirectory() as temp_dir:
53
- pdf_filename = os.path.join(temp_dir, "output.pdf")
54
-
55
- # Create a PDF using reportlab
56
- packet = BytesIO()
57
- can = canvas.Canvas(packet, pagesize=A4)
58
- can.setFont("Helvetica", 12)
59
-
60
- # Read the Word document
61
- doc = Document(docx_file.name)
62
- y = 800 # Starting y-coordinate for text
63
-
64
- for para in doc.paragraphs:
65
- text = para.text.strip()
66
- if not text:
67
- continue
68
-
69
- # Add text to the PDF
70
- can.drawString(100, y, text)
71
- y -= 15 # Move down for the next line
72
-
73
- # Handle page breaks
74
- if y < 50:
75
- can.showPage()
76
- y = 800
77
-
78
- # Save the PDF
79
- can.save()
80
- packet.seek(0)
81
- with open(pdf_filename, "wb") as f:
82
- f.write(packet.read())
83
-
84
- # Return the path to the converted file
85
- return pdf_filename
86
- except Exception as e:
87
- return f"Error: {e}"
88
 
89
- with gr.Blocks() as app:
90
- gr.Markdown(title_and_description)
 
 
 
 
 
 
 
91
 
92
- with gr.Row():
93
- with gr.Column():
94
- with gr.Accordion("PDF to Word"):
95
- pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
96
- convert_pdf_to_word = gr.Button("Convert to Word")
97
- word_output = gr.File(label="Download Word file", type="filepath", file_types=[".docx"])
98
-
99
- convert_pdf_to_word.click(pdf_to_word, inputs=[pdf_input], outputs=[word_output])
100
-
101
- with gr.Column():
102
- with gr.Accordion("Word to PDF"):
103
- word_input = gr.File(label="Upload Word", file_types=[".docx"])
104
- convert_word_to_pdf = gr.Button("Convert to PDF")
105
- pdf_output = gr.File(label="Download PDF file", type="filepath", file_types=[".pdf"])
106
-
107
- convert_word_to_pdf.click(word_to_pdf, inputs=[word_input], outputs=[pdf_output])
108
 
109
- app.launch(share=True)
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF
3
  from docx import Document
 
 
 
 
 
4
  from reportlab.pdfgen import canvas
5
+ from reportlab.lib.pagesizes import letter
6
+ import io
 
 
 
 
 
 
7
 
8
# PDF to Word conversion function
def pdf_to_word(pdf_file):
    """Convert an uploaded PDF into a Word document and return its path.

    The text of every page is extracted with PyMuPDF and written into a
    single paragraph of a new ``.docx`` file.

    Args:
        pdf_file: The upload as delivered by the ``gr.File`` input. Accepted
            forms are a filesystem path (``str`` / ``os.PathLike``), raw
            ``bytes``, an object exposing the upload path as ``.name``, or a
            readable binary file object. ``None`` means nothing was uploaded.

    Returns:
        Path of the generated ``converted.docx`` (inside a fresh temporary
        directory), or ``None`` when no file was supplied. A path is returned
        instead of an in-memory buffer because the ``gr.File`` output this
        function is wired to is fed a file path, not a
        ``(buffer, filename)`` tuple.
    """
    # Function-scope imports: neither module is in the file's import block.
    import os
    import tempfile

    if pdf_file is None:
        return None

    # Open the PDF from whichever representation the upload arrived in.
    upload_path = getattr(pdf_file, "name", None)
    if isinstance(pdf_file, (bytes, bytearray)):
        pdf = fitz.open(stream=bytes(pdf_file), filetype="pdf")
    elif isinstance(pdf_file, (str, os.PathLike)):
        pdf = fitz.open(os.fspath(pdf_file))
    elif isinstance(upload_path, str) and os.path.isfile(upload_path):
        # NOTE(review): legacy Gradio upload wrappers presumably carry the
        # real upload location in ``.name`` while the handle itself may be
        # empty, so the path is preferred over ``.read()`` -- confirm against
        # the pinned gradio version.
        pdf = fitz.open(upload_path)
    else:
        pdf = fitz.open(stream=pdf_file.read(), filetype="pdf")

    try:
        # join() avoids rebuilding the string once per page.
        text = "".join(page.get_text() for page in pdf)
    finally:
        pdf.close()

    # Drop control characters that are illegal in XML (tab, LF and CR are
    # kept); python-docx rejects strings that contain them.
    xml_unsafe = dict.fromkeys(
        code for code in range(32) if code not in (9, 10, 13)
    )
    text = text.translate(xml_unsafe)

    # Create a Word document using python-docx
    document = Document()
    document.add_paragraph(text)

    # A dedicated temp directory keeps the download name "converted.docx"
    # without clashing between concurrent conversions.
    out_path = os.path.join(tempfile.mkdtemp(), "converted.docx")
    document.save(out_path)
    return out_path
 
 
 
 
 
 
 
 
26
 
27
# Word to PDF conversion function
def word_to_pdf(docx_file):
    """Convert an uploaded Word document into a plain-text PDF.

    Paragraph text is read with python-docx and drawn with ReportLab on
    letter-size pages. Lines wider than the printable area are wrapped and
    the output continues on a new page when the bottom margin is reached,
    so no text is dropped. Formatting, tables and images are not carried
    over; only ``paragraph.text`` is used.

    Args:
        docx_file: The upload as delivered by the ``gr.File`` input. Accepted
            forms are a filesystem path, raw ``bytes``, an object exposing
            the upload path as ``.name``, or a readable binary file object.
            ``None`` means nothing was uploaded.

    Returns:
        Path of the generated ``converted.pdf`` (inside a fresh temporary
        directory), or ``None`` when no file was supplied. A path is returned
        instead of an in-memory buffer because the ``gr.File`` output this
        function is wired to is fed a file path, not a
        ``(buffer, filename)`` tuple.
    """
    # Function-scope imports: none of these are in the file's import block.
    import os
    import tempfile

    from reportlab.lib.utils import simpleSplit

    if docx_file is None:
        return None

    # Normalise the upload into something python-docx can open.
    upload_path = getattr(docx_file, "name", None)
    if isinstance(docx_file, (bytes, bytearray)):
        source = io.BytesIO(docx_file)
    elif isinstance(upload_path, str) and os.path.isfile(upload_path):
        # NOTE(review): legacy Gradio upload wrappers presumably carry the
        # real upload location in ``.name`` -- confirm against the pinned
        # gradio version.
        source = upload_path
    else:
        # Already a path or a readable file object.
        source = docx_file

    # Read Word document using python-docx
    doc = Document(source)

    # Page geometry: same text origin as before (50, 750) plus explicit
    # right/bottom margins so wrapping and page breaks can be computed.
    page_width, _page_height = letter
    left, top, bottom = 50, 750, 50
    font_name, font_size, leading = "Helvetica", 12, 14.4
    max_width = page_width - 2 * left

    # Wrap every paragraph to the printable width; an empty paragraph still
    # occupies one blank line.
    lines = []
    for para in doc.paragraphs:
        wrapped = simpleSplit(para.text, font_name, font_size, max_width)
        lines.extend(wrapped or [""])

    lines_per_page = max(1, int((top - bottom) // leading) + 1)

    # Create a PDF using reportlab
    out_path = os.path.join(tempfile.mkdtemp(), "converted.pdf")
    pdf = canvas.Canvas(out_path, pagesize=letter)

    # max(..., 1) guarantees one (blank) page for an empty document.
    for start in range(0, max(len(lines), 1), lines_per_page):
        text_obj = pdf.beginText(left, top)
        text_obj.setFont(font_name, font_size, leading=leading)
        for line in lines[start:start + lines_per_page]:
            text_obj.textLine(line)
        pdf.drawText(text_obj)
        pdf.showPage()

    pdf.save()
    return out_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
# Gradio interface
# Two tabs, one per conversion direction. Each tab holds an upload widget,
# a trigger button and a download widget; the button click passes the upload
# to the matching converter and shows its result in the download widget.
# NOTE(review): type="file" looks like the legacy upload mode; newer Gradio
# releases may only accept "filepath" or "binary" -- confirm against the
# pinned gradio version before changing it, since the converters depend on
# what the component hands them.
with gr.Blocks() as demo:
    gr.Markdown("<h1>PDF to Word and Word to PDF Converter</h1>")

    with gr.Tab("PDF to Word"):
        pdf_input = gr.File(type="file", label="Upload PDF File")
        pdf_convert_btn = gr.Button("Convert to Word")
        word_output = gr.File(label="Download Word File")
        pdf_convert_btn.click(
            fn=pdf_to_word,
            inputs=[pdf_input],
            outputs=[word_output],
        )

    with gr.Tab("Word to PDF"):
        word_input = gr.File(type="file", label="Upload Word File")
        word_convert_btn = gr.Button("Convert to PDF")
        pdf_output = gr.File(label="Download PDF File")
        word_convert_btn.click(
            fn=word_to_pdf,
            inputs=[word_input],
            outputs=[pdf_output],
        )

# Start the web server for the interface defined above.
demo.launch()