not-lain committed on
Commit
579e5f2
·
1 Parent(s): 1400040

add pdf to word conversion tab

Browse files
Files changed (2) hide show
  1. app.py +37 -2
  2. requirements.txt +2 -1
app.py CHANGED
@@ -6,10 +6,13 @@ from PIL import Image
6
  from loadimg import load_img
7
  import io
8
  from markitdown import MarkItDown
 
 
9
 
10
  converter = Converter()
11
  md = MarkItDown()
12
 
 
13
  def convert_file_to_pdf(filename: str) -> str:
14
  """
15
  Converts a markdown file to PDF format.
@@ -65,6 +68,30 @@ def convert_file_to_markdown(filename: str) -> str:
65
  return md.convert(filename.name).text_content
66
 
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  # Create individual interfaces
69
  file_to_pdf = gr.Interface(
70
  fn=convert_file_to_pdf,
@@ -90,10 +117,18 @@ file_to_markdown = gr.Interface(
90
  description="Convert your files to markdown format",
91
  )
92
 
 
 
 
 
 
 
 
 
93
  # Create tabbed interface
94
  demo = gr.TabbedInterface(
95
- [file_to_pdf, file_to_image, file_to_markdown],
96
- ["File to PDF", "File to Image", "File to Markdown"],
97
  )
98
 
99
  if __name__ == "__main__":
 
6
  from loadimg import load_img
7
  import io
8
  from markitdown import MarkItDown
9
+ from docx import Document
10
+ import pdfplumber
11
 
12
  converter = Converter()
13
  md = MarkItDown()
14
 
15
+
16
  def convert_file_to_pdf(filename: str) -> str:
17
  """
18
  Converts a markdown file to PDF format.
 
68
  return md.convert(filename.name).text_content
69
 
70
 
71
+ def convert_pdf_to_word(filename: str) -> str:
72
+ """
73
+ Converts a PDF file to Word format.
74
+
75
+ Args:
76
+ filename: str
77
+ The path to the PDF file to be converted.
78
+
79
+ Returns:
80
+ str: The file path of the generated Word file.
81
+ """
82
+ output_path = filename.name.rsplit(".", 1)[0] + ".docx"
83
+ doc = Document()
84
+
85
+ with pdfplumber.open(filename.name) as pdf:
86
+ for page in pdf.pages:
87
+ text = page.extract_text()
88
+ if text:
89
+ doc.add_paragraph(text)
90
+
91
+ doc.save(output_path)
92
+ return output_path
93
+
94
+
95
  # Create individual interfaces
96
  file_to_pdf = gr.Interface(
97
  fn=convert_file_to_pdf,
 
117
  description="Convert your files to markdown format",
118
  )
119
 
120
+ pdf_to_word = gr.Interface(
121
+ fn=convert_pdf_to_word,
122
+ inputs=gr.File(label="Upload PDF file"),
123
+ outputs=gr.File(label="Converted Word Document"),
124
+ title="PDF to Word Converter",
125
+ description="Convert your PDF files to Word format",
126
+ )
127
+
128
  # Create tabbed interface
129
  demo = gr.TabbedInterface(
130
+ [file_to_pdf, file_to_image, file_to_markdown, pdf_to_word],
131
+ ["File to PDF", "File to Image", "File to Markdown", "PDF to Word"],
132
  )
133
 
134
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -9,4 +9,5 @@ python-pptx
9
  pdfitdown
10
  loadimg
11
  docling
12
- markitdown
 
 
9
  pdfitdown
10
  loadimg
11
  docling
12
+ markitdown
13
+ PyMuPDF