Spaces:

not-lain
/

tools

Running

App Files Files Community

tools / app.py

not-lain

add pdf to word conversion tab

579e5f2 9 days ago

raw

history blame

3.97 kB

	import io
	import os
	from typing import List

	import fitz
	import gradio as gr
	import pdfplumber
	from docx import Document
	from loadimg import load_img
	from markitdown import MarkItDown
	from pdfitdown.pdfconversion import Converter
	from PIL import Image

	# Shared converter instances, created once at import time and reused by the
	# handler functions defined below.
	converter = Converter()  # pdfitdown converter, used to write PDF files
	md = MarkItDown()  # markitdown converter, used to produce Markdown text


	def convert_file_to_pdf(filename: str) -> str:
	    """
	    Converts an uploaded file (e.g. a README/Markdown file) to PDF format.

	    Args:
	        filename: The file to convert, given either as a plain path string
	            or as an object whose ``.name`` attribute holds the path.

	    Returns:
	        str: The file path of the generated PDF file, which is the input
	        path with its extension replaced by ``.pdf``.

	    Raises:
	        gr.Error: If no file was provided.
	    """
	    if filename is None:
	        # Surface a readable message instead of an AttributeError on None.
	        raise gr.Error("Please upload a file to convert.")

	    # Accept both a plain path and a file-like wrapper exposing ``.name``.
	    source_path = getattr(filename, "name", filename)

	    # splitext only inspects the last path component, so a dot inside a
	    # parent directory name can no longer truncate the output path (which
	    # rsplit(".", 1) did for extension-less files).
	    output_path = os.path.splitext(source_path)[0] + ".pdf"
	    converter.convert(source_path, output_path)
	    return output_path


	def convert_file_to_img(image_file: str = None, txt: str = "") -> List[Image.Image]:
	    """
	    Convert an uploaded file and/or an image reference into a list of images.

	    The text input, when given, is loaded directly as a single image. The
	    uploaded file, when given, is first converted to PDF and every page of
	    that PDF is then rendered to an RGB image.

	    Args:
	        image_file: Optional file to convert, given either as a plain path
	            string or as an object whose ``.name`` attribute holds the path.
	        txt: Optional image reference passed to ``load_img`` (the UI labels
	            this input "base64, url").

	    Returns:
	        List[Image.Image]: The image loaded from ``txt`` (if any) followed by
	        one image per rendered PDF page (if a file was given). The list is
	        empty when neither input is provided.
	    """
	    img_list = []

	    # Treat None and whitespace-only text the same as an empty string so
	    # that load_img is only called with an actual reference.
	    image_ref = txt.strip() if txt else ""
	    if image_ref:
	        img_list.append(load_img(image_ref, output_type="pil"))

	    if image_file is not None:
	        # Accept both a plain path and a file-like wrapper exposing ``.name``.
	        source_path = getattr(image_file, "name", image_file)
	        # splitext only inspects the last path component, so a dot inside a
	        # parent directory name cannot truncate the output path.
	        output_path = os.path.splitext(source_path)[0] + ".pdf"
	        converter.convert(source_path, output_path)

	        # The context manager closes the document even if rendering fails.
	        with fitz.open(output_path) as doc:
	            for page in doc:
	                page_bytes = page.get_pixmap().tobytes("png")
	                page_img = Image.open(io.BytesIO(page_bytes))
	                img_list.append(load_img(page_img).convert("RGB"))

	    return img_list


	def convert_file_to_markdown(filename: str) -> str:
	    """
	    Converts a file to markdown format using markitdown.

	    Args:
	        filename: The file to convert, given either as a plain path string
	            or as an object whose ``.name`` attribute holds the path.

	    Returns:
	        str: The markdown representation of the file.

	    Raises:
	        gr.Error: If no file was provided.
	    """
	    if filename is None:
	        # Surface a readable message instead of an AttributeError on None.
	        raise gr.Error("Please upload a file to convert.")

	    # Accept both a plain path and a file-like wrapper exposing ``.name``.
	    source_path = getattr(filename, "name", filename)
	    return md.convert(source_path).text_content


	def convert_pdf_to_word(filename: str) -> str:
	    """
	    Converts a PDF file to Word format.

	    Only the extractable text is carried over. Each page that yields text
	    becomes one paragraph in the generated document, and pages without
	    extractable text are skipped.

	    Args:
	        filename: The PDF file to convert, given either as a plain path
	            string or as an object whose ``.name`` attribute holds the path.

	    Returns:
	        str: The file path of the generated Word file, which is the input
	        path with its extension replaced by ``.docx``.

	    Raises:
	        gr.Error: If no file was provided.
	    """
	    if filename is None:
	        # Surface a readable message instead of an AttributeError on None.
	        raise gr.Error("Please upload a PDF file to convert.")

	    # Accept both a plain path and a file-like wrapper exposing ``.name``.
	    source_path = getattr(filename, "name", filename)

	    # splitext only inspects the last path component, so a dot inside a
	    # parent directory name cannot truncate the output path.
	    output_path = os.path.splitext(source_path)[0] + ".docx"
	    doc = Document()

	    with pdfplumber.open(source_path) as pdf:
	        for page in pdf.pages:
	            text = page.extract_text()
	            # Skip pages for which no text could be extracted.
	            if text:
	                doc.add_paragraph(text)

	    doc.save(output_path)
	    return output_path


	# Build one standalone interface per conversion; they are combined into
	# tabs further down.
	# Tab 1: uploaded file in, generated PDF file out.
	file_to_pdf = gr.Interface(
	    title="File to PDF Converter",
	    description="Convert your files to PDF format",
	    fn=convert_file_to_pdf,
	    inputs=gr.File(label="Upload README/Markdown file"),
	    outputs=gr.File(label="Converted PDF"),
	)

	# Tab 2: an uploaded file and/or a text reference in, image gallery out.
	file_to_image = gr.Interface(
	    title="File to Images Converter",
	    description="Convert your images to an image format",
	    fn=convert_file_to_img,
	    inputs=[
	        gr.File(label="Upload Image"),
	        gr.Textbox(label="base64, url"),
	    ],
	    outputs=gr.Gallery(label="Converted Images"),
	)

	# Tab 3: uploaded file in, Markdown text out.
	file_to_markdown = gr.Interface(
	    title="File to Markdown Converter",
	    description="Convert your files to markdown format",
	    fn=convert_file_to_markdown,
	    inputs=gr.File(label="Upload File"),
	    outputs=gr.Textbox(label="Converted Markdown"),
	)

	# Tab 4: uploaded PDF in, generated Word document out.
	pdf_to_word = gr.Interface(
	    title="PDF to Word Converter",
	    description="Convert your PDF files to Word format",
	    fn=convert_pdf_to_word,
	    inputs=gr.File(label="Upload PDF file"),
	    outputs=gr.File(label="Converted Word Document"),
	)

	# Combine the individual interfaces into a single app, one tab each; the
	# tab names are listed in the same order as the interfaces.
	demo = gr.TabbedInterface(
	    interface_list=[
	        file_to_pdf,
	        file_to_image,
	        file_to_markdown,
	        pdf_to_word,
	    ],
	    tab_names=[
	        "File to PDF",
	        "File to Image",
	        "File to Markdown",
	        "PDF to Word",
	    ],
	)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch(
	        server_name="0.0.0.0",
	        server_port=7860,
	        debug=True,
	        mcp_server=True,
	    )