Spaces:

not-lain
/

utils

Running

App Files Files Community

utils / app.py

not-lain

fix missing return

339b0e9 29 days ago

raw

history blame

6.71 kB

	import gradio as gr
	import warnings
	from typing import List
	import json
	from pdfitdown.pdfconversion import convert_to_pdf, convert_markdown_to_pdf

	from base_utils import (
	convert_pdf_to_image,
	extract_text_from_pdf,
	convert_doc_to_text,
	extract_text_from_docx,
	extract_text_from_ppt,
	extract_text_from_pptx,
	sanitize_list_of_lists,
	parse_url,
	)

	# One Gradio interface per extraction helper imported from ``base_utils``.
	# Each takes an uploaded file and is registered under its own ``api_name``.
	pdf_to_img = gr.Interface(
	    fn=convert_pdf_to_image,
	    inputs=gr.File(),
	    outputs=gr.Gallery(),
	    api_name="pdf_to_img",
	)

	pdf_to_text = gr.Interface(
	    fn=extract_text_from_pdf,
	    inputs=gr.File(),
	    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
	    api_name="pdf_to_text",
	)

	# Word documents: legacy .doc and modern .docx go through separate helpers.
	doc_to_text = gr.Interface(
	    fn=convert_doc_to_text,
	    inputs=gr.File(),
	    outputs=gr.Textbox(),
	    api_name="doc_to_text",
	)

	docx_to_text = gr.Interface(
	    fn=extract_text_from_docx,
	    inputs=gr.File(),
	    outputs=gr.Textbox(),
	    api_name="docx_to_text",
	)

	# Presentations: likewise one interface each for .ppt and .pptx.
	ppt_to_text = gr.Interface(
	    fn=extract_text_from_ppt,
	    inputs=gr.File(),
	    outputs=gr.Textbox(),
	    api_name="ppt_to_text",
	)

	pptx_to_text = gr.Interface(
	    fn=extract_text_from_pptx,
	    inputs=gr.File(),
	    outputs=gr.Textbox(),
	    api_name="pptx_to_text",
	)
	# Text -> JSON interface backed by ``sanitize_list_of_lists``; the single
	# example is a list of [question, answer] pairs supplied as one string.
	str_to_json = gr.Interface(
	    fn=sanitize_list_of_lists,
	    inputs=gr.Text(),
	    outputs=gr.JSON(),
	    api_name="str_to_json",
	    examples=[
	        """[
	["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
	["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
	["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
	["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
	["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
	["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
	["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
	["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
	]"""
	    ],
	)

	# Text in, text out: hands the submitted string to ``parse_url``.
	url_parser = gr.Interface(
	    fn=parse_url,
	    inputs=["text"],
	    outputs=["text"],
	    api_name="url_to_text",
	)


	class FileNotConvertedWarning(Warning):
	    """Warning category for a file whose format is not supported for PDF conversion."""


	def to_pdf(files: List[str]) -> List[str]:
	    """Convert every supported file in ``files`` to PDF.

	    Files ending in ``.pdf`` are passed through unchanged. Files ending in
	    ``.docx``, ``.html``, ``.pptx``, ``.csv`` or ``.xml`` go through
	    ``convert_to_pdf`` and ``.md`` files through ``convert_markdown_to_pdf``.
	    Any other file is skipped with a ``FileNotConvertedWarning``.

	    Args:
	        files: Paths of the files to convert.

	    Returns:
	        One entry per supported input, in input order: the original path for
	        PDFs, otherwise whatever the converter returned for that file.
	    """
	    # Extensions handled by the generic converter (markdown has its own).
	    generic_exts = (".docx", ".html", ".pptx", ".csv", ".xml")
	    pdfs = []
	    for f in files:
	        if f.endswith(".pdf"):
	            pdfs.append(f)
	            continue
	        if f.endswith(generic_exts):
	            converter = convert_to_pdf
	        elif f.endswith(".md"):
	            converter = convert_markdown_to_pdf
	        else:
	            warnings.warn(
	                f"File {f} was not converted to PDF because its file format is not included in those that can be converted",
	                FileNotConvertedWarning,
	            )
	            continue
	        # Strip only the trailing extension. Replacing the extension text
	        # anywhere in the path, or cutting at the first dot, mangles paths
	        # such as "./x.docx" or "report.v2.docx".
	        stem = f[: f.rfind(".")]
	        # Third argument is presumably the document title - confirm against
	        # the pdfitdown API.
	        pdfs.append(converter(f, stem + ".pdf", stem))
	    return pdfs


	def convert(file: str) -> List[str]:
	    """Convert a single file to PDF by delegating to ``to_pdf``.

	    Args:
	        file: Path of the file to convert.

	    Returns:
	        The list produced by ``to_pdf`` for this one file. It is empty when
	        the format is unsupported, because ``to_pdf`` skips such files.
	    """
	    # to_pdf works on a list, so wrap the single path.
	    return to_pdf([file])


	def parse_MCQs(mcq_string: str) -> list:
	    """Extract and parse the JSON array embedded in ``mcq_string``.

	    Everything before the first ``[`` and after the last ``]`` is discarded,
	    so surrounding text such as a Markdown code fence is ignored.

	    Args:
	        mcq_string: Text containing a JSON array.

	    Returns:
	        The decoded JSON array as a Python list.

	    Raises:
	        ValueError: If the text contains no ``[``, or if the extracted span
	            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
	    """
	    _, opening, remainder = mcq_string.partition("[")
	    if not opening:
	        raise ValueError("No JSON array found: input contains no '['")
	    # Cut at the last "]" and put it back; if there is none, one is appended.
	    payload = ("[" + remainder).rsplit("]", 1)[0] + "]"
	    return json.loads(payload)


	# Interface around ``parse_MCQs``. The example is a fenced JSON block, the
	# kind of input ``parse_MCQs`` strips down to its bracketed array.
	mcqs_to_json = gr.Interface(
	    fn=parse_MCQs,
	    inputs=gr.Textbox(),
	    outputs=gr.JSON(),
	    api_name="mcqs_to_json",
	    examples=[
	        [
	            """```json
	[
	{
	"question": "Which of the following best describes the nature of business?",
	"options": {
	"A": "It is primarily a non-economic activity",
	"B": "It involves personal consumption of goods",
	"C": "It includes regular and continuous transactions for profit",
	"D": "It excludes exchange of goods and services"
	},
	"answer": "C"
	},
	{
	"question": "According to the document, what is a primary objective of business under economic objectives?",
	"options": {
	"A": "Employee welfare",
	"B": "Profit earning",
	"C": "Creating entertainment content",
	"D": "Reducing government involvement"
	},
	"answer": "B"
	},
	{
	"question": "Which of the following is a component of commerce?",
	"options": {
	"A": "Mining",
	"B": "Manufacturing",
	"C": "Warehousing",
	"D": "Farming"
	},
	"answer": "C"
	},
	{
	"question": "What is an example of a synthetic manufacturing industry?",
	"options": {
	"A": "Oil refining",
	"B": "Textile processing",
	"C": "Soap production",
	"D": "Watch assembly"
	},
	"answer": "C"
	},
	{
	"question": "Which aid to trade helps in overcoming the hindrance of knowledge in commerce?",
	"options": {
	"A": "Banking",
	"B": "Insurance",
	"C": "Advertising",
	"D": "Warehousing"
	},
	"answer": "C"
	}
	]
	```
	"""
	        ]
	    ],
	    cache_examples=False,
	)

	# Upload one file and receive the PDF(s) produced by ``convert``.
	pdf_converter = gr.Interface(
	    fn=convert,
	    inputs=gr.File(label="Upload your file"),
	    outputs=gr.File(label="Converted PDF"),
	    title="File to PDF Converter",
	    description="Upload a file in .docx, .pdf, .html, .pptx, .csv, .xml, or .md format, and get it converted to PDF.",
	    api_name="convert_to_pdf",
	)

	# Combine every interface into one tabbed app. The two lists are paired by
	# position, so keep them in the same order when adding or removing a tab.
	demo = gr.TabbedInterface(
	    interface_list=[
	        pdf_to_img,
	        pdf_to_text,
	        doc_to_text,
	        docx_to_text,
	        ppt_to_text,
	        pptx_to_text,
	        url_parser,
	        str_to_json,
	        mcqs_to_json,
	        pdf_converter,
	    ],
	    tab_names=[
	        "PDF to Image",
	        "Extract PDF Text",
	        "Extract DOC Text",
	        "Extract DOCX Text",
	        "Extract PPT Text",
	        "Extract PPTX Text",
	        "Extract text from URL",
	        "Extract Json",
	        "Parse MCQs",
	        "Convert to PDF",
	    ],
	)

	# "0.0.0.0" (no trailing dot) is the value Gradio documents for making the
	# app reachable from other hosts; "0.0.0.0." is not a valid IPv4 literal.
	demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)