# utils / app.py
# Author: not-lain
# Last commit: "fix missing return" (339b0e9)
# File size: 6.71 kB
import gradio as gr
import warnings
from typing import List
import json
from pdfitdown.pdfconversion import convert_to_pdf, convert_markdown_to_pdf
from base_utils import (
convert_pdf_to_image,
extract_text_from_pdf,
convert_doc_to_text,
extract_text_from_docx,
extract_text_from_ppt,
extract_text_from_pptx,
sanitize_list_of_lists,
parse_url,
)
# Tab wrapping `convert_pdf_to_image`: file upload in, image gallery out.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)
# Tab wrapping `extract_text_from_pdf`: file upload in, text box out.
pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)
# Tab wrapping `convert_doc_to_text`: file upload in, text box out.
doc_to_text = gr.Interface(
    fn=convert_doc_to_text,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="doc_to_text",
)
# Tab wrapping `extract_text_from_docx`: file upload in, text box out.
docx_to_text = gr.Interface(
    fn=extract_text_from_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="docx_to_text",
)
# Tab wrapping `extract_text_from_ppt`: file upload in, text box out.
ppt_to_text = gr.Interface(
    fn=extract_text_from_ppt,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="ppt_to_text",
)
# Tab wrapping `extract_text_from_pptx`: file upload in, text box out.
pptx_to_text = gr.Interface(
    fn=extract_text_from_pptx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="pptx_to_text",
)
# Tab wrapping `sanitize_list_of_lists`: free text in, JSON viewer out.
# The single example is a list of [question, answer] pairs given as one string.
str_to_json = gr.Interface(
    fn=sanitize_list_of_lists,
    inputs=gr.Text(),
    outputs=gr.JSON(),
    examples=[
        """[
["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
]"""
    ],
    api_name="str_to_json",
)
# Tab wrapping `parse_url`: one text input, one text output.
# Note the API endpoint is named "url_to_text", not after the variable.
url_parser = gr.Interface(
    fn=parse_url,
    inputs=["text"],
    outputs=["text"],
    api_name="url_to_text",
)
class FileNotConvertedWarning(Warning):
    """Warning category for a file skipped because its format is not convertible to PDF."""
def to_pdf(files: List[str]) -> List[str]:
    """Convert every supported file in *files* to PDF and collect the results.

    ``.pdf`` inputs are passed through untouched. ``.md`` inputs go through
    ``convert_markdown_to_pdf``; ``.docx``, ``.html``, ``.pptx``, ``.csv`` and
    ``.xml`` inputs go through ``convert_to_pdf``. Any other input triggers a
    ``FileNotConvertedWarning`` and is skipped.

    Extensions are matched case-insensitively, and only the trailing
    extension of the path is swapped for ``.pdf`` (dots elsewhere in the
    path, e.g. in directory names or version numbers, are left alone).

    Args:
        files: Paths of the files to convert.

    Returns:
        One entry per supported input, in input order: the original path for
        PDFs, otherwise whatever the converter returned.
    """
    # Formats handled by the generic converter; Markdown has a dedicated one.
    generic_suffixes = (".docx", ".html", ".pptx", ".csv", ".xml")

    pdfs = []
    for f in files:
        # Compare against a lower-cased copy so "REPORT.PDF" or "Notes.MD"
        # are recognised; the original path is what gets passed on.
        lowered = f.lower()

        if lowered.endswith(".pdf"):
            pdfs.append(f)
            continue

        if lowered.endswith(".md"):
            converter = convert_markdown_to_pdf
        elif lowered.endswith(generic_suffixes):
            converter = convert_to_pdf
        else:
            warnings.warn(
                f"File {f} was not converted to PDF because its file format is not included in those that can be converted",
                FileNotConvertedWarning,
            )
            continue

        # The matched extension contains the last dot of the path, so
        # splitting there strips exactly the extension and nothing else.
        stem = f.rpartition(".")[0]
        newfile = stem + ".pdf"
        # Third argument mirrors the original call: the output path without
        # its extension (presumably used as the document title — confirm
        # against the pdfitdown API).
        pdfs.append(converter(f, newfile, stem))
    return pdfs
def convert(file: str) -> List[str]:
    """Convert a single file to PDF via ``to_pdf``.

    Args:
        file: Path of the file to convert.

    Returns:
        The list produced by ``to_pdf`` for this one input. It is empty when
        the file's format is not supported (``to_pdf`` warns and skips it).
    """
    # `to_pdf` works on a list of paths, so wrap the single path.
    return to_pdf([file])
def parse_MCQs(mcq_string: str) -> List[List[str]]:
    """Decode the JSON array embedded in *mcq_string*.

    Everything before the first ``[`` and after the last ``]`` (for example a
    Markdown code fence) is discarded, and the remainder is parsed as JSON.
    If no ``]`` follows the opening bracket, one is appended before parsing.

    Args:
        mcq_string: Text containing a JSON array.

    Returns:
        The value decoded by ``json.loads``.

    Raises:
        IndexError: If *mcq_string* contains no ``[``.
        json.JSONDecodeError: If the extracted text is not valid JSON.
    """
    # Text following the first opening bracket; indexing [1] raises
    # IndexError when there is no "[" at all.
    after_open = mcq_string.split("[", 1)[1]
    # Cut at the last closing bracket if there is one, else keep everything.
    head, closer, _ = after_open.rpartition("]")
    inner = head if closer else after_open
    return json.loads("[" + inner + "]")
# Tab wrapping `parse_MCQs`: text box in, JSON viewer out.
# The example is a fenced ```json block holding a list of MCQ objects;
# example caching is disabled.
mcqs_to_json = gr.Interface(
    fn=parse_MCQs,
    inputs=gr.Textbox(),
    outputs=gr.JSON(),
    examples=[
        [
            """```json
[
{
"question": "Which of the following best describes the nature of business?",
"options": {
"A": "It is primarily a non-economic activity",
"B": "It involves personal consumption of goods",
"C": "It includes regular and continuous transactions for profit",
"D": "It excludes exchange of goods and services"
},
"answer": "C"
},
{
"question": "According to the document, what is a primary objective of business under economic objectives?",
"options": {
"A": "Employee welfare",
"B": "Profit earning",
"C": "Creating entertainment content",
"D": "Reducing government involvement"
},
"answer": "B"
},
{
"question": "Which of the following is a component of commerce?",
"options": {
"A": "Mining",
"B": "Manufacturing",
"C": "Warehousing",
"D": "Farming"
},
"answer": "C"
},
{
"question": "What is an example of a synthetic manufacturing industry?",
"options": {
"A": "Oil refining",
"B": "Textile processing",
"C": "Soap production",
"D": "Watch assembly"
},
"answer": "C"
},
{
"question": "Which aid to trade helps in overcoming the hindrance of knowledge in commerce?",
"options": {
"A": "Banking",
"B": "Insurance",
"C": "Advertising",
"D": "Warehousing"
},
"answer": "C"
}
]
```
"""
        ]
    ],
    cache_examples=False,
    api_name="mcqs_to_json",
)
# Tab wrapping `convert`: one uploaded file in, the converted PDF file out.
pdf_converter = gr.Interface(
    title="File to PDF Converter",
    description="Upload a file in .docx, .pdf, .html, .pptx, .csv, .xml, or .md format, and get it converted to PDF.",
    fn=convert,
    inputs=gr.File(label="Upload your file"),
    outputs=gr.File(label="Converted PDF"),
    api_name="convert_to_pdf",
)
# Combine every interface into one tabbed app. The two lists are parallel:
# the n-th name labels the n-th interface.
demo = gr.TabbedInterface(
    interface_list=[
        pdf_to_img,
        pdf_to_text,
        doc_to_text,
        docx_to_text,
        ppt_to_text,
        pptx_to_text,
        url_parser,
        str_to_json,
        mcqs_to_json,
        pdf_converter,
    ],
    tab_names=[
        "PDF to Image",
        "Extract PDF Text",
        "Extract DOC Text",
        "Extract DOCX Text",
        "Extract PPT Text",
        "Extract PPTX Text",
        "Extract text from URL",
        "Extract Json",
        "Parse MCQs",
        "Convert to PDF",
    ],
)
# Serve the app on port 7860, listening on all IPv4 interfaces. The host must
# be the plain literal "0.0.0.0"; a trailing dot is not a valid IPv4 address.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)