|
import gradio as gr |
|
import warnings |
|
from typing import List |
|
import json |
|
from pdfitdown.pdfconversion import convert_to_pdf, convert_markdown_to_pdf |
|
|
|
from base_utils import ( |
|
convert_pdf_to_image, |
|
extract_text_from_pdf, |
|
convert_doc_to_text, |
|
extract_text_from_docx, |
|
extract_text_from_ppt, |
|
extract_text_from_pptx, |
|
sanitize_list_of_lists, |
|
parse_url, |
|
) |
|
|
|
# File-based extraction tabs: each interface takes one uploaded file, hands it
# to a helper imported from base_utils, and is published under its own
# `api_name`.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)

pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

doc_to_text = gr.Interface(
    fn=convert_doc_to_text,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="doc_to_text",
)

docx_to_text = gr.Interface(
    fn=extract_text_from_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="docx_to_text",
)

ppt_to_text = gr.Interface(
    fn=extract_text_from_ppt,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="ppt_to_text",
)

pptx_to_text = gr.Interface(
    fn=extract_text_from_pptx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="pptx_to_text",
)
|
# Text-in / JSON-out tab backed by `sanitize_list_of_lists`. The single bundled
# example is a JSON-style list of [question, answer] pairs.
str_to_json = gr.Interface(
    fn=sanitize_list_of_lists,
    inputs=gr.Text(),
    outputs=gr.JSON(),
    examples=[
        """[
    ["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
    ["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
    ["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
    ["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
    ["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
    ["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
    ["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
    ["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
]"""
    ],
    api_name="str_to_json",
)
|
|
|
# Plain text in, plain text out: the input is handed to `parse_url` and the
# endpoint is published as "url_to_text".
url_parser = gr.Interface(
    fn=parse_url,
    inputs=["text"],
    outputs=["text"],
    api_name="url_to_text",
)
|
|
|
|
|
class FileNotConvertedWarning(Warning):
    """Warning category for a file whose format cannot be converted to PDF."""
|
|
|
|
|
def to_pdf(files: List[str]) -> List[str]:
    """Convert every supported file in *files* to PDF and collect the results.

    Files ending in ``.pdf`` are passed through unchanged. ``.md`` files go
    through ``convert_markdown_to_pdf``; ``.docx``, ``.html``, ``.pptx``,
    ``.csv`` and ``.xml`` files go through ``convert_to_pdf``. Suffix matching
    is case-sensitive. Any other file is skipped with a
    ``FileNotConvertedWarning``.

    Args:
        files: Paths of the files to convert.

    Returns:
        One entry per handled file, in input order: the original path for
        PDFs, otherwise whatever the converter returned (presumably the
        output path -- confirm against pdfitdown). Skipped files contribute
        nothing, so the result may be shorter than *files*.
    """
    # Formats routed through the generic converter; Markdown has its own.
    generic_suffixes = (".docx", ".html", ".pptx", ".csv", ".xml")

    pdfs = []
    for path in files:
        if path.endswith(".pdf"):
            # Already a PDF: nothing to convert.
            pdfs.append(path)
            continue

        if path.endswith(".md"):
            converter = convert_markdown_to_pdf
        elif path.endswith(generic_suffixes):
            converter = convert_to_pdf
        else:
            warnings.warn(
                f"File {path} was not converted to PDF because its file format is not included in those that can be converted",
                FileNotConvertedWarning,
            )
            continue

        # Strip only the trailing extension. The matched suffix holds the last
        # dot of the path, so dots in directory names or earlier in the file
        # name are left intact (str.replace / split(".")[0] used to mangle
        # such paths).
        stem = path[: path.rfind(".")]
        # The third argument is the extension-less path, as before; it is
        # presumably used as the document title -- confirm against pdfitdown.
        pdfs.append(converter(path, stem + ".pdf", stem))
    return pdfs
|
|
|
|
|
def convert(file: str) -> List[str]:
    """Convert a single file to PDF by delegating to ``to_pdf``.

    Args:
        file: Path of the file to convert.

    Returns:
        The list produced by ``to_pdf`` for this one file: a single entry
        when the file was handled, or an empty list when its format is not
        supported (``to_pdf`` emits a warning in that case).
    """
    return to_pdf([file])
|
|
|
|
|
def parse_MCQs(mcq_string: str) -> list:
    """Extract and parse the JSON array embedded in *mcq_string*.

    Everything before the first ``[`` and after the last ``]`` is discarded,
    so surrounding text such as a Markdown code fence is tolerated. If no
    ``]`` follows the opening bracket, one is appended before parsing.

    Args:
        mcq_string: Text containing a JSON array.

    Returns:
        The decoded JSON array. Element types are whatever the JSON holds.

    Raises:
        ValueError: If the text contains no ``[``, or if the extracted span
            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    _, opening, tail = mcq_string.partition("[")
    if not opening:
        # Previously this surfaced as an opaque "list index out of range".
        raise ValueError("No JSON array found in input: expected a '[' character")

    # Cut at the last closing bracket (if any) and restore both delimiters.
    payload = "[" + tail.rsplit("]", 1)[0] + "]"
    return json.loads(payload)
|
|
|
|
|
# Tab that runs `parse_MCQs` on pasted text and shows the decoded JSON. The
# bundled example is a fenced JSON array of question/options/answer objects;
# example caching is switched off.
mcqs_to_json = gr.Interface(
    fn=parse_MCQs,
    inputs=gr.Textbox(),
    outputs=gr.JSON(),
    examples=[
        [
            """```json
[
  {
    "question": "Which of the following best describes the nature of business?",
    "options": {
      "A": "It is primarily a non-economic activity",
      "B": "It involves personal consumption of goods",
      "C": "It includes regular and continuous transactions for profit",
      "D": "It excludes exchange of goods and services"
    },
    "answer": "C"
  },
  {
    "question": "According to the document, what is a primary objective of business under economic objectives?",
    "options": {
      "A": "Employee welfare",
      "B": "Profit earning",
      "C": "Creating entertainment content",
      "D": "Reducing government involvement"
    },
    "answer": "B"
  },
  {
    "question": "Which of the following is a component of commerce?",
    "options": {
      "A": "Mining",
      "B": "Manufacturing",
      "C": "Warehousing",
      "D": "Farming"
    },
    "answer": "C"
  },
  {
    "question": "What is an example of a synthetic manufacturing industry?",
    "options": {
      "A": "Oil refining",
      "B": "Textile processing",
      "C": "Soap production",
      "D": "Watch assembly"
    },
    "answer": "C"
  },
  {
    "question": "Which aid to trade helps in overcoming the hindrance of knowledge in commerce?",
    "options": {
      "A": "Banking",
      "B": "Insurance",
      "C": "Advertising",
      "D": "Warehousing"
    },
    "answer": "C"
  }
]
```
"""
        ]
    ],
    cache_examples=False,
    api_name="mcqs_to_json",
)
|
|
|
# Upload-and-convert tab: the uploaded file goes to `convert`, and the result
# is offered back through a file component.
pdf_converter = gr.Interface(
    fn=convert,
    inputs=gr.File(label="Upload your file"),
    outputs=gr.File(label="Converted PDF"),
    api_name="convert_to_pdf",
    title="File to PDF Converter",
    description=(
        "Upload a file in .docx, .pdf, .html, .pptx, .csv, .xml, or .md format, "
        "and get it converted to PDF."
    ),
)
|
|
|
# Each tab label is kept next to the interface it opens; the two parallel lists
# that TabbedInterface expects are derived from this table in the same order.
_TABS = [
    ("PDF to Image", pdf_to_img),
    ("Extract PDF Text", pdf_to_text),
    ("Extract DOC Text", doc_to_text),
    ("Extract DOCX Text", docx_to_text),
    ("Extract PPT Text", ppt_to_text),
    ("Extract PPTX Text", pptx_to_text),
    ("Extract text from URL", url_parser),
    ("Extract Json", str_to_json),
    ("Parse MCQs", mcqs_to_json),
    ("Convert to PDF", pdf_converter),
]

demo = gr.TabbedInterface(
    [interface for _, interface in _TABS],
    [label for label, _ in _TABS],
)
|
|
|
# Serve on port 7860, bound to the wildcard IPv4 address. The host must be the
# plain dotted quad "0.0.0.0"; the previous value carried a stray trailing dot.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
|
|