File size: 6,709 Bytes
4b0678e d5b5b0f d2203e8 d5b5b0f 5f1077a c577758 d9c1e67 5f1077a d99955f c577758 5d2e8ec d99955f c577758 5f1077a ba611cd 0772fb4 59e60e9 d9c1e67 c577758 d5b5b0f e0f3c82 d5b5b0f d2203e8 339b0e9 d2203e8 d5b5b0f 5f1077a c577758 d2203e8 d5b5b0f c577758 59e60e9 5d2e8ec ba611cd 0772fb4 d9c1e67 59e60e9 d2203e8 d5b5b0f 59e60e9 5f1077a fac9a75 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 |
import gradio as gr
import warnings
from typing import List
import json
from pdfitdown.pdfconversion import convert_to_pdf, convert_markdown_to_pdf
from base_utils import (
convert_pdf_to_image,
extract_text_from_pdf,
convert_doc_to_text,
extract_text_from_docx,
extract_text_from_ppt,
extract_text_from_pptx,
sanitize_list_of_lists,
parse_url,
)
# One Gradio interface per file-extraction helper imported from base_utils.
# Every interface takes a single uploaded file; the PDF-to-image one renders
# its result in a gallery, the others in a text box.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)

pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

doc_to_text = gr.Interface(
    fn=convert_doc_to_text,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="doc_to_text",
)

docx_to_text = gr.Interface(
    fn=extract_text_from_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="docx_to_text",
)

ppt_to_text = gr.Interface(
    fn=extract_text_from_ppt,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="ppt_to_text",
)

pptx_to_text = gr.Interface(
    fn=extract_text_from_pptx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="pptx_to_text",
)
# Interface around sanitize_list_of_lists: free text in, JSON out.
# The single example is a list of [question, answer] pairs written as text.
str_to_json = gr.Interface(
    fn=sanitize_list_of_lists,
    inputs=gr.Text(),
    outputs=gr.JSON(),
    examples=[
        """[
["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
]"""
    ],
    api_name="str_to_json",
)
# Interface around parse_url, using Gradio's "text" shortcut for both the
# input and the output component.
url_parser = gr.Interface(
    fn=parse_url,
    inputs=["text"],
    outputs=["text"],
    api_name="url_to_text",
)
class FileNotConvertedWarning(Warning):
    """Warning category for files whose format cannot be converted to PDF."""
def to_pdf(files: List[str]) -> List[str]:
    """Convert every supported file to PDF and collect the results.

    ``.pdf`` inputs are passed through unchanged. ``.docx``, ``.html``,
    ``.pptx``, ``.csv`` and ``.xml`` files are handed to ``convert_to_pdf``
    and ``.md`` files to ``convert_markdown_to_pdf``. Any other file is
    skipped and reported with a ``FileNotConvertedWarning``.

    The output path is the input path with only its trailing extension
    swapped for ``.pdf``; the third converter argument is that same path
    without the extension (presumably used as the document title by
    PdfItDown -- confirm against the installed version).

    Args:
        files: Paths of the files to convert. Extension matching is
            case-sensitive.

    Returns:
        One entry per accepted input, in input order: the original path for
        ``.pdf`` files, otherwise whatever the converter returned.
    """
    convertible_suffixes = (".docx", ".html", ".pptx", ".csv", ".xml", ".md")
    pdfs = []
    for path in files:
        if path.endswith(".pdf"):
            pdfs.append(path)
            continue

        suffix = next((s for s in convertible_suffixes if path.endswith(s)), None)
        if suffix is None:
            warnings.warn(
                f"File {path} was not converted to PDF because its file format is not included in those that can be converted",
                FileNotConvertedWarning,
            )
            continue

        # Strip only the trailing extension. str.replace() would rewrite every
        # occurrence of it in the path, and split(".")[0] would cut the path at
        # its first dot (e.g. "./report.docx" or a dotted directory name).
        stem = path[: -len(suffix)]
        output_path = stem + ".pdf"
        if suffix == ".md":
            pdfs.append(convert_markdown_to_pdf(path, output_path, stem))
        else:
            pdfs.append(convert_to_pdf(path, output_path, stem))
    return pdfs
def convert(file: str) -> List[str]:
    """Convert a single file to PDF by delegating to ``to_pdf``.

    ``to_pdf`` works on lists, so the path is wrapped in a one-element list
    and the resulting list is returned as-is.

    Args:
        file: Path of the file to convert.

    Returns:
        The list produced by ``to_pdf`` for this one file: a single entry when
        the file was accepted, or an empty list when its format is not
        supported (in which case ``to_pdf`` emits a warning).
    """
    return to_pdf([file])
def parse_MCQs(mcq_string: str) -> list:
    """Extract and parse the JSON array embedded in *mcq_string*.

    Everything before the first ``[`` and after the last ``]`` is discarded,
    so text wrapped around the array (for example a Markdown code fence) is
    ignored. If no ``]`` follows the opening bracket, one is appended before
    parsing.

    Args:
        mcq_string: Text containing a JSON array.

    Returns:
        The decoded array as a Python list.

    Raises:
        ValueError: If the text contains no ``[`` at all, or if the extracted
            span is not valid JSON (``json.JSONDecodeError`` is a subclass of
            ``ValueError``).
    """
    _, opening, tail = mcq_string.partition("[")
    if not opening:
        # Fail with a descriptive error instead of an IndexError from indexing
        # the result of split().
        raise ValueError("no JSON array found: input contains no '['")
    # Drop whatever trails the last closing bracket, then restore both brackets.
    body = tail.rsplit("]", 1)[0]
    return json.loads("[" + body + "]")
# Interface around parse_MCQs: text in, parsed JSON out. The example is a
# Markdown-fenced JSON array of question objects; example caching is disabled.
mcqs_to_json = gr.Interface(
    fn=parse_MCQs,
    inputs=gr.Textbox(),
    outputs=gr.JSON(),
    api_name="mcqs_to_json",
    cache_examples=False,
    examples=[
        [
            """```json
[
{
"question": "Which of the following best describes the nature of business?",
"options": {
"A": "It is primarily a non-economic activity",
"B": "It involves personal consumption of goods",
"C": "It includes regular and continuous transactions for profit",
"D": "It excludes exchange of goods and services"
},
"answer": "C"
},
{
"question": "According to the document, what is a primary objective of business under economic objectives?",
"options": {
"A": "Employee welfare",
"B": "Profit earning",
"C": "Creating entertainment content",
"D": "Reducing government involvement"
},
"answer": "B"
},
{
"question": "Which of the following is a component of commerce?",
"options": {
"A": "Mining",
"B": "Manufacturing",
"C": "Warehousing",
"D": "Farming"
},
"answer": "C"
},
{
"question": "What is an example of a synthetic manufacturing industry?",
"options": {
"A": "Oil refining",
"B": "Textile processing",
"C": "Soap production",
"D": "Watch assembly"
},
"answer": "C"
},
{
"question": "Which aid to trade helps in overcoming the hindrance of knowledge in commerce?",
"options": {
"A": "Banking",
"B": "Insurance",
"C": "Advertising",
"D": "Warehousing"
},
"answer": "C"
}
]
```
"""
        ]
    ],
)
# Interface around convert(): one uploaded file in, the converted PDF out.
pdf_converter = gr.Interface(
    title="File to PDF Converter",
    description="Upload a file in .docx, .pdf, .html, .pptx, .csv, .xml, or .md format, and get it converted to PDF.",
    fn=convert,
    inputs=gr.File(label="Upload your file"),
    outputs=gr.File(label="Converted PDF"),
    api_name="convert_to_pdf",
)
# Tab label paired with the interface it shows, in display order. Keeping the
# pairs together avoids the two lists drifting out of step.
_TABS = [
    ("PDF to Image", pdf_to_img),
    ("Extract PDF Text", pdf_to_text),
    ("Extract DOC Text", doc_to_text),
    ("Extract DOCX Text", docx_to_text),
    ("Extract PPT Text", ppt_to_text),
    ("Extract PPTX Text", pptx_to_text),
    ("Extract text from URL", url_parser),
    ("Extract Json", str_to_json),
    ("Parse MCQs", mcqs_to_json),
    ("Convert to PDF", pdf_converter),
]

# Single tabbed app combining every interface defined above.
demo = gr.TabbedInterface(
    [interface for _, interface in _TABS],
    [label for label, _ in _TABS],
)
# Serve the app on all IPv4 interfaces, port 7860, with debug output enabled.
# The host must be the literal "0.0.0.0"; a trailing dot ("0.0.0.0.") is not a
# valid IPv4 address.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
|