# Hugging Face Space: not-lain/utils (status: Running)
# File size: 6,709 Bytes

import json
import os
import warnings
from typing import List

import gradio as gr
from pdfitdown.pdfconversion import convert_to_pdf, convert_markdown_to_pdf

from base_utils import (
    convert_pdf_to_image,
    extract_text_from_pdf,
    convert_doc_to_text,
    extract_text_from_docx,
    extract_text_from_ppt,
    extract_text_from_pptx,
    sanitize_list_of_lists,
    parse_url,
)

# File-based extraction tabs: each interface takes one uploaded file, hands it
# to the matching helper from ``base_utils`` and is exposed under ``api_name``.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)

pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

doc_to_text = gr.Interface(
    fn=convert_doc_to_text,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="doc_to_text",
)

docx_to_text = gr.Interface(
    fn=extract_text_from_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="docx_to_text",
)

ppt_to_text = gr.Interface(
    fn=extract_text_from_ppt,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="ppt_to_text",
)

pptx_to_text = gr.Interface(
    fn=extract_text_from_pptx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="pptx_to_text",
)
# Text-in / JSON-out tab backed by ``sanitize_list_of_lists``; the single
# example is a JSON-style list of [question, answer] pairs.
str_to_json = gr.Interface(
    fn=sanitize_list_of_lists,
    inputs=gr.Text(),
    outputs=gr.JSON(),
    api_name="str_to_json",
    examples=[
        """[
  ["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
  ["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
  ["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
  ["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
  ["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
  ["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
  ["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
  ["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
]"""
    ],
)

# Text-in / text-out tab that passes the entered value to ``parse_url``;
# published on the API as "url_to_text".
url_parser = gr.Interface(
    fn=parse_url,
    inputs=["text"],
    outputs=["text"],
    api_name="url_to_text",
)


class FileNotConvertedWarning(Warning):
    """Warning category for files skipped because their format cannot be converted to PDF."""


# Extensions routed to ``convert_to_pdf``; ".md" uses ``convert_markdown_to_pdf``
# and ".pdf" is passed through untouched.
_CONVERTIBLE_EXTENSIONS = frozenset({".docx", ".html", ".pptx", ".csv", ".xml"})


def to_pdf(files: List[str]) -> List[str]:
    """Convert every supported file in ``files`` to PDF.

    Files ending in ``.pdf`` are returned as-is. ``.docx``, ``.html``,
    ``.pptx``, ``.csv`` and ``.xml`` files go through ``convert_to_pdf`` and
    ``.md`` files through ``convert_markdown_to_pdf``. Extensions are matched
    case-insensitively. Any other file is skipped with a
    ``FileNotConvertedWarning``.

    Args:
        files: Paths of the files to convert.

    Returns:
        One entry per accepted file, in input order: the original path for
        PDFs, otherwise whatever the converter returned (presumably the path
        of the generated PDF -- confirm against pdfitdown).
    """
    pdfs = []
    for f in files:
        # splitext only inspects the final suffix, so dots elsewhere in the
        # path (e.g. "./report.docx" or "/tmp/v1.2/report.docx") can no longer
        # corrupt the output path or the name derived from it.
        stem, extension = os.path.splitext(f)
        extension = extension.lower()

        if extension == ".pdf":
            pdfs.append(f)
            continue

        if extension == ".md":
            converter = convert_markdown_to_pdf
        elif extension in _CONVERTIBLE_EXTENSIONS:
            converter = convert_to_pdf
        else:
            warnings.warn(
                f"File {f} was not converted to PDF because its file format is not included in those that can be converted",
                FileNotConvertedWarning,
            )
            continue

        # Third argument is the path without its extension, as before
        # (presumably used by pdfitdown as the document title -- confirm).
        pdfs.append(converter(f, stem + ".pdf", stem))
    return pdfs


def convert(file: str) -> List[str]:
    """Convert a single file to PDF by delegating to ``to_pdf``.

    Args:
        file: Path of the file to convert.

    Returns:
        The list produced by ``to_pdf`` for this one file: a single entry when
        the file was accepted, or an empty list when its format is unsupported.
    """
    return to_pdf([file])


def parse_MCQs(mcq_string: str) -> list:
    """Extract and decode the JSON array embedded in ``mcq_string``.

    Everything before the first "[" and everything from the last "]" onward is
    discarded, so surrounding text such as Markdown code fences is ignored.

    Args:
        mcq_string: Text containing a JSON array.

    Returns:
        The decoded JSON array.

    Raises:
        ValueError: If ``mcq_string`` contains no "[" at all, or (as
            ``json.JSONDecodeError``, a ``ValueError`` subclass) if the
            extracted text is not valid JSON.
    """
    _, opening, remainder = mcq_string.partition("[")
    if not opening:
        # Previously surfaced as an opaque IndexError from ``split(...)[1]``.
        raise ValueError("No JSON array found in input: missing '['")
    # Cut at the last "]"; when there is none the remainder is kept whole and
    # a closing bracket is still appended, matching the earlier behaviour.
    body = remainder.rsplit("]", 1)[0]
    return json.loads("[" + body + "]")


# Textbox-in / JSON-out tab around ``parse_MCQs``. The example is a fenced
# JSON block of multiple-choice questions; example caching is turned off.
mcqs_to_json = gr.Interface(
    fn=parse_MCQs,
    inputs=gr.Textbox(),
    outputs=gr.JSON(),
    api_name="mcqs_to_json",
    examples=[
        [
            """```json
        [
  {
    "question": "Which of the following best describes the nature of business?",
    "options": {
      "A": "It is primarily a non-economic activity",
      "B": "It involves personal consumption of goods",
      "C": "It includes regular and continuous transactions for profit",
      "D": "It excludes exchange of goods and services"
    },
    "answer": "C"
  },
  {
    "question": "According to the document, what is a primary objective of business under economic objectives?",
    "options": {
      "A": "Employee welfare",
      "B": "Profit earning",
      "C": "Creating entertainment content",
      "D": "Reducing government involvement"
    },
    "answer": "B"
  },
  {
    "question": "Which of the following is a component of commerce?",
    "options": {
      "A": "Mining",
      "B": "Manufacturing",
      "C": "Warehousing",
      "D": "Farming"
    },
    "answer": "C"
  },
  {
    "question": "What is an example of a synthetic manufacturing industry?",
    "options": {
      "A": "Oil refining",
      "B": "Textile processing",
      "C": "Soap production",
      "D": "Watch assembly"
    },
    "answer": "C"
  },
  {
    "question": "Which aid to trade helps in overcoming the hindrance of knowledge in commerce?",
    "options": {
      "A": "Banking",
      "B": "Insurance",
      "C": "Advertising",
      "D": "Warehousing"
    },
    "answer": "C"
  }
]
```
"""
        ]
    ],
    cache_examples=False,
)

# Upload-a-file tab: ``convert`` receives the uploaded file and its result is
# shown in the output file component; exposed on the API as "convert_to_pdf".
pdf_converter = gr.Interface(
    fn=convert,
    title="File to PDF Converter",
    description="Upload a file in .docx, .pdf, .html, .pptx, .csv, .xml, or .md format, and get it converted to PDF.",
    inputs=gr.File(label="Upload your file"),
    outputs=gr.File(label="Converted PDF"),
    api_name="convert_to_pdf",
)

# Each tab label is kept next to the interface it names so the two cannot
# drift out of order; the list order is the tab order.
_TABS = [
    ("PDF to Image", pdf_to_img),
    ("Extract PDF Text", pdf_to_text),
    ("Extract DOC Text", doc_to_text),
    ("Extract DOCX Text", docx_to_text),
    ("Extract PPT Text", ppt_to_text),
    ("Extract PPTX Text", pptx_to_text),
    ("Extract text from URL", url_parser),
    ("Extract Json", str_to_json),
    ("Parse MCQs", mcqs_to_json),
    ("Convert to PDF", pdf_converter),
]

demo = gr.TabbedInterface(
    interface_list=[interface for _, interface in _TABS],
    tab_names=[label for label, _ in _TABS],
)

# Serve the tabbed app on port 7860. "0.0.0.0" is the all-interfaces IPv4
# address; the previous value "0.0.0.0." carried a stray trailing dot.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)