import os

import gradio as gr
import pandas as pd
from lexoid.api import parse

parser_options = ["LLM_PARSE", "STATIC_PARSE", "AUTO"]


def run_parser(
    file,
    parser_type,
    model,
    pages_per_split,
    max_processes,
    as_pdf,
    x_tolerance,
    y_tolerance,
    save_dir,
    page_nums,
    router_priority,
    framework,
    temperature,
    depth,
    google_api_key,
    openai_api_key,
    huggingfacehub_api_token,
    together_api_key,
    openrouter_api_key,
):
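    """Parse the uploaded document with Lexoid using the options selected in the UI."""
    # Expose the provided API keys to downstream LLM calls via environment variables.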
    os.environ["GOOGLE_API_KEY"] = google_api_key
    os.environ["OPENAI_API_KEY"] = openai_api_key
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingfacehub_api_token
    os.environ["TOGETHER_API_KEY"] = together_api_key
    os.environ["OPENROUTER_API_KEY"] = openrouter_api_key

    if file is None:
        return "Please upload a file to parse."
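
    # Collect parse() keyword arguments; None values are dropped below so Lexoid defaults apply.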
    kwargs = {
        "model": model,
        "pages_per_split": pages_per_split,
        "max_processes": max_processes,
        "as_pdf": as_pdf,
        "x_tolerance": x_tolerance,
        "y_tolerance": y_tolerance,
        "save_dir": save_dir,
        "page_nums": (
            [int(num.strip()) for num in page_nums.split(",")] if page_nums else None
        ),
        "router_priority": router_priority,
        "framework": framework,
        "temperature": temperature,
        "depth": depth,
    }

    kwargs = {k: v for k, v in kwargs.items() if v is not None}
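
    # gr.File(type="filepath") passes the upload to this callback as a path string.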
    result = parse(path=file, parser_type=parser_type, **kwargs)

    if "raw" in result:
        return result["raw"]
    elif "segments" in result:
        return "\n\n".join([seg.get("content", "") for seg in result["segments"]])
    else:
        return str(result)
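

# Build the Gradio interface.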
with gr.Blocks(title="Lexoid Document Parser") as app:
    gr.Markdown(
        "## Lexoid Document Parser\nUpload a document and customize how you'd like to parse it."
    )
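
    # Primary inputs: document upload, parser type, LLM model, and static parsing framework.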
    with gr.Row():
        file_input = gr.File(
            label="Upload Document",
            file_types=[".pdf", ".docx", ".html", ".txt"],
            type="filepath",
        )
        parser_type = gr.Dropdown(
            choices=parser_options, value="AUTO", label="Parser Type"
        )
        model_input = gr.Textbox(value="gemini-2.0-flash", label="LLM ID")
        framework = gr.Textbox(
            value="pdfplumber",
            label="Static Framework",
            placeholder="e.g., pdfplumber, slate",
        )
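
    # Less frequently used parsing options, collapsed by default.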
    with gr.Accordion("Advanced Options", open=False):
        pages_per_split = gr.Slider(
            minimum=1, maximum=20, value=4, step=1, label="Pages per Split"
        )
        max_processes = gr.Slider(
            minimum=1, maximum=16, value=4, step=1, label="Max Parallel Processes"
        )
        as_pdf = gr.Checkbox(label="Convert to PDF before parsing")
        x_tolerance = gr.Number(label="X-axis Tolerance", value=None)
        y_tolerance = gr.Number(label="Y-axis Tolerance", value=None)
        save_dir = gr.Textbox(
            label="Save Directory",
            placeholder="Path to save intermediate files (optional)",
        )
        page_nums = gr.Textbox(
            label="Page Numbers",
            placeholder="Comma-separated page numbers (e.g., 1,3,5)",
        )
        router_priority = gr.Dropdown(
            choices=["speed", "accuracy"], value="accuracy", label="Router Priority"
        )
        temperature = gr.Number(label="LLM Temperature", value=None)
        depth = gr.Number(label="Recursive Depth", value=None)
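
    # Provider API keys; run_parser exports these as environment variables before parsing.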
    with gr.Row():
        google_api_key = gr.Textbox(
            label="Google API Key", placeholder="Enter Google API Key"
        )
        openai_api_key = gr.Textbox(
            label="OpenAI API Key", placeholder="Enter OpenAI API Key"
        )
        huggingfacehub_api_token = gr.Textbox(
            label="HuggingFaceHub API Token",
            placeholder="Enter HuggingFaceHub API Token",
        )
        together_api_key = gr.Textbox(
            label="Together API Key", placeholder="Enter Together API Key"
        )
        openrouter_api_key = gr.Textbox(
            label="OpenRouter API Key", placeholder="Enter OpenRouter API Key"
        )
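
    # Display the parsed Markdown and wire the button to run_parser.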
    output = gr.Markdown(label="Parsed Output")

    parse_button = gr.Button("Parse Document")
    parse_button.click(
        fn=run_parser,
        inputs=[
            file_input,
            parser_type,
            model_input,
            pages_per_split,
            max_processes,
            as_pdf,
            x_tolerance,
            y_tolerance,
            save_dir,
            page_nums,
            router_priority,
            framework,
            temperature,
            depth,
            google_api_key,
            openai_api_key,
            huggingfacehub_api_token,
            together_api_key,
            openrouter_api_key,
        ],
        outputs=output,
    )
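
    # Show benchmark results loaded from leaderboard.csv in the working directory.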
    df = pd.read_csv("leaderboard.csv")
    leaderboard = gr.Dataframe(
        value=df,
        label="Leaderboard",
    )

app.launch()