|
import gradio as gr |
|
import json |
|
import asyncio |
|
import os |
|
from main import StructuredExtractionSystem |
|
import time |
|
|
|
# Fail fast at import time if the OpenAI credential is missing; the whole app
# is useless without it, so a clear startup error beats a runtime failure.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable is required")

# Module-level singleton: one extraction pipeline is shared by every Gradio
# request handled by this process.
system = StructuredExtractionSystem(api_key)
|
|
|
async def extract_data(content, schema_text, progress=gr.Progress()):
    """Run the structured-extraction pipeline over *content* for one request.

    Parameters
    ----------
    content : str
        Raw unstructured document text.
    schema_text : str
        Target JSON Schema as a JSON-encoded string.
    progress : gr.Progress
        Gradio progress tracker (injected by Gradio when wired directly).

    Returns
    -------
    tuple[str, str, str]
        (pretty-printed extracted JSON, human-readable analysis, status label).
        On failure the first element is empty and the other two describe the
        error, so the UI never sees an exception.
    """
    try:
        progress(0.1, desc="Parsing schema...")
        schema = json.loads(schema_text)

        progress(0.5, desc="Processing...")
        result = await system.extract_structured_data(content, schema)

        data = result["data"]
        meta = result["extraction_metadata"]
        pretty_json = json.dumps(data, indent=2)

        # Completeness = share of top-level schema properties that came back
        # with a non-empty value.
        expected = len(schema.get('properties', {}))
        filled = sum(1 for value in data.values() if value is not None and value != "")
        completeness = filled / expected if expected > 0 else 0

        analysis = f"""Fields Extracted: {filled}/{expected} ({completeness:.1%})

Complexity Tier: {meta['complexity_tier']}

Processing Stages: {meta['stages_executed']}

Processing Time: {meta['actual_processing_time']:.2f}s

Schema Compliance: {meta['schema_compliance']:.1%}"""

        flags = result.get("review_flags", [])

        # A run only counts as a full success when it is both mostly complete
        # and free of any blocking quality flag.
        blocking = {"incomplete_extraction", "low_quality", "schema_violations"}
        if completeness >= 0.8 and blocking.isdisjoint(flags):
            status = "Success"
        elif completeness >= 0.5:
            status = "Partial Success"
        else:
            status = "Incomplete"

        if flags:
            analysis += f"\nIssues: {', '.join(flags)}"

        progress(1.0, desc="Complete")
        return pretty_json, analysis, status

    except json.JSONDecodeError as e:
        # The schema textbox held invalid JSON; report it without extracting.
        return "", f"Invalid JSON Schema: {str(e)}", "Schema Error"
    except Exception as e:
        # UI boundary: surface any pipeline failure as a status message.
        return "", f"Extraction Error: {str(e)}", "Error"
|
|
|
def extract_wrapper(content, schema_text):
    """Synchronous facade over the async extractor.

    Builds the coroutine and drives it to completion on a fresh event loop;
    returns the same (json, analysis, status) triple as extract_data.
    """
    coroutine = extract_data(content, schema_text)
    return asyncio.run(coroutine)
|
|
|
# Example 1 — GitHub composite action (action.yml shape).
# Exercises patternProperties (dynamic input/output names), nested objects,
# arrays of step objects, and a top-level "required" list.
github_schema = """{
  "type": "object",
  "properties": {
    "name": {"type": "string"},
    "description": {"type": "string"},
    "author": {"type": "string"},
    "inputs": {
      "type": "object",
      "patternProperties": {
        "^[_a-zA-Z][a-zA-Z0-9_-]*$": {
          "type": "object",
          "properties": {
            "description": {"type": "string"},
            "required": {"type": "boolean"},
            "default": {"type": "string"}
          }
        }
      }
    },
    "outputs": {
      "type": "object",
      "patternProperties": {
        "^[_a-zA-Z][a-zA-Z0-9_-]*$": {
          "type": "object",
          "properties": {
            "description": {"type": "string"},
            "value": {"type": "string"}
          }
        }
      }
    },
    "runs": {
      "type": "object",
      "properties": {
        "using": {"type": "string"},
        "steps": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "name": {"type": "string"},
              "uses": {"type": "string"},
              "run": {"type": "string"},
              "shell": {"type": "string"}
            }
          }
        }
      }
    },
    "branding": {
      "type": "object",
      "properties": {
        "color": {"type": "string"},
        "icon": {"type": "string"}
      }
    }
  },
  "required": ["name", "description", "runs"]
}"""

# Matching unstructured input: a free-form feature request describing the
# action the schema above should be extracted into.
github_content = """MkDocs Publisher Action

I keep repeating the steps to build and deploy our MkDocs documentation sites to GitHub Pages across different repos. Let's create a reusable composite action to handle this.

Action Name: MkDocs Publisher
Purpose: A simple action to build an MkDocs site and push it to the gh-pages branch. Should be easy to use. Author should be listed as 'DevRel Team'.

Inputs Needed:
python-version: We need to specify the Python version for setting up the environment. Users should be able to choose. Let's make this optional and default it to 3.11. Description: 'The version of Python to set up for building.'
requirements-file: Users might have different requirements files (e.g., requirements.txt, docs/requirements.txt). This input should specify the path. It's required. Description: 'Path to the Python requirements file'.
gh-token: The GitHub token for pushing to gh-pages. This is absolutely required. Description: 'GitHub token for deployment.'

Outputs:
The action needs to output the URL where the site was deployed. Let's call this output page-url. Its description should be 'The URL of the deployed GitHub Pages site.'

How it Runs:
This will be a composite action (using: composite). Here are the steps involved:
Checkout Code: First, we need the repository code. Use the standard actions/checkout@v4.
Setup Python: Next, set up the Python environment. Use actions/setup-python@v5.
Install Dependencies: Run a command to install the Python packages. The command is pip install -r requirements.txt. Execute this using the bash shell.
Build Site: Run the command mkdocs build. Use bash for this too.
Deploy to Pages: Use peaceiris/actions-gh-pages@v3 for deployment.

Branding: For the marketplace look, let's use the color blue and the book-open icon."""
|
|
|
# Example 2 — JSON Resume-style schema.
# Exercises deep nesting (basics.location, basics.profiles) plus arrays of
# objects for work history, education, and skills.
resume_schema = """{
  "type": "object",
  "properties": {
    "basics": {
      "type": "object",
      "properties": {
        "name": {"type": "string"},
        "label": {"type": "string"},
        "email": {"type": "string"},
        "phone": {"type": "string"},
        "url": {"type": "string"},
        "summary": {"type": "string"},
        "location": {
          "type": "object",
          "properties": {
            "address": {"type": "string"},
            "postalCode": {"type": "string"},
            "city": {"type": "string"},
            "countryCode": {"type": "string"},
            "region": {"type": "string"}
          }
        },
        "profiles": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "network": {"type": "string"},
              "username": {"type": "string"},
              "url": {"type": "string"}
            }
          }
        }
      }
    },
    "work": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "location": {"type": "string"},
          "position": {"type": "string"},
          "startDate": {"type": "string"},
          "endDate": {"type": "string"},
          "highlights": {
            "type": "array",
            "items": {"type": "string"}
          }
        }
      }
    },
    "education": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "institution": {"type": "string"},
          "area": {"type": "string"},
          "studyType": {"type": "string"},
          "startDate": {"type": "string"},
          "endDate": {"type": "string"}
        }
      }
    },
    "skills": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "keywords": {
            "type": "array",
            "items": {"type": "string"}
          }
        }
      }
    }
  }
}"""

# Matching unstructured input: a plain-text resume to be extracted into the
# schema above.
resume_content = """John Doe
Software Engineer
Email: [email protected]
Phone: +1-555-0123
Address: 123 Main St, San Francisco, CA 94105, US
Website: https://johndoe.dev

PROFESSIONAL SUMMARY
Experienced software engineer with 5+ years developing web applications and distributed systems. Skilled in full-stack development with expertise in Python, JavaScript, and cloud technologies.

WORK EXPERIENCE

Senior Software Engineer | TechCorp Inc | San Francisco, CA | 2022 - Present
- Designed and implemented microservices architecture serving 1M+ users
- Led development of real-time data processing pipeline using Apache Kafka
- Reduced system latency by 40% through performance optimization

Software Engineer | StartupXYZ | Palo Alto, CA | 2020 - 2022
- Built responsive web applications using React and Node.js
- Implemented CI/CD pipelines resulting in 50% faster deployment cycles
- Collaborated with cross-functional teams on product development

EDUCATION

Bachelor of Science in Computer Science | Stanford University | 2016 - 2020
- Relevant Coursework: Data Structures, Algorithms, Database Systems
- Senior Project: Machine Learning Platform for Predictive Analytics

TECHNICAL SKILLS

Programming Languages: Python, JavaScript, Java, Go, SQL
Web Technologies: React, Node.js, HTML/CSS, REST APIs, GraphQL
Cloud & DevOps: AWS, Docker, Kubernetes, Jenkins, Git"""
|
|
|
# Example 3 — Citation File Format (CFF)-style schema.
# Exercises hyphenated property names, integer-typed fields, and a nested
# "preferred-citation" object with its own author array.
citation_schema = """{
  "type": "object",
  "properties": {
    "cff-version": {"type": "string"},
    "message": {"type": "string"},
    "title": {"type": "string"},
    "authors": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "given-names": {"type": "string"},
          "family-names": {"type": "string"},
          "affiliation": {"type": "string"},
          "orcid": {"type": "string"}
        }
      }
    },
    "type": {"type": "string"},
    "date-published": {"type": "string"},
    "url": {"type": "string"},
    "abstract": {"type": "string"},
    "keywords": {
      "type": "array",
      "items": {"type": "string"}
    },
    "preferred-citation": {
      "type": "object",
      "properties": {
        "type": {"type": "string"},
        "title": {"type": "string"},
        "authors": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "given-names": {"type": "string"},
              "family-names": {"type": "string"}
            }
          }
        },
        "collection-title": {"type": "string"},
        "volume": {"type": "integer"},
        "year": {"type": "integer"},
        "publisher": {
          "type": "object",
          "properties": {
            "name": {"type": "string"}
          }
        }
      }
    }
  }
}"""

# Matching unstructured input: paper metadata in free-form prose.
citation_content = """Title: Attention Is All You Need
Authors: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Łukasz Kaiser, Illia Polosukhin

This paper introduces the Transformer, a novel neural network architecture based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.

Published in: Advances in Neural Information Processing Systems, Volume 30, 2017
Publisher: Curran Associates, Inc.
URL: https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf

The Transformer uses multi-head self-attention to draw global dependencies between input and output. The model achieves state-of-the-art results on machine translation tasks while being more parallelizable and requiring significantly less time to train.

Keywords: attention mechanism, transformer, neural networks, machine translation, self-attention

This work has become foundational for modern NLP models including BERT, GPT, and T5."""
|
|
|
# Example 4 — legal-contract extraction schema.
# Exercises an array of party objects, an array of deliverables with
# deadlines, and a nested key_terms object of free-text clauses.
contract_schema = """{
  "type": "object",
  "properties": {
    "contract_type": {"type": "string"},
    "parties": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "type": {"type": "string"},
          "address": {"type": "string"}
        }
      }
    },
    "contract_value": {"type": "string"},
    "payment_terms": {"type": "string"},
    "duration": {"type": "string"},
    "start_date": {"type": "string"},
    "deliverables": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "deadline": {"type": "string"},
          "description": {"type": "string"}
        }
      }
    },
    "key_terms": {
      "type": "object",
      "properties": {
        "liability_cap": {"type": "string"},
        "termination_notice": {"type": "string"},
        "intellectual_property": {"type": "string"}
      }
    }
  }
}"""

# Matching unstructured input: a plain-text services agreement.
contract_content = """SOFTWARE DEVELOPMENT AGREEMENT

This Agreement is made between:
Client: TechCorp Inc., 123 Business Ave, San Francisco, CA 94105
Contractor: DevStudio LLC, 456 Developer St, Austin, TX 78701

Contract Value: $150,000
Payment Terms: 50% upfront, 50% upon completion
Duration: 6 months
Start Date: January 1, 2024

DELIVERABLES:
1. Web Application Development
- Complete e-commerce platform with user authentication
- Deadline: March 15, 2024

2. Mobile App Development
- iOS and Android applications
- Deadline: May 1, 2024

3. API Integration
- Third-party payment processing integration
- Deadline: April 15, 2024

KEY TERMS:
- Liability is capped at the total contract value
- Either party may terminate with 30 days written notice
- All intellectual property developed under this agreement belongs to the Client
- Contractor agrees to maintain confidentiality of all proprietary information"""
|
|
|
# Assemble the Gradio UI: paired input textboxes, action buttons, a results
# pane, canned example pairs for each document type, and event wiring.
with gr.Blocks(title="Unstructured to Structured Converter", theme=gr.themes.Default()) as app:
    gr.Markdown("# Unstructured to Structured JSON Converter")
    gr.Markdown("Convert any unstructured text into structured JSON following complex schemas")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input Content")
            content_input = gr.Textbox(
                label="Document Content",
                placeholder="Enter your unstructured text here...",
                lines=12,
                max_lines=20
            )

        with gr.Column():
            gr.Markdown("### JSON Schema")
            schema_input = gr.Textbox(
                label="Target Schema",
                placeholder="Enter your JSON schema here...",
                lines=12,
                max_lines=20,
                value=github_schema  # pre-fill with the GitHub-action example
            )

    with gr.Row():
        extract_btn = gr.Button("Extract Data", variant="primary")
        clear_btn = gr.Button("Clear")

    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### Extracted Data")
            output_json = gr.Textbox(
                label="JSON Output",
                lines=15,
                show_copy_button=True
            )

        with gr.Column(scale=1):
            gr.Markdown("### Results")
            metadata_output = gr.Textbox(label="Analysis", lines=8)
            status_output = gr.Textbox(label="Status")

    gr.Markdown("### Test Cases")
    gr.Examples(
        examples=[
            [github_content, github_schema],
            [resume_content, resume_schema],
            [citation_content, citation_schema],
            [contract_content, contract_schema]
        ],
        inputs=[content_input, schema_input],
        label="Select a test case:"
    )

    gr.Markdown("""
    ### System Features
    - **Schema Complexity**: Supports 6+ levels nesting, 250+ fields, unlimited enums
    - **Document Size**: Handles 50+ page documents and 10MB+ files
    - **Dynamic Scaling**: Cost ranges from $0.01 to $5.00 based on complexity
    - **Quality Assurance**: Confidence scoring with human review routing
    """)

    # Fix: wire the async handler directly instead of the sync extract_wrapper.
    # Gradio awaits async event handlers natively, and only when it calls
    # extract_data itself can it inject and track the gr.Progress() parameter —
    # routing through the wrapper silently disabled the progress bar.
    extract_btn.click(
        fn=extract_data,
        inputs=[content_input, schema_input],
        outputs=[output_json, metadata_output, status_output]
    )

    # Reset both inputs and all three output fields in one shot.
    clear_btn.click(
        lambda: ("", "", "", "", ""),
        outputs=[content_input, schema_input, output_json, metadata_output, status_output]
    )

if __name__ == "__main__":
    app.launch()