# Source: app.py, revision 91a9da3 ("Update app.py"), committed by arjunanand13 (verified).
import gradio as gr
import json
import asyncio
import os
from main import StructuredExtractionSystem
import time
# Read the OpenAI credential once at import time and refuse to start without
# it, so a missing key fails at startup rather than on the first request.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("OPENAI_API_KEY environment variable is required")

# Single shared extraction backend used by every request handler below.
system = StructuredExtractionSystem(api_key)
# Review flags that keep an otherwise complete result from being "Success".
_BLOCKING_FLAGS = ("incomplete_extraction", "low_quality", "schema_violations")


def _is_filled(value):
    """Return True when an extracted value is neither None nor an empty string."""
    return value is not None and value != ""


def _summarize_result(result, schema):
    """Build the analysis text and the status label for one extraction result.

    Args:
        result: Mapping returned by the extraction system. Must contain
            "data" and "extraction_metadata"; "review_flags" is optional.
        schema: The parsed JSON schema (a dict) the extraction targeted.

    Returns:
        A ``(analysis, status)`` tuple of strings.

    Raises:
        KeyError: If ``result`` or its metadata lacks a required key.
    """
    data = result["data"]
    metadata = result["extraction_metadata"]

    properties = schema.get("properties")
    if not isinstance(properties, dict):
        properties = {}
    total_expected = len(properties)

    # Count only fields the schema declares so the ratio can never exceed
    # 100%. When the schema declares no properties, fall back to counting
    # every returned key so the summary still shows what came back.
    counted_fields = properties if properties else data
    extracted_count = sum(1 for name in counted_fields if _is_filled(data.get(name)))
    completeness = extracted_count / total_expected if total_expected > 0 else 0

    analysis = "\n".join([
        f"Fields Extracted: {extracted_count}/{total_expected} ({completeness:.1%})",
        f"Complexity Tier: {metadata['complexity_tier']}",
        f"Processing Stages: {metadata['stages_executed']}",
        f"Processing Time: {metadata['actual_processing_time']:.2f}s",
        f"Schema Compliance: {metadata['schema_compliance']:.1%}",
    ])

    # Tolerate a missing or null flag list.
    flags = result.get("review_flags") or []
    blocked = any(flag in _BLOCKING_FLAGS for flag in flags)
    if completeness >= 0.8 and not blocked:
        status = "Success"
    elif completeness >= 0.5:
        status = "Partial Success"
    else:
        status = "Incomplete"

    if flags:
        analysis += f"\nIssues: {', '.join(map(str, flags))}"
    return analysis, status


async def extract_data(content, schema_text, progress=gr.Progress()):
    """Extract structured JSON from free text and summarize the outcome.

    Args:
        content: Unstructured document text to extract from.
        schema_text: JSON schema, as a string, describing the target structure.
        progress: Gradio progress reporter used for the three progress updates.

    Returns:
        A ``(extracted_json, analysis, status)`` tuple of strings. On failure
        ``extracted_json`` is empty, ``analysis`` carries the error message and
        ``status`` is "Schema Error" or "Error". Nothing is raised to the caller.
    """
    try:
        progress(0.1, desc="Parsing schema...")
        schema = json.loads(schema_text)
        # Reject valid JSON that is not an object before the extraction call;
        # otherwise the failure only surfaces afterwards as an AttributeError.
        if not isinstance(schema, dict):
            return "", "Invalid JSON Schema: top-level value must be a JSON object", "Schema Error"
        # Nothing to extract from blank input; skip the backend call.
        if not content or not content.strip():
            return "", "Extraction Error: document content is empty", "Error"

        progress(0.5, desc="Processing...")
        result = await system.extract_structured_data(content, schema)

        # ensure_ascii=False keeps non-ASCII text readable in the output box.
        extracted_data = json.dumps(result["data"], indent=2, ensure_ascii=False)
        analysis, status = _summarize_result(result, schema)

        progress(1.0, desc="Complete")
        return extracted_data, analysis, status
    except json.JSONDecodeError as e:
        return "", f"Invalid JSON Schema: {str(e)}", "Schema Error"
    except Exception as e:
        # UI boundary: report any backend failure in the status panel instead
        # of letting the request crash.
        return "", f"Extraction Error: {str(e)}", "Error"
def extract_wrapper(content, schema_text, progress=gr.Progress()):
    """Synchronous entry point that runs ``extract_data`` to completion.

    This function is what the Extract button calls. Gradio's documented way
    to report progress is a handler parameter whose default is a
    ``gr.Progress`` instance, so the parameter is declared here and forwarded;
    otherwise ``extract_data`` would report to its own unattached default.

    Args:
        content: Unstructured document text.
        schema_text: JSON schema as a string.
        progress: Progress reporter passed through to ``extract_data``.

    Returns:
        The ``(extracted_json, analysis, status)`` tuple from ``extract_data``.
    """
    return asyncio.run(extract_data(content, schema_text, progress=progress))
# Example JSON Schema for GitHub Action metadata (name, description, author,
# inputs, outputs, runs, branding). Used as the initial value of the schema
# textbox and paired with github_content in the test cases.
github_schema = """{
"type": "object",
"properties": {
"name": {"type": "string"},
"description": {"type": "string"},
"author": {"type": "string"},
"inputs": {
"type": "object",
"patternProperties": {
"^[_a-zA-Z][a-zA-Z0-9_-]*$": {
"type": "object",
"properties": {
"description": {"type": "string"},
"required": {"type": "boolean"},
"default": {"type": "string"}
}
}
}
},
"outputs": {
"type": "object",
"patternProperties": {
"^[_a-zA-Z][a-zA-Z0-9_-]*$": {
"type": "object",
"properties": {
"description": {"type": "string"},
"value": {"type": "string"}
}
}
}
},
"runs": {
"type": "object",
"properties": {
"using": {"type": "string"},
"steps": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {"type": "string"},
"uses": {"type": "string"},
"run": {"type": "string"},
"shell": {"type": "string"}
}
}
}
}
},
"branding": {
"type": "object",
"properties": {
"color": {"type": "string"},
"icon": {"type": "string"}
}
}
},
"required": ["name", "description", "runs"]
}"""
# Free-form request describing a composite GitHub Action ("MkDocs Publisher");
# the unstructured input paired with github_schema in the test cases.
github_content = """MkDocs Publisher Action
I keep repeating the steps to build and deploy our MkDocs documentation sites to GitHub Pages across different repos. Let's create a reusable composite action to handle this.
Action Name: MkDocs Publisher
Purpose: A simple action to build an MkDocs site and push it to the gh-pages branch. Should be easy to use. Author should be listed as 'DevRel Team'.
Inputs Needed:
python-version: We need to specify the Python version for setting up the environment. Users should be able to choose. Let's make this optional and default it to 3.11. Description: 'The version of Python to set up for building.'
requirements-file: Users might have different requirements files (e.g., requirements.txt, docs/requirements.txt). This input should specify the path. It's required. Description: 'Path to the Python requirements file'.
gh-token: The GitHub token for pushing to gh-pages. This is absolutely required. Description: 'GitHub token for deployment.'
Outputs:
The action needs to output the URL where the site was deployed. Let's call this output page-url. Its description should be 'The URL of the deployed GitHub Pages site.'
How it Runs:
This will be a composite action (using: composite). Here are the steps involved:
Checkout Code: First, we need the repository code. Use the standard actions/checkout@v4.
Setup Python: Next, set up the Python environment. Use actions/setup-python@v5.
Install Dependencies: Run a command to install the Python packages. The command is pip install -r requirements.txt. Execute this using the bash shell.
Build Site: Run the command mkdocs build. Use bash for this too.
Deploy to Pages: Use peaceiris/actions-gh-pages@v3 for deployment.
Branding: For the marketplace look, let's use the color blue and the book-open icon."""
# Example JSON Schema for a resume: "basics" (contact details, location,
# profiles) plus "work", "education" and "skills" arrays. Paired with
# resume_content in the test cases.
resume_schema = """{
"type": "object",
"properties": {
"basics": {
"type": "object",
"properties": {
"name": {"type": "string"},
"label": {"type": "string"},
"email": {"type": "string"},
"phone": {"type": "string"},
"url": {"type": "string"},
"summary": {"type": "string"},
"location": {
"type": "object",
"properties": {
"address": {"type": "string"},
"postalCode": {"type": "string"},
"city": {"type": "string"},
"countryCode": {"type": "string"},
"region": {"type": "string"}
}
},
"profiles": {
"type": "array",
"items": {
"type": "object",
"properties": {
"network": {"type": "string"},
"username": {"type": "string"},
"url": {"type": "string"}
}
}
}
}
},
"work": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {"type": "string"},
"location": {"type": "string"},
"position": {"type": "string"},
"startDate": {"type": "string"},
"endDate": {"type": "string"},
"highlights": {
"type": "array",
"items": {"type": "string"}
}
}
}
},
"education": {
"type": "array",
"items": {
"type": "object",
"properties": {
"institution": {"type": "string"},
"area": {"type": "string"},
"studyType": {"type": "string"},
"startDate": {"type": "string"},
"endDate": {"type": "string"}
}
}
},
"skills": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {"type": "string"},
"keywords": {
"type": "array",
"items": {"type": "string"}
}
}
}
}
}
}"""
# Plain-text resume used as the unstructured input paired with resume_schema
# in the test cases. The email line carries a real-looking address on a
# reserved example domain so the schema's "email" field has something to
# extract (it previously held an "[email protected]" scrape placeholder).
resume_content = """John Doe
Software Engineer
Email: john.doe@example.com
Phone: +1-555-0123
Address: 123 Main St, San Francisco, CA 94105, US
Website: https://johndoe.dev
PROFESSIONAL SUMMARY
Experienced software engineer with 5+ years developing web applications and distributed systems. Skilled in full-stack development with expertise in Python, JavaScript, and cloud technologies.
WORK EXPERIENCE
Senior Software Engineer | TechCorp Inc | San Francisco, CA | 2022 - Present
- Designed and implemented microservices architecture serving 1M+ users
- Led development of real-time data processing pipeline using Apache Kafka
- Reduced system latency by 40% through performance optimization
Software Engineer | StartupXYZ | Palo Alto, CA | 2020 - 2022
- Built responsive web applications using React and Node.js
- Implemented CI/CD pipelines resulting in 50% faster deployment cycles
- Collaborated with cross-functional teams on product development
EDUCATION
Bachelor of Science in Computer Science | Stanford University | 2016 - 2020
- Relevant Coursework: Data Structures, Algorithms, Database Systems
- Senior Project: Machine Learning Platform for Predictive Analytics
TECHNICAL SKILLS
Programming Languages: Python, JavaScript, Java, Go, SQL
Web Technologies: React, Node.js, HTML/CSS, REST APIs, GraphQL
Cloud & DevOps: AWS, Docker, Kubernetes, Jenkins, Git"""
# Example JSON Schema for citation metadata, paired with citation_content in
# the test cases.
# NOTE(review): key names such as "cff-version" and "preferred-citation"
# resemble the Citation File Format - confirm that is the intended target.
citation_schema = """{
"type": "object",
"properties": {
"cff-version": {"type": "string"},
"message": {"type": "string"},
"title": {"type": "string"},
"authors": {
"type": "array",
"items": {
"type": "object",
"properties": {
"given-names": {"type": "string"},
"family-names": {"type": "string"},
"affiliation": {"type": "string"},
"orcid": {"type": "string"}
}
}
},
"type": {"type": "string"},
"date-published": {"type": "string"},
"url": {"type": "string"},
"abstract": {"type": "string"},
"keywords": {
"type": "array",
"items": {"type": "string"}
},
"preferred-citation": {
"type": "object",
"properties": {
"type": {"type": "string"},
"title": {"type": "string"},
"authors": {
"type": "array",
"items": {
"type": "object",
"properties": {
"given-names": {"type": "string"},
"family-names": {"type": "string"}
}
}
},
"collection-title": {"type": "string"},
"volume": {"type": "integer"},
"year": {"type": "integer"},
"publisher": {
"type": "object",
"properties": {
"name": {"type": "string"}
}
}
}
}
}
}"""
# Free-text description of the paper "Attention Is All You Need"; the
# unstructured input paired with citation_schema in the test cases.
citation_content = """Title: Attention Is All You Need
Authors: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Łukasz Kaiser, Illia Polosukhin
This paper introduces the Transformer, a novel neural network architecture based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.
Published in: Advances in Neural Information Processing Systems, Volume 30, 2017
Publisher: Curran Associates, Inc.
URL: https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
The Transformer uses multi-head self-attention to draw global dependencies between input and output. The model achieves state-of-the-art results on machine translation tasks while being more parallelizable and requiring significantly less time to train.
Keywords: attention mechanism, transformer, neural networks, machine translation, self-attention
This work has become foundational for modern NLP models including BERT, GPT, and T5."""
# Example JSON Schema for a contract: type, parties, value, payment terms,
# duration, start date, deliverables and key terms. Paired with
# contract_content in the test cases.
contract_schema = """{
"type": "object",
"properties": {
"contract_type": {"type": "string"},
"parties": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {"type": "string"},
"type": {"type": "string"},
"address": {"type": "string"}
}
}
},
"contract_value": {"type": "string"},
"payment_terms": {"type": "string"},
"duration": {"type": "string"},
"start_date": {"type": "string"},
"deliverables": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {"type": "string"},
"deadline": {"type": "string"},
"description": {"type": "string"}
}
}
},
"key_terms": {
"type": "object",
"properties": {
"liability_cap": {"type": "string"},
"termination_notice": {"type": "string"},
"intellectual_property": {"type": "string"}
}
}
}
}"""
# Sample software development agreement text; the unstructured input paired
# with contract_schema in the test cases.
contract_content = """SOFTWARE DEVELOPMENT AGREEMENT
This Agreement is made between:
Client: TechCorp Inc., 123 Business Ave, San Francisco, CA 94105
Contractor: DevStudio LLC, 456 Developer St, Austin, TX 78701
Contract Value: $150,000
Payment Terms: 50% upfront, 50% upon completion
Duration: 6 months
Start Date: January 1, 2024
DELIVERABLES:
1. Web Application Development
- Complete e-commerce platform with user authentication
- Deadline: March 15, 2024
2. Mobile App Development
- iOS and Android applications
- Deadline: May 1, 2024
3. API Integration
- Third-party payment processing integration
- Deadline: April 15, 2024
KEY TERMS:
- Liability is capped at the total contract value
- Either party may terminate with 30 days written notice
- All intellectual property developed under this agreement belongs to the Client
- Contractor agrees to maintain confidentiality of all proprietary information"""
# Gradio UI: two input panes, action buttons, result panes, example test
# cases, a feature blurb, and the button wiring.
with gr.Blocks(theme=gr.themes.Default(), title="Unstructured to Structured Converter") as app:
    gr.Markdown("# Unstructured to Structured JSON Converter")
    gr.Markdown("Convert any unstructured text into structured JSON following complex schemas")

    # Inputs: document text on the left, target schema on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input Content")
            content_input = gr.Textbox(
                lines=12,
                max_lines=20,
                label="Document Content",
                placeholder="Enter your unstructured text here...",
            )
        with gr.Column():
            gr.Markdown("### JSON Schema")
            schema_input = gr.Textbox(
                value=github_schema,
                lines=12,
                max_lines=20,
                label="Target Schema",
                placeholder="Enter your JSON schema here...",
            )

    with gr.Row():
        extract_btn = gr.Button("Extract Data", variant="primary")
        clear_btn = gr.Button("Clear")

    # Outputs: extracted JSON (wide) next to the analysis and status panels.
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### Extracted Data")
            output_json = gr.Textbox(label="JSON Output", lines=15, show_copy_button=True)
        with gr.Column(scale=1):
            gr.Markdown("### Results")
            metadata_output = gr.Textbox(label="Analysis", lines=8)
            status_output = gr.Textbox(label="Status")

    # Clickable (content, schema) pairs that fill both input boxes.
    gr.Markdown("### Test Cases")
    gr.Examples(
        label="Select a test case:",
        inputs=[content_input, schema_input],
        examples=[
            [github_content, github_schema],
            [resume_content, resume_schema],
            [citation_content, citation_schema],
            [contract_content, contract_schema],
        ],
    )

    gr.Markdown(
        "\n"
        "### System Features\n"
        "- **Schema Complexity**: Supports 6+ levels nesting, 250+ fields, unlimited enums\n"
        "- **Document Size**: Handles 50+ page documents and 10MB+ files\n"
        "- **Dynamic Scaling**: Cost ranges from $0.01 to $5.00 based on complexity\n"
        "- **Quality Assurance**: Confidence scoring with human review routing\n"
    )

    # Extract: run the extraction and fill the three result boxes.
    extract_btn.click(
        extract_wrapper,
        inputs=[content_input, schema_input],
        outputs=[output_json, metadata_output, status_output],
    )
    # Clear: blank both inputs and all three result boxes (five values).
    clear_btn.click(
        lambda: ("",) * 5,
        outputs=[content_input, schema_input, output_json, metadata_output, status_output],
    )
# Start the Gradio server only when this file is run as a script, not when
# it is imported.
if __name__ == "__main__":
    app.launch()