"""Gradio front end for turning unstructured text into schema-shaped JSON.

The UI collects a document and a JSON schema, passes both to
``StructuredExtractionSystem.extract_structured_data`` and shows the extracted
JSON together with a short analysis and a status label.
"""
import asyncio
import json
import logging
import os
import time

import gradio as gr

from main import StructuredExtractionSystem

logger = logging.getLogger(__name__)

# The extraction system is constructed with the API key at import time, so a
# missing key aborts start-up instead of failing on the first request.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable is required")

system = StructuredExtractionSystem(api_key)

# Review flags that prevent a run from being reported as a full "Success".
# Kept as a tuple so membership uses ``==`` exactly like the inline list did.
_BLOCKING_FLAGS = ("incomplete_extraction", "low_quality", "schema_violations")


def _build_report(schema, result):
    """Summarize an extraction result as ``(analysis_text, status_label)``.

    Args:
        schema: Parsed JSON schema (a dict); its top-level ``properties``
            define the fields that are expected.
        result: Mapping returned by the extraction call. ``data`` and
            ``extraction_metadata`` are required; ``review_flags`` is optional.

    Raises:
        KeyError: If a required key is missing from ``result`` or its metadata.
    """
    data = result["data"]
    metadata = result["extraction_metadata"]

    # Count only fields the schema declares. Counting every key in ``data``
    # would let undeclared extras push the ratio above 100%.
    properties = schema.get("properties", {})
    total_expected = len(properties)
    extracted_count = 0
    for field_name in properties:
        value = data.get(field_name)
        if value is not None and value != "":
            extracted_count += 1
    completeness = extracted_count / total_expected if total_expected > 0 else 0

    analysis = (
        f"Fields Extracted: {extracted_count}/{total_expected} ({completeness:.1%})\n"
        f"Complexity Tier: {metadata['complexity_tier']}\n"
        f"Processing Stages: {metadata['stages_executed']}\n"
        f"Processing Time: {metadata['actual_processing_time']:.2f}s\n"
        f"Schema Compliance: {metadata['schema_compliance']:.1%}"
    )

    # A present-but-null ``review_flags`` is treated like a missing one.
    status_flags = result.get("review_flags") or []
    has_blocking_flag = any(flag in _BLOCKING_FLAGS for flag in status_flags)

    if completeness >= 0.8 and not has_blocking_flag:
        status = "Success"
    elif completeness >= 0.5:
        status = "Partial Success"
    else:
        status = "Incomplete"

    if status_flags:
        analysis += f"\nIssues: {', '.join(status_flags)}"

    return analysis, status


async def extract_data(content, schema_text, progress=gr.Progress()):
    """Run one extraction and return ``(json_text, analysis, status)``.

    Args:
        content: Unstructured document text.
        schema_text: JSON schema as a string; it must decode to a JSON object.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        A 3-tuple of strings for the output, analysis and status textboxes.
        Failures never raise; they are reported as an empty output plus an
        error message and a ``"Schema Error"`` or ``"Error"`` status.
    """
    try:
        progress(0.1, desc="Parsing schema...")
        schema = json.loads(schema_text)
        if not isinstance(schema, dict):
            # Valid JSON that is not an object (e.g. a list) cannot be used as
            # a schema; reject it before spending an extraction call on it.
            return (
                "",
                "Invalid JSON Schema: top-level value must be a JSON object",
                "Schema Error",
            )

        progress(0.5, desc="Processing...")
        result = await system.extract_structured_data(content, schema)

        # ensure_ascii=False keeps non-ASCII text readable in the output box
        # instead of showing \uXXXX escapes.
        extracted_data = json.dumps(result["data"], indent=2, ensure_ascii=False)
        analysis, status = _build_report(schema, result)

        progress(1.0, desc="Complete")
        return extracted_data, analysis, status

    except json.JSONDecodeError as e:
        return "", f"Invalid JSON Schema: {str(e)}", "Schema Error"
    except Exception as e:
        # UI boundary: show the failure to the user, but keep the traceback
        # in the logs so it can be diagnosed.
        logger.exception("Extraction failed")
        return "", f"Extraction Error: {str(e)}", "Error"


def extract_wrapper(content, schema_text, progress=gr.Progress()):
    """Synchronous entry point for the Gradio click handler.

    ``progress`` is declared here because this is the function registered on
    the event; it is forwarded so ``extract_data`` reports on the same tracker.
    """
    return asyncio.run(extract_data(content, schema_text, progress))


github_schema = """{
  "type": "object",
  "properties": {
    "name": {"type": "string"},
    "description": {"type": "string"},
    "author": {"type": "string"},
    "inputs": {
      "type": "object",
      "patternProperties": {
        "^[_a-zA-Z][a-zA-Z0-9_-]*$": {
          "type": "object",
          "properties": {
            "description": {"type": "string"},
            "required": {"type": "boolean"},
            "default": {"type": "string"}
          }
        }
      }
    },
    "outputs": {
      "type": "object",
      "patternProperties": {
        "^[_a-zA-Z][a-zA-Z0-9_-]*$": {
          "type": "object",
          "properties": {
            "description": {"type": "string"},
            "value": {"type": "string"}
          }
        }
      }
    },
    "runs": {
      "type": "object",
      "properties": {
        "using": {"type": "string"},
        "steps": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "name": {"type": "string"},
              "uses": {"type": "string"},
              "run": {"type": "string"},
              "shell": {"type": "string"}
            }
          }
        }
      }
    },
    "branding": {
      "type": "object",
      "properties": {
        "color": {"type": "string"},
        "icon": {"type": "string"}
      }
    }
  },
  "required": ["name", "description", "runs"]
}"""

github_content = """MkDocs Publisher Action
I keep repeating the steps to build and deploy our MkDocs documentation sites to GitHub Pages across different repos. Let's create a reusable composite action to handle this.
Action Name: MkDocs Publisher
Purpose: A simple action to build an MkDocs site and push it to the gh-pages branch. Should be easy to use.
Author should be listed as 'DevRel Team'.
Inputs Needed:
python-version: We need to specify the Python version for setting up the environment. Users should be able to choose. Let's make this optional and default it to 3.11. Description: 'The version of Python to set up for building.'
requirements-file: Users might have different requirements files (e.g., requirements.txt, docs/requirements.txt). This input should specify the path. It's required. Description: 'Path to the Python requirements file'.
gh-token: The GitHub token for pushing to gh-pages. This is absolutely required. Description: 'GitHub token for deployment.'
Outputs:
The action needs to output the URL where the site was deployed. Let's call this output page-url. Its description should be 'The URL of the deployed GitHub Pages site.'
How it Runs:
This will be a composite action (using: composite). Here are the steps involved:
Checkout Code: First, we need the repository code. Use the standard actions/checkout@v4.
Setup Python: Next, set up the Python environment. Use actions/setup-python@v5.
Install Dependencies: Run a command to install the Python packages. The command is pip install -r requirements.txt. Execute this using the bash shell.
Build Site: Run the command mkdocs build. Use bash for this too.
Deploy to Pages: Use peaceiris/actions-gh-pages@v3 for deployment.
Branding:
For the marketplace look, let's use the color blue and the book-open icon."""

resume_schema = """{
  "type": "object",
  "properties": {
    "basics": {
      "type": "object",
      "properties": {
        "name": {"type": "string"},
        "label": {"type": "string"},
        "email": {"type": "string"},
        "phone": {"type": "string"},
        "url": {"type": "string"},
        "summary": {"type": "string"},
        "location": {
          "type": "object",
          "properties": {
            "address": {"type": "string"},
            "postalCode": {"type": "string"},
            "city": {"type": "string"},
            "countryCode": {"type": "string"},
            "region": {"type": "string"}
          }
        },
        "profiles": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "network": {"type": "string"},
              "username": {"type": "string"},
              "url": {"type": "string"}
            }
          }
        }
      }
    },
    "work": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "location": {"type": "string"},
          "position": {"type": "string"},
          "startDate": {"type": "string"},
          "endDate": {"type": "string"},
          "highlights": {
            "type": "array",
            "items": {"type": "string"}
          }
        }
      }
    },
    "education": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "institution": {"type": "string"},
          "area": {"type": "string"},
          "studyType": {"type": "string"},
          "startDate": {"type": "string"},
          "endDate": {"type": "string"}
        }
      }
    },
    "skills": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "keywords": {
            "type": "array",
            "items": {"type": "string"}
          }
        }
      }
    }
  }
}"""

resume_content = """John Doe
Software Engineer
Email: john.doe@email.com
Phone: +1-555-0123
Address: 123 Main St, San Francisco, CA 94105, US
Website: https://johndoe.dev
PROFESSIONAL SUMMARY
Experienced software engineer with 5+ years developing web applications and distributed systems. Skilled in full-stack development with expertise in Python, JavaScript, and cloud technologies.
WORK EXPERIENCE
Senior Software Engineer | TechCorp Inc | San Francisco, CA | 2022 - Present
- Designed and implemented microservices architecture serving 1M+ users
- Led development of real-time data processing pipeline using Apache Kafka
- Reduced system latency by 40% through performance optimization
Software Engineer | StartupXYZ | Palo Alto, CA | 2020 - 2022
- Built responsive web applications using React and Node.js
- Implemented CI/CD pipelines resulting in 50% faster deployment cycles
- Collaborated with cross-functional teams on product development
EDUCATION
Bachelor of Science in Computer Science | Stanford University | 2016 - 2020
- Relevant Coursework: Data Structures, Algorithms, Database Systems
- Senior Project: Machine Learning Platform for Predictive Analytics
TECHNICAL SKILLS
Programming Languages: Python, JavaScript, Java, Go, SQL
Web Technologies: React, Node.js, HTML/CSS, REST APIs, GraphQL
Cloud & DevOps: AWS, Docker, Kubernetes, Jenkins, Git"""

citation_schema = """{
  "type": "object",
  "properties": {
    "cff-version": {"type": "string"},
    "message": {"type": "string"},
    "title": {"type": "string"},
    "authors": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "given-names": {"type": "string"},
          "family-names": {"type": "string"},
          "affiliation": {"type": "string"},
          "orcid": {"type": "string"}
        }
      }
    },
    "type": {"type": "string"},
    "date-published": {"type": "string"},
    "url": {"type": "string"},
    "abstract": {"type": "string"},
    "keywords": {
      "type": "array",
      "items": {"type": "string"}
    },
    "preferred-citation": {
      "type": "object",
      "properties": {
        "type": {"type": "string"},
        "title": {"type": "string"},
        "authors": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "given-names": {"type": "string"},
              "family-names": {"type": "string"}
            }
          }
        },
        "collection-title": {"type": "string"},
        "volume": {"type": "integer"},
        "year": {"type": "integer"},
        "publisher": {
          "type": "object",
          "properties": {
            "name": {"type": "string"}
          }
        }
      }
    }
  }
}"""

citation_content = """Title: Attention Is All You Need
Authors: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Ɓukasz Kaiser, Illia Polosukhin
This paper introduces the Transformer, a novel neural network architecture based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.
Published in: Advances in Neural Information Processing Systems, Volume 30, 2017
Publisher: Curran Associates, Inc.
URL: https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
The Transformer uses multi-head self-attention to draw global dependencies between input and output. The model achieves state-of-the-art results on machine translation tasks while being more parallelizable and requiring significantly less time to train.
Keywords: attention mechanism, transformer, neural networks, machine translation, self-attention
This work has become foundational for modern NLP models including BERT, GPT, and T5."""

contract_schema = """{
  "type": "object",
  "properties": {
    "contract_type": {"type": "string"},
    "parties": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "type": {"type": "string"},
          "address": {"type": "string"}
        }
      }
    },
    "contract_value": {"type": "string"},
    "payment_terms": {"type": "string"},
    "duration": {"type": "string"},
    "start_date": {"type": "string"},
    "deliverables": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "deadline": {"type": "string"},
          "description": {"type": "string"}
        }
      }
    },
    "key_terms": {
      "type": "object",
      "properties": {
        "liability_cap": {"type": "string"},
        "termination_notice": {"type": "string"},
        "intellectual_property": {"type": "string"}
      }
    }
  }
}"""

contract_content = """SOFTWARE DEVELOPMENT AGREEMENT
This Agreement is made between:
Client: TechCorp Inc., 123 Business Ave, San Francisco, CA 94105
Contractor: DevStudio LLC, 456 Developer St, Austin, TX 78701
Contract Value: $150,000
Payment Terms: 50% upfront, 50% upon completion
Duration: 6 months
Start Date: January 1, 2024
DELIVERABLES:
1. Web Application Development
- Complete e-commerce platform with user authentication
- Deadline: March 15, 2024
2. Mobile App Development
- iOS and Android applications
- Deadline: May 1, 2024
3. API Integration
- Third-party payment processing integration
- Deadline: April 15, 2024
KEY TERMS:
- Liability is capped at the total contract value
- Either party may terminate with 30 days written notice
- All intellectual property developed under this agreement belongs to the Client
- Contractor agrees to maintain confidentiality of all proprietary information"""


with gr.Blocks(
    title="Unstructured to Structured Converter",
    theme=gr.themes.Default(),
) as app:
    gr.Markdown("# Unstructured to Structured JSON Converter")
    gr.Markdown(
        "Convert any unstructured text into structured JSON following complex schemas"
    )

    # Inputs: document text on the left, target schema on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input Content")
            content_input = gr.Textbox(
                label="Document Content",
                placeholder="Enter your unstructured text here...",
                lines=12,
                max_lines=20,
            )
        with gr.Column():
            gr.Markdown("### JSON Schema")
            schema_input = gr.Textbox(
                label="Target Schema",
                placeholder="Enter your JSON schema here...",
                lines=12,
                max_lines=20,
                value=github_schema,
            )

    with gr.Row():
        extract_btn = gr.Button("Extract Data", variant="primary")
        clear_btn = gr.Button("Clear")

    # Outputs: extracted JSON plus the analysis and status summary.
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### Extracted Data")
            output_json = gr.Textbox(
                label="JSON Output",
                lines=15,
                show_copy_button=True,
            )
        with gr.Column(scale=1):
            gr.Markdown("### Results")
            metadata_output = gr.Textbox(label="Analysis", lines=8)
            status_output = gr.Textbox(label="Status")

    gr.Markdown("### Test Cases")
    gr.Examples(
        examples=[
            [github_content, github_schema],
            [resume_content, resume_schema],
            [citation_content, citation_schema],
            [contract_content, contract_schema],
        ],
        inputs=[content_input, schema_input],
        label="Select a test case:",
    )

    gr.Markdown("""
    ### System Features
    - **Schema Complexity**: Supports 6+ levels nesting, 250+ fields, unlimited enums
    - **Document Size**: Handles 50+ page documents and 10MB+ files
    - **Dynamic Scaling**: Cost ranges from $0.01 to $5.00 based on complexity
    - **Quality Assurance**: Confidence scoring with human review routing
    """)

    extract_btn.click(
        fn=extract_wrapper,
        inputs=[content_input, schema_input],
        outputs=[output_json, metadata_output, status_output],
    )

    # Reset both inputs and all three outputs to empty strings.
    clear_btn.click(
        lambda: ("", "", "", "", ""),
        outputs=[
            content_input,
            schema_input,
            output_json,
            metadata_output,
            status_output,
        ],
    )

if __name__ == "__main__":
    app.launch()