arjunanand13 committed
Commit 9943368 · verified · Parent: 8be0d39

Create app.py

Files changed (1):
  app.py (+520 -0)
app.py ADDED

# Gradio front end for the StructuredExtractionSystem defined in main.py.
import gradio as gr
import json
import asyncio
import os
from main import StructuredExtractionSystem
import time

# Fail fast if the OpenAI key is missing, since every extraction needs it.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable is required")

system = StructuredExtractionSystem(api_key)

async def extract_data(content, schema_text, progress=gr.Progress()):
    """Run one extraction and format the results for the UI."""
    try:
        progress(0.1, desc="Parsing schema...")
        schema = json.loads(schema_text)

        progress(0.3, desc="Analyzing complexity...")

        progress(0.5, desc="Extracting data...")
        result = await system.extract_structured_data(content, schema)

        progress(0.9, desc="Finalizing results...")

        extracted_data = json.dumps(result["data"], indent=2)
        confidence = f"{result['overall_confidence']:.1%}"
        metadata = result["extraction_metadata"]

        complexity_info = f"""
**Schema Analysis:**
- Complexity Tier: {metadata['complexity_tier']}
- Processing Stages: {metadata['stages_executed']}
- Estimated Cost: ${metadata['estimated_cost']:.3f}
- Processing Time: {metadata['actual_processing_time']:.2f}s
- Schema Compliance: {metadata['schema_compliance']:.1%}
"""

        # Surface any fields the system flagged for human review.
        review_info = ""
        if result["review_flags"]:
            review_info = f"\n**Review Required:** {', '.join(result['review_flags'])}"
            review_info += f"\nEstimated Review Time: {metadata['recommended_review_time']} minutes"

        progress(1.0, desc="Complete!")

        return extracted_data, confidence, complexity_info + review_info, "✅ Success"

    except json.JSONDecodeError as e:
        return "", "0%", f"❌ Invalid JSON Schema: {str(e)}", "❌ Schema Error"
    except Exception as e:
        return "", "0%", f"❌ Extraction Error: {str(e)}", "❌ Error"

def extract_wrapper(content, schema_text):
    # Synchronous wrapper so the async extraction can be used as a Gradio callback.
    return asyncio.run(extract_data(content, schema_text))
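
# Result shape assumed by extract_data above, inferred from how the dict is
# used (main.py, which defines StructuredExtractionSystem, is not part of
# this commit; the field values below are illustrative):
#
#   {
#       "data": {...},                     # extracted JSON matching the schema
#       "overall_confidence": 0.92,        # float in [0, 1]
#       "review_flags": ["field_a"],       # fields routed to human review
#       "extraction_metadata": {
#           "complexity_tier": 2,
#           "stages_executed": 3,
#           "estimated_cost": 0.12,
#           "actual_processing_time": 18.4,
#           "schema_compliance": 0.98,
#           "recommended_review_time": 5
#       }
#   }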

github_schema = """{
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "description": {"type": "string"},
        "author": {"type": "string"},
        "inputs": {
            "type": "object",
            "patternProperties": {
                "^[_a-zA-Z][a-zA-Z0-9_-]*$": {
                    "type": "object",
                    "properties": {
                        "description": {"type": "string"},
                        "required": {"type": "boolean"},
                        "default": {"type": "string"}
                    }
                }
            }
        },
        "outputs": {
            "type": "object",
            "patternProperties": {
                "^[_a-zA-Z][a-zA-Z0-9_-]*$": {
                    "type": "object",
                    "properties": {
                        "description": {"type": "string"},
                        "value": {"type": "string"}
                    }
                }
            }
        },
        "runs": {
            "type": "object",
            "properties": {
                "using": {"type": "string"},
                "steps": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "uses": {"type": "string"},
                            "run": {"type": "string"},
                            "shell": {"type": "string"}
                        }
                    }
                }
            }
        },
        "branding": {
            "type": "object",
            "properties": {
                "color": {"type": "string"},
                "icon": {"type": "string"}
            }
        }
    },
    "required": ["name", "description", "runs"]
}"""

github_content = """MkDocs Publisher Action

This is a composite action that builds an MkDocs documentation site and deploys it to GitHub Pages.
It's designed to be reusable across multiple repositories.

Author: DevRel Team

The action requires:
- python-version: Python version to use (default: 3.11)
- requirements-file: Path to requirements file (required)
- gh-token: GitHub token for deployment (required)

The action outputs the URL where the site was deployed.

The action runs these steps:
1. Checkout the repository code using actions/checkout@v4
2. Setup Python environment using actions/setup-python@v5
3. Install dependencies: pip install -r requirements.txt
4. Build the MkDocs site: mkdocs build
5. Deploy to GitHub Pages using peaceiris/actions-gh-pages@v3

Branding: Use blue color with book-open icon."""
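
# For illustration, a hand-written extraction from the sample above that
# conforms to github_schema (not actual model output) could look like:
#
#   {
#       "name": "MkDocs Publisher Action",
#       "description": "Builds an MkDocs documentation site and deploys it to GitHub Pages",
#       "author": "DevRel Team",
#       "inputs": {
#           "python-version": {"description": "Python version to use", "required": false, "default": "3.11"},
#           "requirements-file": {"description": "Path to requirements file", "required": true}
#       },
#       "runs": {"using": "composite", "steps": [{"uses": "actions/checkout@v4"}]}
#   }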

resume_schema = """{
    "type": "object",
    "properties": {
        "basics": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "label": {"type": "string"},
                "email": {"type": "string"},
                "phone": {"type": "string"},
                "website": {"type": "string"},
                "summary": {"type": "string"},
                "location": {
                    "type": "object",
                    "properties": {
                        "city": {"type": "string"},
                        "region": {"type": "string"},
                        "countryCode": {"type": "string"}
                    }
                }
            }
        },
        "work": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "position": {"type": "string"},
                    "location": {"type": "string"},
                    "startDate": {"type": "string"},
                    "endDate": {"type": "string"},
                    "highlights": {
                        "type": "array",
                        "items": {"type": "string"}
                    }
                }
            }
        },
        "education": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "institution": {"type": "string"},
                    "area": {"type": "string"},
                    "studyType": {"type": "string"},
                    "startDate": {"type": "string"},
                    "endDate": {"type": "string"},
                    "score": {"type": "string"}
                }
            }
        },
        "skills": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "keywords": {
                        "type": "array",
                        "items": {"type": "string"}
                    }
                }
            }
        }
    }
}"""

resume_content = """Sarah Chen - Senior AI Research Scientist
Email: [email protected] | Phone: +1-555-0123
Location: Palo Alto, California, United States
Website: https://sarahchen.ai

SUMMARY
Experienced AI research scientist with 8+ years in machine learning, deep learning, and natural language processing. Led teams that developed production ML systems serving millions of users.

WORK EXPERIENCE

Senior AI Research Scientist | OpenAI | 2021 - Present | San Francisco, CA
• Led development of GPT-4 training infrastructure, improving training efficiency by 40%
• Designed novel attention mechanisms for transformer architectures
• Managed team of 12 researchers across multiple ML projects

Machine Learning Engineer | Google Brain | 2019 - 2021 | Mountain View, CA
• Developed recommendation systems serving 500M+ users daily
• Implemented distributed training frameworks for large-scale models
• Reduced model inference latency by 60% through optimization techniques

EDUCATION

Ph.D. Computer Science | Stanford University | 2013 - 2017 | Stanford, CA
Dissertation: "Efficient Training of Large-Scale Neural Networks"
GPA: 3.95/4.0

M.S. Computer Science | MIT | 2011 - 2013 | Cambridge, MA
Concentration: Artificial Intelligence | GPA: 3.9/4.0

SKILLS
Programming: Python, C++, JavaScript, CUDA, PyTorch, TensorFlow
Machine Learning: Deep Learning, NLP, Computer Vision, Reinforcement Learning
Cloud Platforms: AWS, GCP, Azure, Kubernetes, Docker"""

email_schema = """{
    "type": "object",
    "properties": {
        "participants": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "email": {"type": "string"},
                    "role": {"type": "string"},
                    "organization": {"type": "string"}
                }
            }
        },
        "requirements": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "id": {"type": "string"},
                    "description": {"type": "string"},
                    "priority": {"type": "string"},
                    "status": {"type": "string"},
                    "source_stakeholder": {"type": "string"}
                }
            }
        },
        "decisions": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "decision": {"type": "string"},
                    "rationale": {"type": "string"},
                    "stakeholders_involved": {"type": "array", "items": {"type": "string"}},
                    "implementation_impact": {"type": "string"}
                }
            }
        },
        "timeline": {
            "type": "object",
            "properties": {
                "start_date": {"type": "string"},
                "key_milestones": {"type": "array", "items": {"type": "string"}},
                "final_deadline": {"type": "string"}
            }
        }
    }
}"""

email_content = """From: [email protected]

Subject: API Rate Limiting Requirements - Final Decision

Hi team,

After our discussion yesterday, I wanted to confirm the final requirements for the API rate limiting feature:

REQ-001: Implement per-user rate limiting at 1000 requests/hour (HIGH priority)
REQ-002: Add burst capacity of 100 requests/minute (MEDIUM priority)
REQ-003: Provide rate limit headers in API responses (HIGH priority)
REQ-004: Create rate limit monitoring dashboard (LOW priority)

Decision: We'll use Redis for rate limiting storage instead of in-memory due to scalability concerns raised by Mike.
Rationale: Redis provides persistence and can scale across multiple API instances.

Implementation impact: Will require Redis infrastructure setup but provides better long-term scalability.

Timeline:
- Start development: January 15, 2024
- Feature complete: February 28, 2024
- Production deployment: March 15, 2024

Let me know if you have any questions.

Best regards,
John Smith - Product Manager, Acme Corp
Sarah Johnson - Lead Engineer, TechCorp
Mike Brown - DevOps Lead, Acme Corp"""

contract_schema = """{
    "type": "object",
    "properties": {
        "parties": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "type": {"type": "string"},
                    "address": {"type": "string"},
                    "role": {"type": "string"}
                }
            }
        },
        "contract_details": {
            "type": "object",
            "properties": {
                "contract_value": {"type": "string"},
                "currency": {"type": "string"},
                "payment_terms": {"type": "string"},
                "contract_duration": {"type": "string"},
                "start_date": {"type": "string"},
                "end_date": {"type": "string"}
            }
        },
        "key_terms": {
            "type": "object",
            "properties": {
                "liability_cap": {"type": "string"},
                "termination_clause": {"type": "string"},
                "intellectual_property": {"type": "string"},
                "confidentiality_period": {"type": "string"}
            }
        },
        "deliverables": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "description": {"type": "string"},
                    "deadline": {"type": "string"},
                    "acceptance_criteria": {"type": "string"}
                }
            }
        }
    }
}"""

contract_content = """SOFTWARE DEVELOPMENT AGREEMENT

This Software Development Agreement ("Agreement") is entered into on January 1, 2024, between:

TechCorp Inc., a Delaware corporation with offices at 123 Silicon Valley Blvd, San Francisco, CA 94105 ("Client")

AND

DevSolutions LLC, a California limited liability company with offices at 456 Innovation Drive, Palo Alto, CA 94301 ("Developer")

CONTRACT TERMS:
- Total Contract Value: $2,500,000 USD
- Payment Terms: Net 30 days
- Contract Duration: 18 months
- Start Date: January 15, 2024
- End Date: July 15, 2025

KEY PROVISIONS:
- Liability Cap: Limited to total contract value ($2.5M)
- Termination: Either party may terminate with 90 days written notice
- Intellectual Property: All developed IP remains with Client
- Confidentiality: 5-year confidentiality period post-contract

DELIVERABLES:
1. API Platform Development
   - Complete REST API platform with authentication
   - Deadline: June 1, 2024
   - Acceptance: Must pass security audit and performance tests

2. Mobile Application
   - iOS and Android applications
   - Deadline: October 1, 2024
   - Acceptance: App store approval and user acceptance testing

3. Documentation & Training
   - Complete technical documentation and user training
   - Deadline: December 1, 2024
   - Acceptance: Training completion by 95% of users"""

with gr.Blocks(title="Unstructured to Structured JSON Converter", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
# 🔄 Unstructured to Structured JSON Converter

**A production-ready system for extracting structured data from unstructured text following complex JSON schemas.**

✨ **Key Features:**
- Supports unlimited schema complexity (6+ levels, 250+ fields, 500+ enums)
- Handles large documents (50+ pages, 10MB+ files)
- Dynamic resource allocation ($0.01-$5.00 based on complexity)
- Confidence-based quality assessment with human review routing

📊 **Performance:** 97-99% time savings vs manual processing with 85-95% accuracy
""")

    # Side-by-side inputs: raw text on the left, target schema on the right.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Input Content")
            content_input = gr.Textbox(
                label="Unstructured Text Content",
                placeholder="Paste your document content here...",
                lines=15,
                max_lines=25
            )

        with gr.Column(scale=1):
            gr.Markdown("### 🗂️ JSON Schema")
            schema_input = gr.Textbox(
                label="Target JSON Schema",
                placeholder="Paste your JSON schema here...",
                lines=15,
                max_lines=25,
                value=github_schema
            )

    with gr.Row():
        extract_btn = gr.Button("🚀 Extract Structured Data", variant="primary", size="lg")
        clear_btn = gr.Button("🗑️ Clear", variant="secondary")

    # Results: extracted JSON plus confidence, metadata, and status panels.
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### 📤 Extracted JSON Data")
            output_json = gr.Textbox(
                label="Structured Output",
                lines=20,
                max_lines=30,
                show_copy_button=True
            )

        with gr.Column(scale=1):
            gr.Markdown("### 📊 Analysis Results")
            confidence_output = gr.Textbox(label="Overall Confidence", interactive=False)
            metadata_output = gr.Textbox(
                label="Processing Metadata",
                lines=12,
                interactive=False
            )
            status_output = gr.Textbox(label="Status", interactive=False)

    gr.Markdown("### 🎯 Example Test Cases")
    gr.Examples(
        examples=[
            [github_content, github_schema],
            [resume_content, resume_schema],
            [email_content, email_schema],
            [contract_content, contract_schema]
        ],
        inputs=[content_input, schema_input],
        label="Click any example to load it:",
        examples_per_page=4
    )

    gr.Markdown("""
### 🔧 How It Works

1. **Schema Analysis**: Analyzes complexity (depth, fields, objects) and creates an optimal extraction plan
2. **Document Processing**: Handles large documents with semantic chunking and context preservation
3. **Multi-Stage Extraction**: Uses hierarchical processing with dynamic model selection
4. **Quality Assessment**: Provides confidence scores and flags uncertain fields for human review

### 📈 Complexity Tiers

| Tier | Depth | Fields | Cost | Time | Use Case |
|------|-------|--------|------|------|----------|
| **1 (Simple)** | ≤2 levels | ≤20 | $0.01-0.05 | 5-15s | Forms, basic extraction |
| **2 (Medium)** | ≤4 levels | ≤100 | $0.08-0.25 | 15-45s | API docs, structured reports |
| **3 (Complex)** | >4 levels | >100 | $0.30-2.00 | 45-120s | Legal docs, research papers |

### 🎓 Schema Examples

**GitHub Actions** (Medium): Action metadata with inputs/outputs
**Resume/CV** (Complex): Personal profile with work history and skills
**Email Chains** (Complex): Requirements extraction from stakeholder communications
**Legal Contracts** (Complex): Contract terms, parties, and deliverables
""")

    extract_btn.click(
        fn=extract_wrapper,
        inputs=[content_input, schema_input],
        outputs=[output_json, confidence_output, metadata_output, status_output]
    )

    # Reset both inputs and the result panels (status keeps its last value).
    clear_btn.click(
        lambda: ("", "", "", "", ""),
        outputs=[content_input, schema_input, output_json, confidence_output, metadata_output]
    )

if __name__ == "__main__":
    app.launch()
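
The complexity tiers shown in the UI come from main.py, which is not included in this commit. As a rough, minimal sketch of how such a tier could be derived from a schema alone, the helper below walks nesting depth and counts declared fields; the name estimate_tier, the traversal, and the exact thresholds are assumptions that simply mirror the table in the interface:

def estimate_tier(schema: dict) -> int:
    """Hypothetical tier heuristic mirroring the UI's complexity table."""

    def walk(node: dict, depth: int) -> tuple:
        # Count declared fields and track the deepest nesting level reached.
        fields, max_depth = 0, depth
        for key in ("properties", "patternProperties"):
            for child in node.get(key, {}).values():
                f, d = walk(child, depth + 1)
                fields += 1 + f
                max_depth = max(max_depth, d)
        if isinstance(node.get("items"), dict):
            f, d = walk(node["items"], depth + 1)
            fields += f
            max_depth = max(max_depth, d)
        return fields, max_depth

    fields, depth = walk(schema, 1)
    if depth <= 2 and fields <= 20:
        return 1  # Simple: forms, basic extraction
    if depth <= 4 and fields <= 100:
        return 2  # Medium: API docs, structured reports
    return 3  # Complex: legal docs, research papers

# Example: a flat, two-level schema with one field lands in tier 1.
# estimate_tier({"type": "object", "properties": {"name": {"type": "string"}}})  # -> 1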