File size: 15,727 Bytes
9943368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
857328d
9943368
 
 
 
 
91a9da3
 
 
 
 
 
857328d
 
 
9943368
91a9da3
 
 
 
 
 
 
 
 
 
 
9943368
857328d
91a9da3
9943368
 
91a9da3
9943368
91a9da3
9943368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
857328d
9943368
857328d
 
9943368
857328d
 
 
 
9943368
857328d
 
9943368
857328d
 
 
 
 
 
 
9943368
857328d
9943368
 
 
 
 
 
 
 
 
 
 
857328d
9943368
 
 
 
857328d
 
9943368
857328d
 
 
 
 
 
 
 
 
 
 
 
 
9943368
 
 
 
 
 
 
 
 
 
 
857328d
9943368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
857328d
9943368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
857328d
 
 
 
 
 
9943368
857328d
 
9943368
 
 
857328d
 
 
 
9943368
857328d
 
 
 
9943368
 
 
857328d
 
 
9943368
857328d
9943368
857328d
 
 
9943368
857328d
9943368
 
857328d
 
 
 
9943368
 
 
 
857328d
 
 
 
9943368
 
 
857328d
 
 
 
 
9943368
857328d
9943368
857328d
9943368
 
857328d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9943368
 
 
 
 
857328d
 
9943368
857328d
9943368
857328d
 
 
9943368
857328d
9943368
857328d
9943368
857328d
9943368
 
 
 
857328d
9943368
 
 
 
 
 
 
857328d
9943368
 
 
857328d
 
 
 
9943368
 
 
 
 
 
 
857328d
9943368
 
857328d
 
 
 
 
 
 
 
9943368
 
 
 
 
 
857328d
 
 
9943368
857328d
 
 
 
9943368
 
857328d
 
 
9943368
857328d
9943368
857328d
9943368
857328d
 
 
9943368
857328d
 
 
 
 
 
 
 
 
9943368
 
857328d
 
9943368
857328d
 
 
 
9943368
 
857328d
 
9943368
857328d
 
 
 
9943368
 
 
 
857328d
 
9943368
 
 
857328d
9943368
857328d
 
9943368
 
 
 
857328d
 
 
9943368
857328d
9943368
 
 
 
857328d
9943368
 
 
857328d
9943368
 
 
857328d
 
 
 
 
9943368
 
 
 
 
91a9da3
9943368
 
 
 
91a9da3
9943368
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
import gradio as gr
import json
import asyncio
import os
from main import StructuredExtractionSystem
import time

# Fail fast at import time when the required OpenAI credential is absent,
# so the app never starts half-configured.
_openai_key = os.getenv("OPENAI_API_KEY")
if not _openai_key:
    raise ValueError("OPENAI_API_KEY environment variable is required")

# Single shared extraction pipeline used by every request.
system = StructuredExtractionSystem(_openai_key)

async def extract_data(content, schema_text, progress=gr.Progress()):
    """Parse the schema, run the extraction pipeline, and summarize the result.

    Returns a 3-tuple for the UI: (pretty-printed JSON, analysis text,
    status label). Schema-parse failures and pipeline errors are caught and
    reported as ("", message, error-status) rather than raised.
    """
    try:
        progress(0.1, desc="Parsing schema...")
        schema = json.loads(schema_text)

        progress(0.5, desc="Processing...")
        result = await system.extract_structured_data(content, schema)

        data = result["data"]
        meta = result["extraction_metadata"]
        pretty_json = json.dumps(data, indent=2)

        # Completeness = fraction of top-level schema properties that came
        # back non-empty (None and "" count as missing).
        expected = len(schema.get('properties', {}))
        filled = sum(1 for v in data.values() if v is not None and v != "")
        ratio = filled / expected if expected > 0 else 0

        analysis = (
            f"Fields Extracted: {filled}/{expected} ({ratio:.1%})\n"
            f"Complexity Tier: {meta['complexity_tier']}\n"
            f"Processing Stages: {meta['stages_executed']}\n"
            f"Processing Time: {meta['actual_processing_time']:.2f}s\n"
            f"Schema Compliance: {meta['schema_compliance']:.1%}"
        )

        flags = result.get("review_flags", [])
        # Any of these flags downgrades an otherwise-complete run.
        blocking = {"incomplete_extraction", "low_quality", "schema_violations"}
        if ratio >= 0.8 and not any(f in blocking for f in flags):
            status = "Success"
        elif ratio >= 0.5:
            status = "Partial Success"
        else:
            status = "Incomplete"

        if flags:
            analysis += f"\nIssues: {', '.join(flags)}"

        progress(1.0, desc="Complete")
        return pretty_json, analysis, status

    except json.JSONDecodeError as e:
        return "", f"Invalid JSON Schema: {str(e)}", "Schema Error"
    except Exception as e:
        return "", f"Extraction Error: {str(e)}", "Error"

def extract_wrapper(content, schema_text):
    """Synchronous bridge so the Gradio click handler can run the async extractor."""
    coro = extract_data(content, schema_text)
    return asyncio.run(coro)

# Example 1 — GitHub composite-action metadata: a JSON Schema with
# pattern-keyed inputs/outputs maps, a nested array of run steps, and
# marketplace branding. "name", "description", and "runs" are required.
github_schema = """{
  "type": "object",
  "properties": {
    "name": {"type": "string"},
    "description": {"type": "string"},
    "author": {"type": "string"},
    "inputs": {
      "type": "object",
      "patternProperties": {
        "^[_a-zA-Z][a-zA-Z0-9_-]*$": {
          "type": "object",
          "properties": {
            "description": {"type": "string"},
            "required": {"type": "boolean"},
            "default": {"type": "string"}
          }
        }
      }
    },
    "outputs": {
      "type": "object",
      "patternProperties": {
        "^[_a-zA-Z][a-zA-Z0-9_-]*$": {
          "type": "object",
          "properties": {
            "description": {"type": "string"},
            "value": {"type": "string"}
          }
        }
      }
    },
    "runs": {
      "type": "object",
      "properties": {
        "using": {"type": "string"},
        "steps": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "name": {"type": "string"},
              "uses": {"type": "string"},
              "run": {"type": "string"},
              "shell": {"type": "string"}
            }
          }
        }
      }
    },
    "branding": {
      "type": "object",
      "properties": {
        "color": {"type": "string"},
        "icon": {"type": "string"}
      }
    }
  },
  "required": ["name", "description", "runs"]
}"""

# Matching sample input: a free-form engineering request describing the
# desired MkDocs deploy action in prose.
github_content = """MkDocs Publisher Action

I keep repeating the steps to build and deploy our MkDocs documentation sites to GitHub Pages across different repos. Let's create a reusable composite action to handle this.

Action Name: MkDocs Publisher
Purpose: A simple action to build an MkDocs site and push it to the gh-pages branch. Should be easy to use. Author should be listed as 'DevRel Team'.

Inputs Needed:
python-version: We need to specify the Python version for setting up the environment. Users should be able to choose. Let's make this optional and default it to 3.11. Description: 'The version of Python to set up for building.'
requirements-file: Users might have different requirements files (e.g., requirements.txt, docs/requirements.txt). This input should specify the path. It's required. Description: 'Path to the Python requirements file'.
gh-token: The GitHub token for pushing to gh-pages. This is absolutely required. Description: 'GitHub token for deployment.'

Outputs:
The action needs to output the URL where the site was deployed. Let's call this output page-url. Its description should be 'The URL of the deployed GitHub Pages site.'

How it Runs:
This will be a composite action (using: composite). Here are the steps involved:
Checkout Code: First, we need the repository code. Use the standard actions/checkout@v4.
Setup Python: Next, set up the Python environment. Use actions/setup-python@v5.
Install Dependencies: Run a command to install the Python packages. The command is pip install -r requirements.txt. Execute this using the bash shell.
Build Site: Run the command mkdocs build. Use bash for this too.
Deploy to Pages: Use peaceiris/actions-gh-pages@v3 for deployment.

Branding: For the marketplace look, let's use the color blue and the book-open icon."""

# Example 2 — JSON-Resume-style schema: nested "basics" (with location and
# profiles), plus arrays for work, education, and skills. No required fields.
resume_schema = """{
  "type": "object",
  "properties": {
    "basics": {
      "type": "object",
      "properties": {
        "name": {"type": "string"},
        "label": {"type": "string"},
        "email": {"type": "string"},
        "phone": {"type": "string"},
        "url": {"type": "string"},
        "summary": {"type": "string"},
        "location": {
          "type": "object",
          "properties": {
            "address": {"type": "string"},
            "postalCode": {"type": "string"},
            "city": {"type": "string"},
            "countryCode": {"type": "string"},
            "region": {"type": "string"}
          }
        },
        "profiles": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "network": {"type": "string"},
              "username": {"type": "string"},
              "url": {"type": "string"}
            }
          }
        }
      }
    },
    "work": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "location": {"type": "string"},
          "position": {"type": "string"},
          "startDate": {"type": "string"},
          "endDate": {"type": "string"},
          "highlights": {
            "type": "array",
            "items": {"type": "string"}
          }
        }
      }
    },
    "education": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "institution": {"type": "string"},
          "area": {"type": "string"},
          "studyType": {"type": "string"},
          "startDate": {"type": "string"},
          "endDate": {"type": "string"}
        }
      }
    },
    "skills": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "keywords": {
            "type": "array",
            "items": {"type": "string"}
          }
        }
      }
    }
  }
}"""

# Matching sample input: a plain-text resume with contact details,
# work history, education, and a skills section.
resume_content = """John Doe
Software Engineer
Email: [email protected]
Phone: +1-555-0123
Address: 123 Main St, San Francisco, CA 94105, US
Website: https://johndoe.dev

PROFESSIONAL SUMMARY
Experienced software engineer with 5+ years developing web applications and distributed systems. Skilled in full-stack development with expertise in Python, JavaScript, and cloud technologies.

WORK EXPERIENCE

Senior Software Engineer | TechCorp Inc | San Francisco, CA | 2022 - Present
- Designed and implemented microservices architecture serving 1M+ users
- Led development of real-time data processing pipeline using Apache Kafka
- Reduced system latency by 40% through performance optimization

Software Engineer | StartupXYZ | Palo Alto, CA | 2020 - 2022  
- Built responsive web applications using React and Node.js
- Implemented CI/CD pipelines resulting in 50% faster deployment cycles
- Collaborated with cross-functional teams on product development

EDUCATION

Bachelor of Science in Computer Science | Stanford University | 2016 - 2020
- Relevant Coursework: Data Structures, Algorithms, Database Systems
- Senior Project: Machine Learning Platform for Predictive Analytics

TECHNICAL SKILLS

Programming Languages: Python, JavaScript, Java, Go, SQL
Web Technologies: React, Node.js, HTML/CSS, REST APIs, GraphQL
Cloud & DevOps: AWS, Docker, Kubernetes, Jenkins, Git"""

# Example 3 — Citation File Format (CFF)-style schema: top-level citation
# metadata plus a nested "preferred-citation" object with its own author list
# and integer volume/year fields.
citation_schema = """{
  "type": "object",
  "properties": {
    "cff-version": {"type": "string"},
    "message": {"type": "string"},
    "title": {"type": "string"},
    "authors": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "given-names": {"type": "string"},
          "family-names": {"type": "string"},
          "affiliation": {"type": "string"},
          "orcid": {"type": "string"}
        }
      }
    },
    "type": {"type": "string"},
    "date-published": {"type": "string"},
    "url": {"type": "string"},
    "abstract": {"type": "string"},
    "keywords": {
      "type": "array",
      "items": {"type": "string"}
    },
    "preferred-citation": {
      "type": "object",
      "properties": {
        "type": {"type": "string"},
        "title": {"type": "string"},
        "authors": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "given-names": {"type": "string"},
              "family-names": {"type": "string"}
            }
          }
        },
        "collection-title": {"type": "string"},
        "volume": {"type": "integer"},
        "year": {"type": "integer"},
        "publisher": {
          "type": "object",
          "properties": {
            "name": {"type": "string"}
          }
        }
      }
    }
  }
}"""

# Matching sample input: a prose description of the "Attention Is All You
# Need" paper with authors, venue, URL, and keywords.
citation_content = """Title: Attention Is All You Need
Authors: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Łukasz Kaiser, Illia Polosukhin

This paper introduces the Transformer, a novel neural network architecture based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.

Published in: Advances in Neural Information Processing Systems, Volume 30, 2017
Publisher: Curran Associates, Inc.
URL: https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf

The Transformer uses multi-head self-attention to draw global dependencies between input and output. The model achieves state-of-the-art results on machine translation tasks while being more parallelizable and requiring significantly less time to train.

Keywords: attention mechanism, transformer, neural networks, machine translation, self-attention

This work has become foundational for modern NLP models including BERT, GPT, and T5."""

# Example 4 — contract-extraction schema: parties and deliverables as object
# arrays plus a nested "key_terms" object for legal clauses.
contract_schema = """{
  "type": "object",
  "properties": {
    "contract_type": {"type": "string"},
    "parties": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "type": {"type": "string"},
          "address": {"type": "string"}
        }
      }
    },
    "contract_value": {"type": "string"},
    "payment_terms": {"type": "string"},
    "duration": {"type": "string"},
    "start_date": {"type": "string"},
    "deliverables": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "deadline": {"type": "string"},
          "description": {"type": "string"}
        }
      }
    },
    "key_terms": {
      "type": "object",
      "properties": {
        "liability_cap": {"type": "string"},
        "termination_notice": {"type": "string"},
        "intellectual_property": {"type": "string"}
      }
    }
  }
}"""

# Matching sample input: a plain-text software development agreement with
# parties, payment terms, deliverables, and key legal terms.
contract_content = """SOFTWARE DEVELOPMENT AGREEMENT

This Agreement is made between:
Client: TechCorp Inc., 123 Business Ave, San Francisco, CA 94105
Contractor: DevStudio LLC, 456 Developer St, Austin, TX 78701

Contract Value: $150,000
Payment Terms: 50% upfront, 50% upon completion
Duration: 6 months
Start Date: January 1, 2024

DELIVERABLES:
1. Web Application Development
   - Complete e-commerce platform with user authentication
   - Deadline: March 15, 2024

2. Mobile App Development  
   - iOS and Android applications
   - Deadline: May 1, 2024

3. API Integration
   - Third-party payment processing integration
   - Deadline: April 15, 2024

KEY TERMS:
- Liability is capped at the total contract value
- Either party may terminate with 30 days written notice
- All intellectual property developed under this agreement belongs to the Client
- Contractor agrees to maintain confidentiality of all proprietary information"""

# Gradio UI definition. Layout: two input columns (content + schema), an
# action row, two output columns (JSON + analysis/status), example test
# cases, and a feature blurb. Built at import time; launched in __main__.
with gr.Blocks(title="Unstructured to Structured Converter", theme=gr.themes.Default()) as app:
    gr.Markdown("# Unstructured to Structured JSON Converter")
    gr.Markdown("Convert any unstructured text into structured JSON following complex schemas")
    
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input Content")
            content_input = gr.Textbox(
                label="Document Content",
                placeholder="Enter your unstructured text here...",
                lines=12,
                max_lines=20
            )
            
        with gr.Column():
            gr.Markdown("### JSON Schema")
            # Pre-filled with the GitHub-action schema as a starting point.
            schema_input = gr.Textbox(
                label="Target Schema",
                placeholder="Enter your JSON schema here...",
                lines=12,
                max_lines=20,
                value=github_schema
            )
    
    with gr.Row():
        extract_btn = gr.Button("Extract Data", variant="primary")
        clear_btn = gr.Button("Clear")
    
    with gr.Row():
        # Wider column for the JSON payload, narrower one for the summary.
        with gr.Column(scale=2):
            gr.Markdown("### Extracted Data")
            output_json = gr.Textbox(
                label="JSON Output",
                lines=15,
                show_copy_button=True
            )
            
        with gr.Column(scale=1):
            gr.Markdown("### Results")
            metadata_output = gr.Textbox(label="Analysis", lines=8)
            status_output = gr.Textbox(label="Status")
    
    gr.Markdown("### Test Cases")
    # Clicking an example fills both input boxes with a content/schema pair.
    gr.Examples(
        examples=[
            [github_content, github_schema],
            [resume_content, resume_schema],
            [citation_content, citation_schema],
            [contract_content, contract_schema]
        ],
        inputs=[content_input, schema_input],
        label="Select a test case:"
    )
    
    gr.Markdown("""
    ### System Features
    - **Schema Complexity**: Supports 6+ levels nesting, 250+ fields, unlimited enums
    - **Document Size**: Handles 50+ page documents and 10MB+ files
    - **Dynamic Scaling**: Cost ranges from $0.01 to $5.00 based on complexity
    - **Quality Assurance**: Confidence scoring with human review routing
    """)
    
    extract_btn.click(
        fn=extract_wrapper,
        inputs=[content_input, schema_input],
        outputs=[output_json, metadata_output, status_output]
    )
    
    # Clear resets both inputs and all three outputs. Note this also blanks
    # the schema box's github_schema default rather than restoring it.
    clear_btn.click(
        lambda: ("", "", "", "", ""),
        outputs=[content_input, schema_input, output_json, metadata_output, status_output]
    )

# Start the Gradio server only when run as a script, not on import.
if __name__ == "__main__":
    app.launch()