|
import gradio as gr |
|
import json |
|
import asyncio |
|
import os |
|
from main import StructuredExtractionSystem |
|
import time |
|
|
|
# Fail fast at import time if the OpenAI credential is missing; the whole app
# is useless without it, so a clear startup error beats a runtime failure.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable is required")

# Module-level singleton: one extraction pipeline is shared by every Gradio
# request handled by this process.
system = StructuredExtractionSystem(api_key)
|
|
|
async def extract_data(content, schema_text, progress=gr.Progress()):
    """Run the structured-extraction pipeline over *content* for one request.

    Parameters
    ----------
    content : str
        Raw unstructured document text.
    schema_text : str
        Target JSON Schema as a JSON-encoded string.
    progress : gr.Progress
        Gradio progress tracker (injected by Gradio when wired directly).

    Returns
    -------
    tuple[str, str, str]
        (pretty-printed extracted JSON, human-readable analysis, status label).
        On failure the first element is empty and the other two describe the
        error, so the UI never sees an exception.
    """
    try:
        progress(0.1, desc="Parsing schema...")
        schema = json.loads(schema_text)

        progress(0.5, desc="Processing...")
        result = await system.extract_structured_data(content, schema)

        data = result["data"]
        meta = result["extraction_metadata"]
        pretty_json = json.dumps(data, indent=2)

        # Completeness = share of top-level schema properties that came back
        # with a non-empty value.
        expected = len(schema.get('properties', {}))
        filled = sum(1 for value in data.values() if value is not None and value != "")
        completeness = filled / expected if expected > 0 else 0

        analysis = f"""Fields Extracted: {filled}/{expected} ({completeness:.1%})

Complexity Tier: {meta['complexity_tier']}

Processing Stages: {meta['stages_executed']}

Processing Time: {meta['actual_processing_time']:.2f}s

Schema Compliance: {meta['schema_compliance']:.1%}"""

        flags = result.get("review_flags", [])

        # A run only counts as a full success when it is both mostly complete
        # and free of any blocking quality flag.
        blocking = {"incomplete_extraction", "low_quality", "schema_violations"}
        if completeness >= 0.8 and blocking.isdisjoint(flags):
            status = "Success"
        elif completeness >= 0.5:
            status = "Partial Success"
        else:
            status = "Incomplete"

        if flags:
            analysis += f"\nIssues: {', '.join(flags)}"

        progress(1.0, desc="Complete")
        return pretty_json, analysis, status

    except json.JSONDecodeError as e:
        # The schema textbox held invalid JSON; report it without extracting.
        return "", f"Invalid JSON Schema: {str(e)}", "Schema Error"
    except Exception as e:
        # UI boundary: surface any pipeline failure as a status message.
        return "", f"Extraction Error: {str(e)}", "Error"
|
|
|
def extract_wrapper(content, schema_text):
    """Synchronous facade over the async extractor.

    Builds the coroutine and drives it to completion on a fresh event loop;
    returns the same (json, analysis, status) triple as extract_data.
    """
    coroutine = extract_data(content, schema_text)
    return asyncio.run(coroutine)
|
|
|
# Example 1 — GitHub composite action (action.yml shape).
# Exercises patternProperties (dynamic input/output names), nested objects,
# arrays of step objects, and a top-level "required" list.
github_schema = """{
  "type": "object",
  "properties": {
    "name": {"type": "string"},
    "description": {"type": "string"},
    "author": {"type": "string"},
    "inputs": {
      "type": "object",
      "patternProperties": {
        "^[_a-zA-Z][a-zA-Z0-9_-]*$": {
          "type": "object",
          "properties": {
            "description": {"type": "string"},
            "required": {"type": "boolean"},
            "default": {"type": "string"}
          }
        }
      }
    },
    "outputs": {
      "type": "object",
      "patternProperties": {
        "^[_a-zA-Z][a-zA-Z0-9_-]*$": {
          "type": "object",
          "properties": {
            "description": {"type": "string"},
            "value": {"type": "string"}
          }
        }
      }
    },
    "runs": {
      "type": "object",
      "properties": {
        "using": {"type": "string"},
        "steps": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "name": {"type": "string"},
              "uses": {"type": "string"},
              "run": {"type": "string"},
              "shell": {"type": "string"}
            }
          }
        }
      }
    },
    "branding": {
      "type": "object",
      "properties": {
        "color": {"type": "string"},
        "icon": {"type": "string"}
      }
    }
  },
  "required": ["name", "description", "runs"]
}"""

# Matching unstructured input: a free-form feature request describing the
# action the schema above should be extracted into.
github_content = """MkDocs Publisher Action

I keep repeating the steps to build and deploy our MkDocs documentation sites to GitHub Pages across different repos. Let's create a reusable composite action to handle this.

Action Name: MkDocs Publisher
Purpose: A simple action to build an MkDocs site and push it to the gh-pages branch. Should be easy to use. Author should be listed as 'DevRel Team'.

Inputs Needed:
python-version: We need to specify the Python version for setting up the environment. Users should be able to choose. Let's make this optional and default it to 3.11. Description: 'The version of Python to set up for building.'
requirements-file: Users might have different requirements files (e.g., requirements.txt, docs/requirements.txt). This input should specify the path. It's required. Description: 'Path to the Python requirements file'.
gh-token: The GitHub token for pushing to gh-pages. This is absolutely required. Description: 'GitHub token for deployment.'

Outputs:
The action needs to output the URL where the site was deployed. Let's call this output page-url. Its description should be 'The URL of the deployed GitHub Pages site.'

How it Runs:
This will be a composite action (using: composite). Here are the steps involved:
Checkout Code: First, we need the repository code. Use the standard actions/checkout@v4.
Setup Python: Next, set up the Python environment. Use actions/setup-python@v5.
Install Dependencies: Run a command to install the Python packages. The command is pip install -r requirements.txt. Execute this using the bash shell.
Build Site: Run the command mkdocs build. Use bash for this too.
Deploy to Pages: Use peaceiris/actions-gh-pages@v3 for deployment.

Branding: For the marketplace look, let's use the color blue and the book-open icon."""
|
|
|
# Example 2 — JSON Resume-style schema.
# Exercises deep nesting (basics.location, basics.profiles) plus arrays of
# objects for work history, education, and skills.
resume_schema = """{
  "type": "object",
  "properties": {
    "basics": {
      "type": "object",
      "properties": {
        "name": {"type": "string"},
        "label": {"type": "string"},
        "email": {"type": "string"},
        "phone": {"type": "string"},
        "url": {"type": "string"},
        "summary": {"type": "string"},
        "location": {
          "type": "object",
          "properties": {
            "address": {"type": "string"},
            "postalCode": {"type": "string"},
            "city": {"type": "string"},
            "countryCode": {"type": "string"},
            "region": {"type": "string"}
          }
        },
        "profiles": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "network": {"type": "string"},
              "username": {"type": "string"},
              "url": {"type": "string"}
            }
          }
        }
      }
    },
    "work": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "location": {"type": "string"},
          "position": {"type": "string"},
          "startDate": {"type": "string"},
          "endDate": {"type": "string"},
          "highlights": {
            "type": "array",
            "items": {"type": "string"}
          }
        }
      }
    },
    "education": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "institution": {"type": "string"},
          "area": {"type": "string"},
          "studyType": {"type": "string"},
          "startDate": {"type": "string"},
          "endDate": {"type": "string"}
        }
      }
    },
    "skills": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "keywords": {
            "type": "array",
            "items": {"type": "string"}
          }
        }
      }
    }
  }
}"""

# Matching unstructured input: a plain-text resume to be extracted into the
# schema above.
resume_content = """John Doe
Software Engineer
Email: [email protected]
Phone: +1-555-0123
Address: 123 Main St, San Francisco, CA 94105, US
Website: https://johndoe.dev

PROFESSIONAL SUMMARY
Experienced software engineer with 5+ years developing web applications and distributed systems. Skilled in full-stack development with expertise in Python, JavaScript, and cloud technologies.

WORK EXPERIENCE

Senior Software Engineer | TechCorp Inc | San Francisco, CA | 2022 - Present
- Designed and implemented microservices architecture serving 1M+ users
- Led development of real-time data processing pipeline using Apache Kafka
- Reduced system latency by 40% through performance optimization

Software Engineer | StartupXYZ | Palo Alto, CA | 2020 - 2022
- Built responsive web applications using React and Node.js
- Implemented CI/CD pipelines resulting in 50% faster deployment cycles
- Collaborated with cross-functional teams on product development

EDUCATION

Bachelor of Science in Computer Science | Stanford University | 2016 - 2020
- Relevant Coursework: Data Structures, Algorithms, Database Systems
- Senior Project: Machine Learning Platform for Predictive Analytics

TECHNICAL SKILLS

Programming Languages: Python, JavaScript, Java, Go, SQL
Web Technologies: React, Node.js, HTML/CSS, REST APIs, GraphQL
Cloud & DevOps: AWS, Docker, Kubernetes, Jenkins, Git"""
|
|
|
# Example 3 — Citation File Format (CFF)-style schema.
# Exercises hyphenated property names, integer-typed fields, and a nested
# "preferred-citation" object with its own author array.
citation_schema = """{
  "type": "object",
  "properties": {
    "cff-version": {"type": "string"},
    "message": {"type": "string"},
    "title": {"type": "string"},
    "authors": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "given-names": {"type": "string"},
          "family-names": {"type": "string"},
          "affiliation": {"type": "string"},
          "orcid": {"type": "string"}
        }
      }
    },
    "type": {"type": "string"},
    "date-published": {"type": "string"},
    "url": {"type": "string"},
    "abstract": {"type": "string"},
    "keywords": {
      "type": "array",
      "items": {"type": "string"}
    },
    "preferred-citation": {
      "type": "object",
      "properties": {
        "type": {"type": "string"},
        "title": {"type": "string"},
        "authors": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "given-names": {"type": "string"},
              "family-names": {"type": "string"}
            }
          }
        },
        "collection-title": {"type": "string"},
        "volume": {"type": "integer"},
        "year": {"type": "integer"},
        "publisher": {
          "type": "object",
          "properties": {
            "name": {"type": "string"}
          }
        }
      }
    }
  }
}"""

# Matching unstructured input: paper metadata in free-form prose.
citation_content = """Title: Attention Is All You Need
Authors: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Łukasz Kaiser, Illia Polosukhin

This paper introduces the Transformer, a novel neural network architecture based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.

Published in: Advances in Neural Information Processing Systems, Volume 30, 2017
Publisher: Curran Associates, Inc.
URL: https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf

The Transformer uses multi-head self-attention to draw global dependencies between input and output. The model achieves state-of-the-art results on machine translation tasks while being more parallelizable and requiring significantly less time to train.

Keywords: attention mechanism, transformer, neural networks, machine translation, self-attention

This work has become foundational for modern NLP models including BERT, GPT, and T5."""
|
|
|
# Example 4 — legal-contract extraction schema.
# Exercises an array of party objects, an array of deliverables with
# deadlines, and a nested key_terms object of free-text clauses.
contract_schema = """{
  "type": "object",
  "properties": {
    "contract_type": {"type": "string"},
    "parties": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "type": {"type": "string"},
          "address": {"type": "string"}
        }
      }
    },
    "contract_value": {"type": "string"},
    "payment_terms": {"type": "string"},
    "duration": {"type": "string"},
    "start_date": {"type": "string"},
    "deliverables": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "deadline": {"type": "string"},
          "description": {"type": "string"}
        }
      }
    },
    "key_terms": {
      "type": "object",
      "properties": {
        "liability_cap": {"type": "string"},
        "termination_notice": {"type": "string"},
        "intellectual_property": {"type": "string"}
      }
    }
  }
}"""

# Matching unstructured input: a plain-text services agreement.
contract_content = """SOFTWARE DEVELOPMENT AGREEMENT

This Agreement is made between:
Client: TechCorp Inc., 123 Business Ave, San Francisco, CA 94105
Contractor: DevStudio LLC, 456 Developer St, Austin, TX 78701

Contract Value: $150,000
Payment Terms: 50% upfront, 50% upon completion
Duration: 6 months
Start Date: January 1, 2024

DELIVERABLES:
1. Web Application Development
- Complete e-commerce platform with user authentication
- Deadline: March 15, 2024

2. Mobile App Development
- iOS and Android applications
- Deadline: May 1, 2024

3. API Integration
- Third-party payment processing integration
- Deadline: April 15, 2024

KEY TERMS:
- Liability is capped at the total contract value
- Either party may terminate with 30 days written notice
- All intellectual property developed under this agreement belongs to the Client
- Contractor agrees to maintain confidentiality of all proprietary information"""
|
|
|
# Assemble the Gradio UI: paired input textboxes, action buttons, a results
# pane, canned example pairs for each document type, and event wiring.
with gr.Blocks(title="Unstructured to Structured Converter", theme=gr.themes.Default()) as app:
    gr.Markdown("# Unstructured to Structured JSON Converter")
    gr.Markdown("Convert any unstructured text into structured JSON following complex schemas")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input Content")
            content_input = gr.Textbox(
                label="Document Content",
                placeholder="Enter your unstructured text here...",
                lines=12,
                max_lines=20
            )

        with gr.Column():
            gr.Markdown("### JSON Schema")
            schema_input = gr.Textbox(
                label="Target Schema",
                placeholder="Enter your JSON schema here...",
                lines=12,
                max_lines=20,
                value=github_schema  # pre-fill with the GitHub-action example
            )

    with gr.Row():
        extract_btn = gr.Button("Extract Data", variant="primary")
        clear_btn = gr.Button("Clear")

    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### Extracted Data")
            output_json = gr.Textbox(
                label="JSON Output",
                lines=15,
                show_copy_button=True
            )

        with gr.Column(scale=1):
            gr.Markdown("### Results")
            metadata_output = gr.Textbox(label="Analysis", lines=8)
            status_output = gr.Textbox(label="Status")

    gr.Markdown("### Test Cases")
    gr.Examples(
        examples=[
            [github_content, github_schema],
            [resume_content, resume_schema],
            [citation_content, citation_schema],
            [contract_content, contract_schema]
        ],
        inputs=[content_input, schema_input],
        label="Select a test case:"
    )

    gr.Markdown("""
    ### System Features
    - **Schema Complexity**: Supports 6+ levels nesting, 250+ fields, unlimited enums
    - **Document Size**: Handles 50+ page documents and 10MB+ files
    - **Dynamic Scaling**: Cost ranges from $0.01 to $5.00 based on complexity
    - **Quality Assurance**: Confidence scoring with human review routing
    """)

    # Fix: wire the async handler directly instead of the sync extract_wrapper.
    # Gradio awaits async event handlers natively, and only when it calls
    # extract_data itself can it inject and track the gr.Progress() parameter —
    # routing through the wrapper silently disabled the progress bar.
    extract_btn.click(
        fn=extract_data,
        inputs=[content_input, schema_input],
        outputs=[output_json, metadata_output, status_output]
    )

    # Reset both inputs and all three output fields in one shot.
    clear_btn.click(
        lambda: ("", "", "", "", ""),
        outputs=[content_input, schema_input, output_json, metadata_output, status_output]
    )

if __name__ == "__main__":
    app.launch()