# app.py - Gradio web UI for the "Unstructured to Structured JSON Converter" (added as a new 520-line file).
import asyncio
import json
import os
import time

import gradio as gr

from main import StructuredExtractionSystem

# Fail fast at import time: the extraction backend below is constructed with
# the OpenAI key, so a missing or empty key is reported before any UI is built.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable is required")

# Single shared extraction backend; extract_data() awaits its
# extract_structured_data() coroutine for every request.
system = StructuredExtractionSystem(api_key)

async def extract_data(content, schema_text, progress=gr.Progress()):
    """Run one extraction and format the result for the four output widgets.

    Args:
        content: Unstructured text to extract from.
        schema_text: Target JSON Schema as a JSON-encoded string.
        progress: Gradio progress tracker; called with a fraction and a label
            at each stage.

    Returns:
        A 4-tuple of strings ``(extracted_json, confidence, metadata, status)``.
        Failures are reported through the same tuple as
        ``("", "0%", <error message>, <error status>)``; no exception
        propagates to the caller.
    """
    # Status glyphs shown in the metadata and status boxes.
    ok_mark = "✅"
    fail_mark = "❌"

    try:
        progress(0.1, desc="Parsing schema...")
        schema = json.loads(schema_text)

        # No work happens between this update and the next one; it only
        # advances the progress bar.
        progress(0.3, desc="Analyzing complexity...")

        progress(0.5, desc="Extracting data...")
        result = await system.extract_structured_data(content, schema)

        progress(0.9, desc="Finalizing results...")

        extracted_data = json.dumps(result["data"], indent=2)
        confidence = f"{result['overall_confidence']:.1%}"
        metadata = result["extraction_metadata"]

        complexity_info = (
            "\n**Schema Analysis:**\n"
            f"- Complexity Tier: {metadata['complexity_tier']}\n"
            f"- Processing Stages: {metadata['stages_executed']}\n"
            f"- Estimated Cost: ${metadata['estimated_cost']:.3f}\n"
            f"- Processing Time: {metadata['actual_processing_time']:.2f}s\n"
            f"- Schema Compliance: {metadata['schema_compliance']:.1%}\n"
        )

        # The review section is appended only when the backend flagged fields.
        review_info = ""
        if result["review_flags"]:
            review_info = f"\n**Review Required:** {', '.join(result['review_flags'])}"
            review_info += f"\nEstimated Review Time: {metadata['recommended_review_time']} minutes"

        progress(1.0, desc="Complete!")

        return extracted_data, confidence, complexity_info + review_info, f"{ok_mark} Success"

    except json.JSONDecodeError as e:
        # Malformed schema text: reported separately from extraction failures.
        return "", "0%", f"{fail_mark} Invalid JSON Schema: {e}", f"{fail_mark} Schema Error"
    except Exception as e:
        # UI boundary: any backend failure becomes a message in the status
        # widgets instead of an unhandled error in the event handler.
        return "", "0%", f"{fail_mark} Extraction Error: {e}", f"{fail_mark} Error"

def extract_wrapper(content, schema_text, progress=gr.Progress()):
    """Synchronous entry point used as the Extract button's click handler.

    Runs the ``extract_data`` coroutine to completion with ``asyncio.run`` and
    returns its 4-tuple unchanged.

    Args:
        content: Unstructured text to extract from.
        schema_text: Target JSON Schema as a JSON-encoded string.
        progress: Progress tracker. It is declared on this function because
            this is the callable registered with ``extract_btn.click``;
            Gradio's documented pattern is to declare the tracker as a
            handler parameter whose default is ``gr.Progress()``. It is
            forwarded so the stage updates made inside ``extract_data`` go to
            that tracker rather than to a fresh default instance.

    Returns:
        The ``(extracted_json, confidence, metadata, status)`` tuple produced
        by ``extract_data``.
    """
    return asyncio.run(extract_data(content, schema_text, progress))

# Example schema 1: GitHub Action metadata (name/description/author, named
# inputs and outputs keyed by identifier pattern, run steps, branding).
# Also used as the initial value of the schema textbox.
# Pretty-printed for readability in the UI; JSON whitespace is insignificant.
github_schema = """{
  "type": "object",
  "properties": {
    "name": {"type": "string"},
    "description": {"type": "string"},
    "author": {"type": "string"},
    "inputs": {
      "type": "object",
      "patternProperties": {
        "^[_a-zA-Z][a-zA-Z0-9_-]*$": {
          "type": "object",
          "properties": {
            "description": {"type": "string"},
            "required": {"type": "boolean"},
            "default": {"type": "string"}
          }
        }
      }
    },
    "outputs": {
      "type": "object",
      "patternProperties": {
        "^[_a-zA-Z][a-zA-Z0-9_-]*$": {
          "type": "object",
          "properties": {
            "description": {"type": "string"},
            "value": {"type": "string"}
          }
        }
      }
    },
    "runs": {
      "type": "object",
      "properties": {
        "using": {"type": "string"},
        "steps": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "name": {"type": "string"},
              "uses": {"type": "string"},
              "run": {"type": "string"},
              "shell": {"type": "string"}
            }
          }
        }
      }
    },
    "branding": {
      "type": "object",
      "properties": {
        "color": {"type": "string"},
        "icon": {"type": "string"}
      }
    }
  },
  "required": ["name", "description", "runs"]
}"""

# Example input 1: free-text description of a composite GitHub Action,
# paired with github_schema in the examples list.
github_content = """MkDocs Publisher Action

This is a composite action that builds an MkDocs documentation site and deploys it to GitHub Pages.
It's designed to be reusable across multiple repositories.

Author: DevRel Team

The action requires:
- python-version: Python version to use (default: 3.11)
- requirements-file: Path to requirements file (required)
- gh-token: GitHub token for deployment (required)

The action outputs the URL where the site was deployed.

The action runs these steps:
1. Checkout the repository code using actions/checkout@v4
2. Setup Python environment using actions/setup-python@v5
3. Install dependencies: pip install -r requirements.txt
4. Build the MkDocs site: mkdocs build
5. Deploy to GitHub Pages using peaceiris/actions-gh-pages@v3

Branding: Use blue color with book-open icon."""

# Example schema 2: resume/CV with a "basics" profile (including a nested
# location object) and arrays for work history, education and skills.
# Pretty-printed for readability in the UI; JSON whitespace is insignificant.
resume_schema = """{
  "type": "object",
  "properties": {
    "basics": {
      "type": "object",
      "properties": {
        "name": {"type": "string"},
        "label": {"type": "string"},
        "email": {"type": "string"},
        "phone": {"type": "string"},
        "website": {"type": "string"},
        "summary": {"type": "string"},
        "location": {
          "type": "object",
          "properties": {
            "city": {"type": "string"},
            "region": {"type": "string"},
            "countryCode": {"type": "string"}
          }
        }
      }
    },
    "work": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "position": {"type": "string"},
          "location": {"type": "string"},
          "startDate": {"type": "string"},
          "endDate": {"type": "string"},
          "highlights": {
            "type": "array",
            "items": {"type": "string"}
          }
        }
      }
    },
    "education": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "institution": {"type": "string"},
          "area": {"type": "string"},
          "studyType": {"type": "string"},
          "startDate": {"type": "string"},
          "endDate": {"type": "string"},
          "score": {"type": "string"}
        }
      }
    },
    "skills": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "keywords": {
            "type": "array",
            "items": {"type": "string"}
          }
        }
      }
    }
  }
}"""

# Example input 2: plain-text resume, paired with resume_schema in the
# examples list.
# NOTE(review): the e-mail address reads "[email protected]" in this copy, which
# looks like an address-obfuscation placeholder rather than sample data --
# restore the intended sample address from the upstream file.
resume_content = """Sarah Chen - Senior AI Research Scientist
Email: [email protected] | Phone: +1-555-0123
Location: Palo Alto, California, United States
Website: https://sarahchen.ai

SUMMARY
Experienced AI research scientist with 8+ years in machine learning, deep learning, and natural language processing. Led teams that developed production ML systems serving millions of users.

WORK EXPERIENCE

Senior AI Research Scientist | OpenAI | 2021 - Present | San Francisco, CA
• Led development of GPT-4 training infrastructure, improving training efficiency by 40%
• Designed novel attention mechanisms for transformer architectures
• Managed team of 12 researchers across multiple ML projects

Machine Learning Engineer | Google Brain | 2019 - 2021 | Mountain View, CA
• Developed recommendation systems serving 500M+ users daily
• Implemented distributed training frameworks for large-scale models
• Reduced model inference latency by 60% through optimization techniques

EDUCATION

Ph.D. Computer Science | Stanford University | 2013 - 2017 | Stanford, CA
Dissertation: "Efficient Training of Large-Scale Neural Networks"
GPA: 3.95/4.0

M.S. Computer Science | MIT | 2011 - 2013 | Cambridge, MA
Concentration: Artificial Intelligence | GPA: 3.9/4.0

SKILLS
Programming: Python, C++, JavaScript, CUDA, PyTorch, TensorFlow
Machine Learning: Deep Learning, NLP, Computer Vision, Reinforcement Learning
Cloud Platforms: AWS, GCP, Azure, Kubernetes, Docker"""

# Example schema 3: requirements e-mail thread -- participants, numbered
# requirements, decisions with rationale, and a delivery timeline.
# Pretty-printed for readability in the UI; JSON whitespace is insignificant.
email_schema = """{
  "type": "object",
  "properties": {
    "participants": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "email": {"type": "string"},
          "role": {"type": "string"},
          "organization": {"type": "string"}
        }
      }
    },
    "requirements": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "id": {"type": "string"},
          "description": {"type": "string"},
          "priority": {"type": "string"},
          "status": {"type": "string"},
          "source_stakeholder": {"type": "string"}
        }
      }
    },
    "decisions": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "decision": {"type": "string"},
          "rationale": {"type": "string"},
          "stakeholders_involved": {"type": "array", "items": {"type": "string"}},
          "implementation_impact": {"type": "string"}
        }
      }
    },
    "timeline": {
      "type": "object",
      "properties": {
        "start_date": {"type": "string"},
        "key_milestones": {"type": "array", "items": {"type": "string"}},
        "final_deadline": {"type": "string"}
      }
    }
  }
}"""

# Example input 3: requirements e-mail, paired with email_schema in the
# examples list.
# NOTE(review): two things could not be recovered from this copy of the file
# and should be restored from upstream:
#   * the sender reads "[email protected]", which looks like an
#     address-obfuscation placeholder rather than sample data;
#   * the header line that originally followed "From:" (presumably the
#     recipients) is absent, so the text goes straight to "Subject:".
email_content = """From: [email protected]
Subject: API Rate Limiting Requirements - Final Decision

Hi team,

After our discussion yesterday, I wanted to confirm the final requirements for the API rate limiting feature:

REQ-001: Implement per-user rate limiting at 1000 requests/hour (HIGH priority)
REQ-002: Add burst capacity of 100 requests/minute (MEDIUM priority)
REQ-003: Provide rate limit headers in API responses (HIGH priority)
REQ-004: Create rate limit monitoring dashboard (LOW priority)

Decision: We'll use Redis for rate limiting storage instead of in-memory due to scalability concerns raised by Mike.
Rationale: Redis provides persistence and can scale across multiple API instances.

Implementation impact: Will require Redis infrastructure setup but provides better long-term scalability.

Timeline:
- Start development: January 15, 2024
- Feature complete: February 28, 2024
- Production deployment: March 15, 2024

Let me know if you have any questions.

Best regards,
John Smith - Product Manager, Acme Corp
Sarah Johnson - Lead Engineer, TechCorp
Mike Brown - DevOps Lead, Acme Corp"""

# Example schema 4: contract summary -- parties, commercial details, key
# legal terms, and a list of deliverables.
# Pretty-printed for readability in the UI; JSON whitespace is insignificant.
contract_schema = """{
  "type": "object",
  "properties": {
    "parties": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "type": {"type": "string"},
          "address": {"type": "string"},
          "role": {"type": "string"}
        }
      }
    },
    "contract_details": {
      "type": "object",
      "properties": {
        "contract_value": {"type": "string"},
        "currency": {"type": "string"},
        "payment_terms": {"type": "string"},
        "contract_duration": {"type": "string"},
        "start_date": {"type": "string"},
        "end_date": {"type": "string"}
      }
    },
    "key_terms": {
      "type": "object",
      "properties": {
        "liability_cap": {"type": "string"},
        "termination_clause": {"type": "string"},
        "intellectual_property": {"type": "string"},
        "confidentiality_period": {"type": "string"}
      }
    },
    "deliverables": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "description": {"type": "string"},
          "deadline": {"type": "string"},
          "acceptance_criteria": {"type": "string"}
        }
      }
    }
  }
}"""

# Example input 4: software development agreement, paired with
# contract_schema in the examples list.
contract_content = """SOFTWARE DEVELOPMENT AGREEMENT

This Software Development Agreement ("Agreement") is entered into on January 1, 2024, between:

TechCorp Inc., a Delaware corporation with offices at 123 Silicon Valley Blvd, San Francisco, CA 94105 ("Client")

AND

DevSolutions LLC, a California limited liability company with offices at 456 Innovation Drive, Palo Alto, CA 94301 ("Developer")

CONTRACT TERMS:
- Total Contract Value: $2,500,000 USD
- Payment Terms: Net 30 days
- Contract Duration: 18 months
- Start Date: January 15, 2024
- End Date: July 15, 2025

KEY PROVISIONS:
- Liability Cap: Limited to total contract value ($2.5M)
- Termination: Either party may terminate with 90 days written notice
- Intellectual Property: All developed IP remains with Client
- Confidentiality: 5-year confidentiality period post-contract

DELIVERABLES:
1. API Platform Development
- Complete REST API platform with authentication
- Deadline: June 1, 2024
- Acceptance: Must pass security audit and performance tests

2. Mobile Application
- iOS and Android applications
- Deadline: October 1, 2024
- Acceptance: App store approval and user acceptance testing

3. Documentation & Training
- Complete technical documentation and user training
- Deadline: December 1, 2024
- Acceptance: Training completion by 95% of users"""

# Gradio UI: two input textboxes (document text, JSON schema), Extract/Clear
# buttons, four output textboxes, clickable examples and a help section.
#
# NOTE(review): several labels below start with "π" / "ποΈ" / "π€" / "π―" /
# "π§". These look like emoji that were mis-decoded in this copy of the file;
# the original glyphs cannot be determined from here, so the text is kept
# as found -- restore the intended emoji from the upstream file.
with gr.Blocks(title="Unstructured to Structured JSON Converter", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
# π Unstructured to Structured JSON Converter

**A production-ready system for extracting structured data from unstructured text following complex JSON schemas.**

✨ **Key Features:**
- Supports unlimited schema complexity (6+ levels, 250+ fields, 500+ enums)
- Handles large documents (50+ pages, 10MB+ files)
- Dynamic resource allocation ($0.01-$5.00 based on complexity)
- Confidence-based quality assessment with human review routing

π **Performance:** 97-99% time savings vs manual processing with 85-95% accuracy
""")

    # --- Inputs: document text on the left, target schema on the right ---
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### π Input Content")
            content_input = gr.Textbox(
                label="Unstructured Text Content",
                placeholder="Paste your document content here...",
                lines=15,
                max_lines=25,
            )

        with gr.Column(scale=1):
            gr.Markdown("### ποΈ JSON Schema")
            schema_input = gr.Textbox(
                label="Target JSON Schema",
                placeholder="Paste your JSON schema here...",
                lines=15,
                max_lines=25,
                value=github_schema,  # pre-filled with the first example schema
            )

    # --- Actions ---
    with gr.Row():
        extract_btn = gr.Button("π Extract Structured Data", variant="primary", size="lg")
        clear_btn = gr.Button("ποΈ Clear", variant="secondary")

    # --- Outputs: extracted JSON (wide) next to confidence/metadata/status ---
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### π€ Extracted JSON Data")
            output_json = gr.Textbox(
                label="Structured Output",
                lines=20,
                max_lines=30,
                show_copy_button=True,
            )

        with gr.Column(scale=1):
            gr.Markdown("### π Analysis Results")
            confidence_output = gr.Textbox(label="Overall Confidence", interactive=False)
            metadata_output = gr.Textbox(
                label="Processing Metadata",
                lines=12,
                interactive=False,
            )
            status_output = gr.Textbox(label="Status", interactive=False)

    # --- Examples: each row fills both input boxes ---
    gr.Markdown("### π― Example Test Cases")
    gr.Examples(
        examples=[
            [github_content, github_schema],
            [resume_content, resume_schema],
            [email_content, email_schema],
            [contract_content, contract_schema],
        ],
        inputs=[content_input, schema_input],
        label="Click any example to load it:",
        examples_per_page=4,
    )

    gr.Markdown("""
### π§ How It Works

1. **Schema Analysis**: Analyzes complexity (depth, fields, objects) and creates optimal extraction plan
2. **Document Processing**: Handles large documents with semantic chunking and context preservation
3. **Multi-Stage Extraction**: Uses hierarchical processing with dynamic model selection
4. **Quality Assessment**: Provides confidence scores and flags uncertain fields for human review

### π Complexity Tiers

| Tier | Depth | Fields | Cost | Time | Use Case |
|------|-------|--------|------|------|----------|
| **1 (Simple)** | ≤2 levels | ≤20 | $0.01-0.05 | 5-15s | Forms, basic extraction |
| **2 (Medium)** | ≤4 levels | ≤100 | $0.08-0.25 | 15-45s | API docs, structured reports |
| **3 (Complex)** | >4 levels | >100 | $0.30-2.00 | 45-120s | Legal docs, research papers |

### π Schema Examples

**GitHub Actions** (Medium): Action metadata with inputs/outputs
**Resume/CV** (Complex): Personal profile with work history and skills
**Email Chains** (Complex): Requirements extraction from stakeholder communications
**Legal Contracts** (Complex): Contract terms, parties, and deliverables
""")

    # --- Event wiring ---
    extract_btn.click(
        fn=extract_wrapper,
        inputs=[content_input, schema_input],
        outputs=[output_json, confidence_output, metadata_output, status_output],
    )

    # Clear resets both inputs and all four outputs. The status box is
    # included so a previous run's status does not linger next to empty
    # result fields.
    cleared_fields = [
        content_input,
        schema_input,
        output_json,
        confidence_output,
        metadata_output,
        status_output,
    ]
    clear_btn.click(
        lambda: ("",) * len(cleared_fields),
        outputs=cleared_fields,
    )

# Start the Gradio server only when this file is executed as a script, not
# when it is imported.
if __name__ == "__main__":
    app.launch()