Update app.py
Browse files
app.py
CHANGED
@@ -16,39 +16,30 @@ async def extract_data(content, schema_text, progress=gr.Progress()):
|
|
16 |
progress(0.1, desc="Parsing schema...")
|
17 |
schema = json.loads(schema_text)
|
18 |
|
19 |
-
progress(0.
|
20 |
-
|
21 |
-
progress(0.5, desc="Extracting data...")
|
22 |
result = await system.extract_structured_data(content, schema)
|
23 |
|
24 |
-
progress(0.9, desc="Finalizing results...")
|
25 |
-
|
26 |
extracted_data = json.dumps(result["data"], indent=2)
|
27 |
confidence = f"{result['overall_confidence']:.1%}"
|
28 |
metadata = result["extraction_metadata"]
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
- Processing Time: {metadata['actual_processing_time']:.2f}s
|
36 |
-
- Schema Compliance: {metadata['schema_compliance']:.1%}
|
37 |
-
"""
|
38 |
|
39 |
-
review_info = ""
|
40 |
if result["review_flags"]:
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
progress(1.0, desc="Complete!")
|
45 |
|
46 |
-
|
|
|
47 |
|
48 |
except json.JSONDecodeError as e:
|
49 |
-
return "", "0%", f"
|
50 |
except Exception as e:
|
51 |
-
return "", "0%", f"
|
52 |
|
53 |
def extract_wrapper(content, schema_text):
|
54 |
return asyncio.run(extract_data(content, schema_text))
|
@@ -115,26 +106,28 @@ github_schema = """{
|
|
115 |
|
116 |
github_content = """MkDocs Publisher Action
|
117 |
|
118 |
-
|
119 |
-
It's designed to be reusable across multiple repositories.
|
120 |
|
121 |
-
|
|
|
122 |
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
- gh-
|
127 |
|
128 |
-
|
|
|
129 |
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
|
|
136 |
|
137 |
-
Branding:
|
138 |
|
139 |
resume_schema = """{
|
140 |
"type": "object",
|
@@ -146,14 +139,27 @@ resume_schema = """{
|
|
146 |
"label": {"type": "string"},
|
147 |
"email": {"type": "string"},
|
148 |
"phone": {"type": "string"},
|
149 |
-
"
|
150 |
"summary": {"type": "string"},
|
151 |
"location": {
|
152 |
"type": "object",
|
153 |
"properties": {
|
|
|
|
|
154 |
"city": {"type": "string"},
|
155 |
-
"
|
156 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
}
|
158 |
}
|
159 |
}
|
@@ -164,8 +170,8 @@ resume_schema = """{
|
|
164 |
"type": "object",
|
165 |
"properties": {
|
166 |
"name": {"type": "string"},
|
167 |
-
"position": {"type": "string"},
|
168 |
"location": {"type": "string"},
|
|
|
169 |
"startDate": {"type": "string"},
|
170 |
"endDate": {"type": "string"},
|
171 |
"highlights": {
|
@@ -184,8 +190,7 @@ resume_schema = """{
|
|
184 |
"area": {"type": "string"},
|
185 |
"studyType": {"type": "string"},
|
186 |
"startDate": {"type": "string"},
|
187 |
-
"endDate": {"type": "string"}
|
188 |
-
"score": {"type": "string"}
|
189 |
}
|
190 |
}
|
191 |
},
|
@@ -205,124 +210,114 @@ resume_schema = """{
|
|
205 |
}
|
206 |
}"""
|
207 |
|
208 |
-
resume_content = """
|
209 |
-
|
210 |
-
|
211 |
-
|
|
|
|
|
212 |
|
213 |
-
SUMMARY
|
214 |
-
Experienced
|
215 |
|
216 |
WORK EXPERIENCE
|
217 |
|
218 |
-
Senior
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
|
228 |
EDUCATION
|
229 |
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
|
234 |
-
|
235 |
-
Concentration: Artificial Intelligence | GPA: 3.9/4.0
|
236 |
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
Cloud Platforms: AWS, GCP, Azure, Kubernetes, Docker"""
|
241 |
|
242 |
-
|
243 |
"type": "object",
|
244 |
"properties": {
|
245 |
-
"
|
|
|
|
|
|
|
246 |
"type": "array",
|
247 |
"items": {
|
248 |
"type": "object",
|
249 |
"properties": {
|
250 |
-
"
|
251 |
-
"
|
252 |
-
"
|
253 |
-
"
|
254 |
}
|
255 |
}
|
256 |
},
|
257 |
-
"
|
|
|
|
|
|
|
|
|
258 |
"type": "array",
|
259 |
-
"items": {
|
260 |
-
"type": "object",
|
261 |
-
"properties": {
|
262 |
-
"id": {"type": "string"},
|
263 |
-
"description": {"type": "string"},
|
264 |
-
"priority": {"type": "string"},
|
265 |
-
"status": {"type": "string"},
|
266 |
-
"source_stakeholder": {"type": "string"}
|
267 |
-
}
|
268 |
-
}
|
269 |
},
|
270 |
-
"
|
271 |
-
"type": "array",
|
272 |
-
"items": {
|
273 |
-
"type": "object",
|
274 |
-
"properties": {
|
275 |
-
"decision": {"type": "string"},
|
276 |
-
"rationale": {"type": "string"},
|
277 |
-
"stakeholders_involved": {"type": "array", "items": {"type": "string"}},
|
278 |
-
"implementation_impact": {"type": "string"}
|
279 |
-
}
|
280 |
-
}
|
281 |
-
},
|
282 |
-
"timeline": {
|
283 |
"type": "object",
|
284 |
"properties": {
|
285 |
-
"
|
286 |
-
"
|
287 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
288 |
}
|
289 |
}
|
290 |
}
|
291 |
}"""
|
292 |
|
293 |
-
|
294 | |
295 |
-
Subject: API Rate Limiting Requirements - Final Decision
|
296 |
|
297 |
-
|
298 |
|
299 |
-
|
|
|
|
|
300 |
|
301 |
-
|
302 |
-
REQ-002: Add burst capacity of 100 requests/minute (MEDIUM priority)
|
303 |
-
REQ-003: Provide rate limit headers in API responses (HIGH priority)
|
304 |
-
REQ-004: Create rate limit monitoring dashboard (LOW priority)
|
305 |
|
306 |
-
|
307 |
-
Rationale: Redis provides persistence and can scale across multiple API instances.
|
308 |
|
309 |
-
|
310 |
-
|
311 |
-
Timeline:
|
312 |
-
- Start development: January 15, 2024
|
313 |
-
- Feature complete: February 28, 2024
|
314 |
-
- Production deployment: March 15, 2024
|
315 |
-
|
316 |
-
Let me know if you have any questions.
|
317 |
-
|
318 |
-
Best regards,
|
319 |
-
John Smith - Product Manager, Acme Corp
|
320 |
-
Sarah Johnson - Lead Engineer, TechCorp
|
321 |
-
Mike Brown - DevOps Lead, Acme Corp"""
|
322 |
|
323 |
contract_schema = """{
|
324 |
"type": "object",
|
325 |
"properties": {
|
|
|
326 |
"parties": {
|
327 |
"type": "array",
|
328 |
"items": {
|
@@ -330,179 +325,127 @@ contract_schema = """{
|
|
330 |
"properties": {
|
331 |
"name": {"type": "string"},
|
332 |
"type": {"type": "string"},
|
333 |
-
"address": {"type": "string"}
|
334 |
-
"role": {"type": "string"}
|
335 |
}
|
336 |
}
|
337 |
},
|
338 |
-
"
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
"currency": {"type": "string"},
|
343 |
-
"payment_terms": {"type": "string"},
|
344 |
-
"contract_duration": {"type": "string"},
|
345 |
-
"start_date": {"type": "string"},
|
346 |
-
"end_date": {"type": "string"}
|
347 |
-
}
|
348 |
-
},
|
349 |
-
"key_terms": {
|
350 |
-
"type": "object",
|
351 |
-
"properties": {
|
352 |
-
"liability_cap": {"type": "string"},
|
353 |
-
"termination_clause": {"type": "string"},
|
354 |
-
"intellectual_property": {"type": "string"},
|
355 |
-
"confidentiality_period": {"type": "string"}
|
356 |
-
}
|
357 |
-
},
|
358 |
"deliverables": {
|
359 |
"type": "array",
|
360 |
"items": {
|
361 |
"type": "object",
|
362 |
"properties": {
|
363 |
"name": {"type": "string"},
|
364 |
-
"description": {"type": "string"},
|
365 |
"deadline": {"type": "string"},
|
366 |
-
"
|
367 |
}
|
368 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
369 |
}
|
370 |
}
|
371 |
}"""
|
372 |
|
373 |
contract_content = """SOFTWARE DEVELOPMENT AGREEMENT
|
374 |
|
375 |
-
This
|
|
|
|
|
376 |
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
DevSolutions LLC, a California limited liability company with offices at 456 Innovation Drive, Palo Alto, CA 94301 ("Developer")
|
382 |
-
|
383 |
-
CONTRACT TERMS:
|
384 |
-
- Total Contract Value: $2,500,000 USD
|
385 |
-
- Payment Terms: Net 30 days
|
386 |
-
- Contract Duration: 18 months
|
387 |
-
- Start Date: January 15, 2024
|
388 |
-
- End Date: July 15, 2025
|
389 |
-
|
390 |
-
KEY PROVISIONS:
|
391 |
-
- Liability Cap: Limited to total contract value ($2.5M)
|
392 |
-
- Termination: Either party may terminate with 90 days written notice
|
393 |
-
- Intellectual Property: All developed IP remains with Client
|
394 |
-
- Confidentiality: 5-year confidentiality period post-contract
|
395 |
|
396 |
DELIVERABLES:
|
397 |
-
1.
|
398 |
-
- Complete
|
399 |
-
- Deadline:
|
400 |
-
- Acceptance: Must pass security audit and performance tests
|
401 |
|
402 |
-
2. Mobile
|
403 |
- iOS and Android applications
|
404 |
-
- Deadline:
|
405 |
-
- Acceptance: App store approval and user acceptance testing
|
406 |
|
407 |
-
3.
|
408 |
-
-
|
409 |
-
- Deadline:
|
410 |
-
- Acceptance: Training completion by 95% of users"""
|
411 |
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
- Dynamic resource allocation ($0.01-$5.00 based on complexity)
|
422 |
-
- Confidence-based quality assessment with human review routing
|
423 |
-
|
424 |
-
**Performance:** 97-99% time savings vs manual processing with 85-95% accuracy
|
425 |
-
""")
|
426 |
|
427 |
with gr.Row():
|
428 |
-
with gr.Column(
|
429 |
-
gr.Markdown("###
|
430 |
content_input = gr.Textbox(
|
431 |
-
label="
|
432 |
-
placeholder="
|
433 |
-
lines=
|
434 |
-
max_lines=
|
435 |
)
|
436 |
|
437 |
-
with gr.Column(
|
438 |
-
gr.Markdown("###
|
439 |
schema_input = gr.Textbox(
|
440 |
-
label="Target
|
441 |
-
placeholder="
|
442 |
-
lines=
|
443 |
-
max_lines=
|
444 |
value=github_schema
|
445 |
)
|
446 |
|
447 |
with gr.Row():
|
448 |
-
extract_btn = gr.Button("
|
449 |
-
clear_btn = gr.Button("
|
450 |
|
451 |
with gr.Row():
|
452 |
with gr.Column(scale=2):
|
453 |
-
gr.Markdown("###
|
454 |
output_json = gr.Textbox(
|
455 |
-
label="
|
456 |
-
lines=
|
457 |
-
max_lines=30,
|
458 |
show_copy_button=True
|
459 |
)
|
460 |
|
461 |
with gr.Column(scale=1):
|
462 |
-
gr.Markdown("###
|
463 |
-
confidence_output = gr.Textbox(label="
|
464 |
-
metadata_output = gr.Textbox(
|
465 |
-
|
466 |
-
lines=12,
|
467 |
-
interactive=False
|
468 |
-
)
|
469 |
-
status_output = gr.Textbox(label="Status", interactive=False)
|
470 |
|
471 |
-
gr.Markdown("###
|
472 |
gr.Examples(
|
473 |
examples=[
|
474 |
[github_content, github_schema],
|
475 |
[resume_content, resume_schema],
|
476 |
-
[
|
477 |
[contract_content, contract_schema]
|
478 |
],
|
479 |
inputs=[content_input, schema_input],
|
480 |
-
label="
|
481 |
-
examples_per_page=4
|
482 |
)
|
483 |
|
484 |
gr.Markdown("""
|
485 |
-
###
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
4. **Quality Assessment**: Provides confidence scores and flags uncertain fields for human review
|
491 |
-
|
492 |
-
### Complexity Tiers
|
493 |
-
|
494 |
-
| Tier | Depth | Fields | Cost | Time | Use Case |
|
495 |
-
|------|-------|--------|------|------|----------|
|
496 |
-
| **1 (Simple)** | ≤2 levels | ≤20 | $0.01-0.05 | 5-15s | Forms, basic extraction |
|
497 |
-
| **2 (Medium)** | ≤4 levels | ≤100 | $0.08-0.25 | 15-45s | API docs, structured reports |
|
498 |
-
| **3 (Complex)** | >4 levels | >100 | $0.30-2.00 | 45-120s | Legal docs, research papers |
|
499 |
-
|
500 |
-
### Schema Examples
|
501 |
-
|
502 |
-
**GitHub Actions** (Medium): Action metadata with inputs/outputs
|
503 |
-
**Resume/CV** (Complex): Personal profile with work history and skills
|
504 |
-
**Email Chains** (Complex): Requirements extraction from stakeholder communications
|
505 |
-
**Legal Contracts** (Complex): Contract terms, parties, and deliverables
|
506 |
""")
|
507 |
|
508 |
extract_btn.click(
|
|
|
16 |
progress(0.1, desc="Parsing schema...")
|
17 |
schema = json.loads(schema_text)
|
18 |
|
19 |
+
progress(0.5, desc="Processing...")
|
|
|
|
|
20 |
result = await system.extract_structured_data(content, schema)
|
21 |
|
|
|
|
|
22 |
extracted_data = json.dumps(result["data"], indent=2)
|
23 |
confidence = f"{result['overall_confidence']:.1%}"
|
24 |
metadata = result["extraction_metadata"]
|
25 |
|
26 |
+
analysis = f"""Complexity Tier: {metadata['complexity_tier']}
|
27 |
+
Processing Stages: {metadata['stages_executed']}
|
28 |
+
Estimated Cost: ${metadata['estimated_cost']:.3f}
|
29 |
+
Processing Time: {metadata['actual_processing_time']:.2f}s
|
30 |
+
Schema Compliance: {metadata['schema_compliance']:.1%}"""
|
|
|
|
|
|
|
31 |
|
|
|
32 |
if result["review_flags"]:
|
33 |
+
analysis += f"\nReview Flags: {', '.join(result['review_flags'])}"
|
34 |
+
analysis += f"\nReview Time: {metadata['recommended_review_time']} minutes"
|
|
|
|
|
35 |
|
36 |
+
progress(1.0, desc="Complete")
|
37 |
+
return extracted_data, confidence, analysis, "Success"
|
38 |
|
39 |
except json.JSONDecodeError as e:
|
40 |
+
return "", "0%", f"Invalid JSON Schema: {str(e)}", "Schema Error"
|
41 |
except Exception as e:
|
42 |
+
return "", "0%", f"Extraction Error: {str(e)}", "Error"
|
43 |
|
44 |
def extract_wrapper(content, schema_text):
|
45 |
return asyncio.run(extract_data(content, schema_text))
|
|
|
106 |
|
107 |
github_content = """MkDocs Publisher Action
|
108 |
|
109 |
+
I keep repeating the steps to build and deploy our MkDocs documentation sites to GitHub Pages across different repos. Let's create a reusable composite action to handle this.
|
|
|
110 |
|
111 |
+
Action Name: MkDocs Publisher
|
112 |
+
Purpose: A simple action to build an MkDocs site and push it to the gh-pages branch. Should be easy to use. Author should be listed as 'DevRel Team'.
|
113 |
|
114 |
+
Inputs Needed:
|
115 |
+
python-version: We need to specify the Python version for setting up the environment. Users should be able to choose. Let's make this optional and default it to 3.11. Description: 'The version of Python to set up for building.'
|
116 |
+
requirements-file: Users might have different requirements files (e.g., requirements.txt, docs/requirements.txt). This input should specify the path. It's required. Description: 'Path to the Python requirements file'.
|
117 |
+
gh-token: The GitHub token for pushing to gh-pages. This is absolutely required. Description: 'GitHub token for deployment.'
|
118 |
|
119 |
+
Outputs:
|
120 |
+
The action needs to output the URL where the site was deployed. Let's call this output page-url. Its description should be 'The URL of the deployed GitHub Pages site.'
|
121 |
|
122 |
+
How it Runs:
|
123 |
+
This will be a composite action (using: composite). Here are the steps involved:
|
124 |
+
Checkout Code: First, we need the repository code. Use the standard actions/checkout@v4.
|
125 |
+
Setup Python: Next, set up the Python environment. Use actions/setup-python@v5.
|
126 |
+
Install Dependencies: Run a command to install the Python packages. The command is pip install -r requirements.txt. Execute this using the bash shell.
|
127 |
+
Build Site: Run the command mkdocs build. Use bash for this too.
|
128 |
+
Deploy to Pages: Use peaceiris/actions-gh-pages@v3 for deployment.
|
129 |
|
130 |
+
Branding: For the marketplace look, let's use the color blue and the book-open icon."""
|
131 |
|
132 |
resume_schema = """{
|
133 |
"type": "object",
|
|
|
139 |
"label": {"type": "string"},
|
140 |
"email": {"type": "string"},
|
141 |
"phone": {"type": "string"},
|
142 |
+
"url": {"type": "string"},
|
143 |
"summary": {"type": "string"},
|
144 |
"location": {
|
145 |
"type": "object",
|
146 |
"properties": {
|
147 |
+
"address": {"type": "string"},
|
148 |
+
"postalCode": {"type": "string"},
|
149 |
"city": {"type": "string"},
|
150 |
+
"countryCode": {"type": "string"},
|
151 |
+
"region": {"type": "string"}
|
152 |
+
}
|
153 |
+
},
|
154 |
+
"profiles": {
|
155 |
+
"type": "array",
|
156 |
+
"items": {
|
157 |
+
"type": "object",
|
158 |
+
"properties": {
|
159 |
+
"network": {"type": "string"},
|
160 |
+
"username": {"type": "string"},
|
161 |
+
"url": {"type": "string"}
|
162 |
+
}
|
163 |
}
|
164 |
}
|
165 |
}
|
|
|
170 |
"type": "object",
|
171 |
"properties": {
|
172 |
"name": {"type": "string"},
|
|
|
173 |
"location": {"type": "string"},
|
174 |
+
"position": {"type": "string"},
|
175 |
"startDate": {"type": "string"},
|
176 |
"endDate": {"type": "string"},
|
177 |
"highlights": {
|
|
|
190 |
"area": {"type": "string"},
|
191 |
"studyType": {"type": "string"},
|
192 |
"startDate": {"type": "string"},
|
193 |
+
"endDate": {"type": "string"}
|
|
|
194 |
}
|
195 |
}
|
196 |
},
|
|
|
210 |
}
|
211 |
}"""
|
212 |
|
213 |
+
resume_content = """John Doe
|
214 |
+
Software Engineer
|
215 |
+
Email: [email protected]
|
216 |
+
Phone: +1-555-0123
|
217 |
+
Address: 123 Main St, San Francisco, CA 94105, US
|
218 |
+
Website: https://johndoe.dev
|
219 |
|
220 |
+
PROFESSIONAL SUMMARY
|
221 |
+
Experienced software engineer with 5+ years developing web applications and distributed systems. Skilled in full-stack development with expertise in Python, JavaScript, and cloud technologies.
|
222 |
|
223 |
WORK EXPERIENCE
|
224 |
|
225 |
+
Senior Software Engineer | TechCorp Inc | San Francisco, CA | 2022 - Present
|
226 |
+
- Designed and implemented microservices architecture serving 1M+ users
|
227 |
+
- Led development of real-time data processing pipeline using Apache Kafka
|
228 |
+
- Reduced system latency by 40% through performance optimization
|
229 |
|
230 |
+
Software Engineer | StartupXYZ | Palo Alto, CA | 2020 - 2022
|
231 |
+
- Built responsive web applications using React and Node.js
|
232 |
+
- Implemented CI/CD pipelines resulting in 50% faster deployment cycles
|
233 |
+
- Collaborated with cross-functional teams on product development
|
234 |
|
235 |
EDUCATION
|
236 |
|
237 |
+
Bachelor of Science in Computer Science | Stanford University | 2016 - 2020
|
238 |
+
- Relevant Coursework: Data Structures, Algorithms, Database Systems
|
239 |
+
- Senior Project: Machine Learning Platform for Predictive Analytics
|
240 |
|
241 |
+
TECHNICAL SKILLS
|
|
|
242 |
|
243 |
+
Programming Languages: Python, JavaScript, Java, Go, SQL
|
244 |
+
Web Technologies: React, Node.js, HTML/CSS, REST APIs, GraphQL
|
245 |
+
Cloud & DevOps: AWS, Docker, Kubernetes, Jenkins, Git"""
|
|
|
246 |
|
247 |
+
citation_schema = """{
|
248 |
"type": "object",
|
249 |
"properties": {
|
250 |
+
"cff-version": {"type": "string"},
|
251 |
+
"message": {"type": "string"},
|
252 |
+
"title": {"type": "string"},
|
253 |
+
"authors": {
|
254 |
"type": "array",
|
255 |
"items": {
|
256 |
"type": "object",
|
257 |
"properties": {
|
258 |
+
"given-names": {"type": "string"},
|
259 |
+
"family-names": {"type": "string"},
|
260 |
+
"affiliation": {"type": "string"},
|
261 |
+
"orcid": {"type": "string"}
|
262 |
}
|
263 |
}
|
264 |
},
|
265 |
+
"type": {"type": "string"},
|
266 |
+
"date-published": {"type": "string"},
|
267 |
+
"url": {"type": "string"},
|
268 |
+
"abstract": {"type": "string"},
|
269 |
+
"keywords": {
|
270 |
"type": "array",
|
271 |
+
"items": {"type": "string"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
272 |
},
|
273 |
+
"preferred-citation": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
274 |
"type": "object",
|
275 |
"properties": {
|
276 |
+
"type": {"type": "string"},
|
277 |
+
"title": {"type": "string"},
|
278 |
+
"authors": {
|
279 |
+
"type": "array",
|
280 |
+
"items": {
|
281 |
+
"type": "object",
|
282 |
+
"properties": {
|
283 |
+
"given-names": {"type": "string"},
|
284 |
+
"family-names": {"type": "string"}
|
285 |
+
}
|
286 |
+
}
|
287 |
+
},
|
288 |
+
"collection-title": {"type": "string"},
|
289 |
+
"volume": {"type": "integer"},
|
290 |
+
"year": {"type": "integer"},
|
291 |
+
"publisher": {
|
292 |
+
"type": "object",
|
293 |
+
"properties": {
|
294 |
+
"name": {"type": "string"}
|
295 |
+
}
|
296 |
+
}
|
297 |
}
|
298 |
}
|
299 |
}
|
300 |
}"""
|
301 |
|
302 |
+
citation_content = """Title: Attention Is All You Need
|
303 |
+
Authors: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Łukasz Kaiser, Illia Polosukhin
|
|
|
304 |
|
305 |
+
This paper introduces the Transformer, a novel neural network architecture based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.
|
306 |
|
307 |
+
Published in: Advances in Neural Information Processing Systems, Volume 30, 2017
|
308 |
+
Publisher: Curran Associates, Inc.
|
309 |
+
URL: https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
|
310 |
|
311 |
+
The Transformer uses multi-head self-attention to draw global dependencies between input and output. The model achieves state-of-the-art results on machine translation tasks while being more parallelizable and requiring significantly less time to train.
|
|
|
|
|
|
|
312 |
|
313 |
+
Keywords: attention mechanism, transformer, neural networks, machine translation, self-attention
|
|
|
314 |
|
315 |
+
This work has become foundational for modern NLP models including BERT, GPT, and T5."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
316 |
|
317 |
contract_schema = """{
|
318 |
"type": "object",
|
319 |
"properties": {
|
320 |
+
"contract_type": {"type": "string"},
|
321 |
"parties": {
|
322 |
"type": "array",
|
323 |
"items": {
|
|
|
325 |
"properties": {
|
326 |
"name": {"type": "string"},
|
327 |
"type": {"type": "string"},
|
328 |
+
"address": {"type": "string"}
|
|
|
329 |
}
|
330 |
}
|
331 |
},
|
332 |
+
"contract_value": {"type": "string"},
|
333 |
+
"payment_terms": {"type": "string"},
|
334 |
+
"duration": {"type": "string"},
|
335 |
+
"start_date": {"type": "string"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
336 |
"deliverables": {
|
337 |
"type": "array",
|
338 |
"items": {
|
339 |
"type": "object",
|
340 |
"properties": {
|
341 |
"name": {"type": "string"},
|
|
|
342 |
"deadline": {"type": "string"},
|
343 |
+
"description": {"type": "string"}
|
344 |
}
|
345 |
}
|
346 |
+
},
|
347 |
+
"key_terms": {
|
348 |
+
"type": "object",
|
349 |
+
"properties": {
|
350 |
+
"liability_cap": {"type": "string"},
|
351 |
+
"termination_notice": {"type": "string"},
|
352 |
+
"intellectual_property": {"type": "string"}
|
353 |
+
}
|
354 |
}
|
355 |
}
|
356 |
}"""
|
357 |
|
358 |
contract_content = """SOFTWARE DEVELOPMENT AGREEMENT
|
359 |
|
360 |
+
This Agreement is made between:
|
361 |
+
Client: TechCorp Inc., 123 Business Ave, San Francisco, CA 94105
|
362 |
+
Contractor: DevStudio LLC, 456 Developer St, Austin, TX 78701
|
363 |
|
364 |
+
Contract Value: $150,000
|
365 |
+
Payment Terms: 50% upfront, 50% upon completion
|
366 |
+
Duration: 6 months
|
367 |
+
Start Date: January 1, 2024
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
|
369 |
DELIVERABLES:
|
370 |
+
1. Web Application Development
|
371 |
+
- Complete e-commerce platform with user authentication
|
372 |
+
- Deadline: March 15, 2024
|
|
|
373 |
|
374 |
+
2. Mobile App Development
|
375 |
- iOS and Android applications
|
376 |
+
- Deadline: May 1, 2024
|
|
|
377 |
|
378 |
+
3. API Integration
|
379 |
+
- Third-party payment processing integration
|
380 |
+
- Deadline: April 15, 2024
|
|
|
381 |
|
382 |
+
KEY TERMS:
|
383 |
+
- Liability is capped at the total contract value
|
384 |
+
- Either party may terminate with 30 days written notice
|
385 |
+
- All intellectual property developed under this agreement belongs to the Client
|
386 |
+
- Contractor agrees to maintain confidentiality of all proprietary information"""
|
387 |
+
|
388 |
+
with gr.Blocks(title="Unstructured to Structured Converter", theme=gr.themes.Default()) as app:
|
389 |
+
gr.Markdown("# Unstructured to Structured JSON Converter")
|
390 |
+
gr.Markdown("Convert any unstructured text into structured JSON following complex schemas")
|
|
|
|
|
|
|
|
|
|
|
391 |
|
392 |
with gr.Row():
|
393 |
+
with gr.Column():
|
394 |
+
gr.Markdown("### Input Content")
|
395 |
content_input = gr.Textbox(
|
396 |
+
label="Document Content",
|
397 |
+
placeholder="Enter your unstructured text here...",
|
398 |
+
lines=12,
|
399 |
+
max_lines=20
|
400 |
)
|
401 |
|
402 |
+
with gr.Column():
|
403 |
+
gr.Markdown("### JSON Schema")
|
404 |
schema_input = gr.Textbox(
|
405 |
+
label="Target Schema",
|
406 |
+
placeholder="Enter your JSON schema here...",
|
407 |
+
lines=12,
|
408 |
+
max_lines=20,
|
409 |
value=github_schema
|
410 |
)
|
411 |
|
412 |
with gr.Row():
|
413 |
+
extract_btn = gr.Button("Extract Data", variant="primary")
|
414 |
+
clear_btn = gr.Button("Clear")
|
415 |
|
416 |
with gr.Row():
|
417 |
with gr.Column(scale=2):
|
418 |
+
gr.Markdown("### Extracted Data")
|
419 |
output_json = gr.Textbox(
|
420 |
+
label="JSON Output",
|
421 |
+
lines=15,
|
|
|
422 |
show_copy_button=True
|
423 |
)
|
424 |
|
425 |
with gr.Column(scale=1):
|
426 |
+
gr.Markdown("### Results")
|
427 |
+
confidence_output = gr.Textbox(label="Confidence")
|
428 |
+
metadata_output = gr.Textbox(label="Analysis", lines=8)
|
429 |
+
status_output = gr.Textbox(label="Status")
|
|
|
|
|
|
|
|
|
430 |
|
431 |
+
gr.Markdown("### Test Cases")
|
432 |
gr.Examples(
|
433 |
examples=[
|
434 |
[github_content, github_schema],
|
435 |
[resume_content, resume_schema],
|
436 |
+
[citation_content, citation_schema],
|
437 |
[contract_content, contract_schema]
|
438 |
],
|
439 |
inputs=[content_input, schema_input],
|
440 |
+
label="Select a test case:"
|
|
|
441 |
)
|
442 |
|
443 |
gr.Markdown("""
|
444 |
+
### System Features
|
445 |
+
- **Schema Complexity**: Supports 6+ levels nesting, 250+ fields, unlimited enums
|
446 |
+
- **Document Size**: Handles 50+ page documents and 10MB+ files
|
447 |
+
- **Dynamic Scaling**: Cost ranges from $0.01 to $5.00 based on complexity
|
448 |
+
- **Quality Assurance**: Confidence scoring with human review routing
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
449 |
""")
|
450 |
|
451 |
extract_btn.click(
|