arjunanand13 committed
Commit 857328d · verified · 1 Parent(s): 74609eb

Update app.py

Files changed (1):
  1. app.py +184 -241

app.py CHANGED
@@ -16,39 +16,30 @@ async def extract_data(content, schema_text, progress=gr.Progress()):
  progress(0.1, desc="Parsing schema...")
  schema = json.loads(schema_text)

- progress(0.3, desc="Analyzing complexity...")
-
- progress(0.5, desc="Extracting data...")
+ progress(0.5, desc="Processing...")
  result = await system.extract_structured_data(content, schema)

- progress(0.9, desc="Finalizing results...")
-
  extracted_data = json.dumps(result["data"], indent=2)
  confidence = f"{result['overall_confidence']:.1%}"
  metadata = result["extraction_metadata"]

- complexity_info = f"""
- **Schema Analysis:**
- - Complexity Tier: {metadata['complexity_tier']}
- - Processing Stages: {metadata['stages_executed']}
- - Estimated Cost: ${metadata['estimated_cost']:.3f}
- - Processing Time: {metadata['actual_processing_time']:.2f}s
- - Schema Compliance: {metadata['schema_compliance']:.1%}
- """
+ analysis = f"""Complexity Tier: {metadata['complexity_tier']}
+ Processing Stages: {metadata['stages_executed']}
+ Estimated Cost: ${metadata['estimated_cost']:.3f}
+ Processing Time: {metadata['actual_processing_time']:.2f}s
+ Schema Compliance: {metadata['schema_compliance']:.1%}"""

- review_info = ""
  if result["review_flags"]:
- review_info = f"\n**Review Required:** {', '.join(result['review_flags'])}"
- review_info += f"\nEstimated Review Time: {metadata['recommended_review_time']} minutes"
+ analysis += f"\nReview Flags: {', '.join(result['review_flags'])}"
+ analysis += f"\nReview Time: {metadata['recommended_review_time']} minutes"
-
- progress(1.0, desc="Complete!")

- return extracted_data, confidence, complexity_info + review_info, "✅ Success"
+ progress(1.0, desc="Complete")
+ return extracted_data, confidence, analysis, "Success"

  except json.JSONDecodeError as e:
- return "", "0%", f"❌ Invalid JSON Schema: {str(e)}", "❌ Schema Error"
+ return "", "0%", f"Invalid JSON Schema: {str(e)}", "Schema Error"
  except Exception as e:
- return "", "0%", f"❌ Extraction Error: {str(e)}", "❌ Error"
+ return "", "0%", f"Extraction Error: {str(e)}", "Error"

  def extract_wrapper(content, schema_text):
  return asyncio.run(extract_data(content, schema_text))
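For reference, a minimal sketch (not part of this commit) of driving the synchronous wrapper directly, for example from a test script. The tiny schema and sample text below are hypothetical, and the module-level system object that extract_data relies on is assumed to be initialised elsewhere in app.py.

# Sketch only: calling the synchronous wrapper outside the UI.
# The sample schema and content are hypothetical; `system` must already be set up by app.py.
sample_schema = """{
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "email": {"type": "string"}
    }
}"""
sample_content = "Contact: Jane Example, jane@example.com"

extracted, confidence, analysis, status = extract_wrapper(sample_content, sample_schema)
print(status)      # "Success", "Schema Error", or "Error"
print(confidence)  # e.g. "92.5%"
print(extracted)   # JSON string produced by the extractor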
@@ -115,26 +106,28 @@ github_schema = """{

  github_content = """MkDocs Publisher Action

- This is a composite action that builds an MkDocs documentation site and deploys it to GitHub Pages.
- It's designed to be reusable across multiple repositories.
-
- Author: DevRel Team
-
- The action requires:
- - python-version: Python version to use (default: 3.11)
- - requirements-file: Path to requirements file (required)
- - gh-token: GitHub token for deployment (required)
-
- The action outputs the URL where the site was deployed.
-
- The action runs these steps:
- 1. Checkout the repository code using actions/checkout@v4
- 2. Setup Python environment using actions/setup-python@v5
- 3. Install dependencies: pip install -r requirements.txt
- 4. Build the MkDocs site: mkdocs build
- 5. Deploy to GitHub Pages using peaceiris/actions-gh-pages@v3
-
- Branding: Use blue color with book-open icon."""
+ I keep repeating the steps to build and deploy our MkDocs documentation sites to GitHub Pages across different repos. Let's create a reusable composite action to handle this.
+
+ Action Name: MkDocs Publisher
+ Purpose: A simple action to build an MkDocs site and push it to the gh-pages branch. Should be easy to use. Author should be listed as 'DevRel Team'.
+
+ Inputs Needed:
+ python-version: We need to specify the Python version for setting up the environment. Users should be able to choose. Let's make this optional and default it to 3.11. Description: 'The version of Python to set up for building.'
+ requirements-file: Users might have different requirements files (e.g., requirements.txt, docs/requirements.txt). This input should specify the path. It's required. Description: 'Path to the Python requirements file'.
+ gh-token: The GitHub token for pushing to gh-pages. This is absolutely required. Description: 'GitHub token for deployment.'
+
+ Outputs:
+ The action needs to output the URL where the site was deployed. Let's call this output page-url. Its description should be 'The URL of the deployed GitHub Pages site.'
+
+ How it Runs:
+ This will be a composite action (using: composite). Here are the steps involved:
+ Checkout Code: First, we need the repository code. Use the standard actions/checkout@v4.
+ Setup Python: Next, set up the Python environment. Use actions/setup-python@v5.
+ Install Dependencies: Run a command to install the Python packages. The command is pip install -r requirements.txt. Execute this using the bash shell.
+ Build Site: Run the command mkdocs build. Use bash for this too.
+ Deploy to Pages: Use peaceiris/actions-gh-pages@v3 for deployment.
+
+ Branding: For the marketplace look, let's use the color blue and the book-open icon."""

  resume_schema = """{
  "type": "object",
@@ -146,14 +139,27 @@ resume_schema = """{
  "label": {"type": "string"},
  "email": {"type": "string"},
  "phone": {"type": "string"},
- "website": {"type": "string"},
+ "url": {"type": "string"},
  "summary": {"type": "string"},
  "location": {
  "type": "object",
  "properties": {
+ "address": {"type": "string"},
+ "postalCode": {"type": "string"},
  "city": {"type": "string"},
- "region": {"type": "string"},
- "countryCode": {"type": "string"}
+ "countryCode": {"type": "string"},
+ "region": {"type": "string"}
+ }
+ },
+ "profiles": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "network": {"type": "string"},
+ "username": {"type": "string"},
+ "url": {"type": "string"}
+ }
  }
  }
  }
@@ -164,8 +170,8 @@ resume_schema = """{
  "type": "object",
  "properties": {
  "name": {"type": "string"},
- "position": {"type": "string"},
  "location": {"type": "string"},
+ "position": {"type": "string"},
  "startDate": {"type": "string"},
  "endDate": {"type": "string"},
  "highlights": {
@@ -184,8 +190,7 @@ resume_schema = """{
  "area": {"type": "string"},
  "studyType": {"type": "string"},
  "startDate": {"type": "string"},
- "endDate": {"type": "string"},
- "score": {"type": "string"}
+ "endDate": {"type": "string"}
  }
  }
  },
@@ -205,124 +210,114 @@
  }
  }"""

- resume_content = """Sarah Chen - Senior AI Research Scientist
- Email: [email protected] | Phone: +1-555-0123
- Location: Palo Alto, California, United States
- Website: https://sarahchen.ai
-
- SUMMARY
- Experienced AI research scientist with 8+ years in machine learning, deep learning, and natural language processing. Led teams that developed production ML systems serving millions of users.
-
+ resume_content = """John Doe
+ Software Engineer
+ Phone: +1-555-0123
+ Address: 123 Main St, San Francisco, CA 94105, US
+ Website: https://johndoe.dev
+
+ PROFESSIONAL SUMMARY
+ Experienced software engineer with 5+ years developing web applications and distributed systems. Skilled in full-stack development with expertise in Python, JavaScript, and cloud technologies.
+
  WORK EXPERIENCE

- Senior AI Research Scientist | OpenAI | 2021 - Present | San Francisco, CA
- • Led development of GPT-4 training infrastructure, improving training efficiency by 40%
- • Designed novel attention mechanisms for transformer architectures
- • Managed team of 12 researchers across multiple ML projects
-
- Machine Learning Engineer | Google Brain | 2019 - 2021 | Mountain View, CA
- • Developed recommendation systems serving 500M+ users daily
- • Implemented distributed training frameworks for large-scale models
- • Reduced model inference latency by 60% through optimization techniques
-
+ Senior Software Engineer | TechCorp Inc | San Francisco, CA | 2022 - Present
+ - Designed and implemented microservices architecture serving 1M+ users
+ - Led development of real-time data processing pipeline using Apache Kafka
+ - Reduced system latency by 40% through performance optimization
+
+ Software Engineer | StartupXYZ | Palo Alto, CA | 2020 - 2022
+ - Built responsive web applications using React and Node.js
+ - Implemented CI/CD pipelines resulting in 50% faster deployment cycles
+ - Collaborated with cross-functional teams on product development
+
  EDUCATION

- Ph.D. Computer Science | Stanford University | 2013 - 2017 | Stanford, CA
- Dissertation: "Efficient Training of Large-Scale Neural Networks"
- GPA: 3.95/4.0
-
- M.S. Computer Science | MIT | 2011 - 2013 | Cambridge, MA
- Concentration: Artificial Intelligence | GPA: 3.9/4.0
-
- SKILLS
- Programming: Python, C++, JavaScript, CUDA, PyTorch, TensorFlow
- Machine Learning: Deep Learning, NLP, Computer Vision, Reinforcement Learning
- Cloud Platforms: AWS, GCP, Azure, Kubernetes, Docker"""
-
- email_schema = """{
+ Bachelor of Science in Computer Science | Stanford University | 2016 - 2020
+ - Relevant Coursework: Data Structures, Algorithms, Database Systems
+ - Senior Project: Machine Learning Platform for Predictive Analytics
+
+ TECHNICAL SKILLS
+
+ Programming Languages: Python, JavaScript, Java, Go, SQL
+ Web Technologies: React, Node.js, HTML/CSS, REST APIs, GraphQL
+ Cloud & DevOps: AWS, Docker, Kubernetes, Jenkins, Git"""
+
+ citation_schema = """{
  "type": "object",
  "properties": {
- "participants": {
+ "cff-version": {"type": "string"},
+ "message": {"type": "string"},
+ "title": {"type": "string"},
+ "authors": {
  "type": "array",
  "items": {
  "type": "object",
  "properties": {
- "name": {"type": "string"},
- "email": {"type": "string"},
- "role": {"type": "string"},
- "organization": {"type": "string"}
+ "given-names": {"type": "string"},
+ "family-names": {"type": "string"},
+ "affiliation": {"type": "string"},
+ "orcid": {"type": "string"}
  }
  }
  },
- "requirements": {
+ "type": {"type": "string"},
+ "date-published": {"type": "string"},
+ "url": {"type": "string"},
+ "abstract": {"type": "string"},
+ "keywords": {
  "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "id": {"type": "string"},
- "description": {"type": "string"},
- "priority": {"type": "string"},
- "status": {"type": "string"},
- "source_stakeholder": {"type": "string"}
- }
- }
+ "items": {"type": "string"}
  },
- "decisions": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "decision": {"type": "string"},
- "rationale": {"type": "string"},
- "stakeholders_involved": {"type": "array", "items": {"type": "string"}},
- "implementation_impact": {"type": "string"}
- }
- }
- },
- "timeline": {
+ "preferred-citation": {
  "type": "object",
  "properties": {
- "start_date": {"type": "string"},
- "key_milestones": {"type": "array", "items": {"type": "string"}},
- "final_deadline": {"type": "string"}
+ "type": {"type": "string"},
+ "title": {"type": "string"},
+ "authors": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "given-names": {"type": "string"},
+ "family-names": {"type": "string"}
+ }
+ }
+ },
+ "collection-title": {"type": "string"},
+ "volume": {"type": "integer"},
+ "year": {"type": "integer"},
+ "publisher": {
+ "type": "object",
+ "properties": {
+ "name": {"type": "string"}
+ }
+ }
  }
  }
  }
  }"""

- email_content = """From: [email protected]
- Subject: API Rate Limiting Requirements - Final Decision
-
- Hi team,
-
- After our discussion yesterday, I wanted to confirm the final requirements for the API rate limiting feature:
-
- REQ-001: Implement per-user rate limiting at 1000 requests/hour (HIGH priority)
- REQ-002: Add burst capacity of 100 requests/minute (MEDIUM priority)
- REQ-003: Provide rate limit headers in API responses (HIGH priority)
- REQ-004: Create rate limit monitoring dashboard (LOW priority)
-
- Decision: We'll use Redis for rate limiting storage instead of in-memory due to scalability concerns raised by Mike.
- Rationale: Redis provides persistence and can scale across multiple API instances.
-
- Implementation impact: Will require Redis infrastructure setup but provides better long-term scalability.
-
- Timeline:
- - Start development: January 15, 2024
- - Feature complete: February 28, 2024
- - Production deployment: March 15, 2024
-
- Let me know if you have any questions.
-
- Best regards,
- John Smith - Product Manager, Acme Corp
- Sarah Johnson - Lead Engineer, TechCorp
- Mike Brown - DevOps Lead, Acme Corp"""
+ citation_content = """Title: Attention Is All You Need
+ Authors: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Łukasz Kaiser, Illia Polosukhin
+
+ This paper introduces the Transformer, a novel neural network architecture based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.
+
+ Published in: Advances in Neural Information Processing Systems, Volume 30, 2017
+ Publisher: Curran Associates, Inc.
+ URL: https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
+
+ The Transformer uses multi-head self-attention to draw global dependencies between input and output. The model achieves state-of-the-art results on machine translation tasks while being more parallelizable and requiring significantly less time to train.
+
+ Keywords: attention mechanism, transformer, neural networks, machine translation, self-attention
+
+ This work has become foundational for modern NLP models including BERT, GPT, and T5."""

  contract_schema = """{
  "type": "object",
  "properties": {
+ "contract_type": {"type": "string"},
  "parties": {
  "type": "array",
  "items": {
@@ -330,179 +325,127 @@ contract_schema = """{
  "properties": {
  "name": {"type": "string"},
  "type": {"type": "string"},
- "address": {"type": "string"},
- "role": {"type": "string"}
+ "address": {"type": "string"}
  }
  }
  },
- "contract_details": {
- "type": "object",
- "properties": {
- "contract_value": {"type": "string"},
- "currency": {"type": "string"},
- "payment_terms": {"type": "string"},
- "contract_duration": {"type": "string"},
- "start_date": {"type": "string"},
- "end_date": {"type": "string"}
- }
- },
- "key_terms": {
- "type": "object",
- "properties": {
- "liability_cap": {"type": "string"},
- "termination_clause": {"type": "string"},
- "intellectual_property": {"type": "string"},
- "confidentiality_period": {"type": "string"}
- }
- },
+ "contract_value": {"type": "string"},
+ "payment_terms": {"type": "string"},
+ "duration": {"type": "string"},
+ "start_date": {"type": "string"},
  "deliverables": {
  "type": "array",
  "items": {
  "type": "object",
  "properties": {
  "name": {"type": "string"},
- "description": {"type": "string"},
  "deadline": {"type": "string"},
- "acceptance_criteria": {"type": "string"}
+ "description": {"type": "string"}
  }
  }
+ },
+ "key_terms": {
+ "type": "object",
+ "properties": {
+ "liability_cap": {"type": "string"},
+ "termination_notice": {"type": "string"},
+ "intellectual_property": {"type": "string"}
+ }
+ }
  }
  }
  }"""

  contract_content = """SOFTWARE DEVELOPMENT AGREEMENT

- This Software Development Agreement ("Agreement") is entered into on January 1, 2024, between:
-
- TechCorp Inc., a Delaware corporation with offices at 123 Silicon Valley Blvd, San Francisco, CA 94105 ("Client")
-
- AND
-
- DevSolutions LLC, a California limited liability company with offices at 456 Innovation Drive, Palo Alto, CA 94301 ("Developer")
-
- CONTRACT TERMS:
- - Total Contract Value: $2,500,000 USD
- - Payment Terms: Net 30 days
- - Contract Duration: 18 months
- - Start Date: January 15, 2024
- - End Date: July 15, 2025
-
- KEY PROVISIONS:
- - Liability Cap: Limited to total contract value ($2.5M)
- - Termination: Either party may terminate with 90 days written notice
- - Intellectual Property: All developed IP remains with Client
- - Confidentiality: 5-year confidentiality period post-contract
+ This Agreement is made between:
+ Client: TechCorp Inc., 123 Business Ave, San Francisco, CA 94105
+ Contractor: DevStudio LLC, 456 Developer St, Austin, TX 78701
+
+ Contract Value: $150,000
+ Payment Terms: 50% upfront, 50% upon completion
+ Duration: 6 months
+ Start Date: January 1, 2024

  DELIVERABLES:
- 1. API Platform Development
- - Complete REST API platform with authentication
- - Deadline: June 1, 2024
- - Acceptance: Must pass security audit and performance tests
+ 1. Web Application Development
+ - Complete e-commerce platform with user authentication
+ - Deadline: March 15, 2024

- 2. Mobile Application
+ 2. Mobile App Development
  - iOS and Android applications
- - Deadline: October 1, 2024
- - Acceptance: App store approval and user acceptance testing
+ - Deadline: May 1, 2024

- 3. Documentation & Training
- - Complete technical documentation and user training
- - Deadline: December 1, 2024
- - Acceptance: Training completion by 95% of users"""
+ 3. API Integration
+ - Third-party payment processing integration
+ - Deadline: April 15, 2024

+ KEY TERMS:
+ - Liability is capped at the total contract value
+ - Either party may terminate with 30 days written notice
+ - All intellectual property developed under this agreement belongs to the Client
+ - Contractor agrees to maintain confidentiality of all proprietary information"""
+
- with gr.Blocks(title="Unstructured to Structured JSON Converter", theme=gr.themes.Soft()) as app:
- gr.Markdown("""
- # 🔄 Unstructured to Structured JSON Converter
-
- **A production-ready system for extracting structured data from unstructured text following complex JSON schemas.**
-
- ✨ **Key Features:**
- - Supports unlimited schema complexity (6+ levels, 250+ fields, 500+ enums)
- - Handles large documents (50+ pages, 10MB+ files)
- - Dynamic resource allocation ($0.01-$5.00 based on complexity)
- - Confidence-based quality assessment with human review routing
-
- 📊 **Performance:** 97-99% time savings vs manual processing with 85-95% accuracy
- """)
+ with gr.Blocks(title="Unstructured to Structured Converter", theme=gr.themes.Default()) as app:
+ gr.Markdown("# Unstructured to Structured JSON Converter")
+ gr.Markdown("Convert any unstructured text into structured JSON following complex schemas")

  with gr.Row():
- with gr.Column(scale=1):
- gr.Markdown("### 📝 Input Content")
+ with gr.Column():
+ gr.Markdown("### Input Content")
  content_input = gr.Textbox(
- label="Unstructured Text Content",
- placeholder="Paste your document content here...",
- lines=15,
- max_lines=25
+ label="Document Content",
+ placeholder="Enter your unstructured text here...",
+ lines=12,
+ max_lines=20
  )

- with gr.Column(scale=1):
- gr.Markdown("### 🗂️ JSON Schema")
+ with gr.Column():
+ gr.Markdown("### JSON Schema")
  schema_input = gr.Textbox(
- label="Target JSON Schema",
- placeholder="Paste your JSON schema here...",
- lines=15,
- max_lines=25,
+ label="Target Schema",
+ placeholder="Enter your JSON schema here...",
+ lines=12,
+ max_lines=20,
  value=github_schema
  )

  with gr.Row():
- extract_btn = gr.Button("🚀 Extract Structured Data", variant="primary", size="lg")
- clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+ extract_btn = gr.Button("Extract Data", variant="primary")
+ clear_btn = gr.Button("Clear")

  with gr.Row():
  with gr.Column(scale=2):
- gr.Markdown("### 📤 Extracted JSON Data")
+ gr.Markdown("### Extracted Data")
  output_json = gr.Textbox(
- label="Structured Output",
- lines=20,
- max_lines=30,
+ label="JSON Output",
+ lines=15,
  show_copy_button=True
  )

  with gr.Column(scale=1):
- gr.Markdown("### 📊 Analysis Results")
- confidence_output = gr.Textbox(label="Overall Confidence", interactive=False)
- metadata_output = gr.Textbox(
- label="Processing Metadata",
- lines=12,
- interactive=False
- )
- status_output = gr.Textbox(label="Status", interactive=False)
+ gr.Markdown("### Results")
+ confidence_output = gr.Textbox(label="Confidence")
+ metadata_output = gr.Textbox(label="Analysis", lines=8)
+ status_output = gr.Textbox(label="Status")

- gr.Markdown("### 🎯 Example Test Cases")
+ gr.Markdown("### Test Cases")
  gr.Examples(
  examples=[
  [github_content, github_schema],
  [resume_content, resume_schema],
- [email_content, email_schema],
+ [citation_content, citation_schema],
  [contract_content, contract_schema]
  ],
  inputs=[content_input, schema_input],
- label="Click any example to load it:",
- examples_per_page=4
+ label="Select a test case:"
  )

  gr.Markdown("""
- ### 🔧 How It Works
-
- 1. **Schema Analysis**: Analyzes complexity (depth, fields, objects) and creates optimal extraction plan
- 2. **Document Processing**: Handles large documents with semantic chunking and context preservation
- 3. **Multi-Stage Extraction**: Uses hierarchical processing with dynamic model selection
- 4. **Quality Assessment**: Provides confidence scores and flags uncertain fields for human review
-
- ### 📈 Complexity Tiers
-
- | Tier | Depth | Fields | Cost | Time | Use Case |
- |------|-------|--------|------|------|----------|
- | **1 (Simple)** | ≤2 levels | ≤20 | $0.01-0.05 | 5-15s | Forms, basic extraction |
- | **2 (Medium)** | ≤4 levels | ≤100 | $0.08-0.25 | 15-45s | API docs, structured reports |
- | **3 (Complex)** | >4 levels | >100 | $0.30-2.00 | 45-120s | Legal docs, research papers |
-
- ### 🎓 Schema Examples
-
- **GitHub Actions** (Medium): Action metadata with inputs/outputs
- **Resume/CV** (Complex): Personal profile with work history and skills
- **Email Chains** (Complex): Requirements extraction from stakeholder communications
- **Legal Contracts** (Complex): Contract terms, parties, and deliverables
+ ### System Features
+ - **Schema Complexity**: Supports 6+ levels nesting, 250+ fields, unlimited enums
+ - **Document Size**: Handles 50+ page documents and 10MB+ files
+ - **Dynamic Scaling**: Cost ranges from $0.01 to $5.00 based on complexity
+ - **Quality Assurance**: Confidence scoring with human review routing
  """)

  extract_btn.click(