File size: 15,727 Bytes
9943368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
857328d
9943368
 
 
 
 
91a9da3
 
 
 
 
 
857328d
 
 
9943368
91a9da3
 
 
 
 
 
 
 
 
 
 
9943368
857328d
91a9da3
9943368
 
91a9da3
9943368
91a9da3
9943368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
857328d
9943368
857328d
 
9943368
857328d
 
 
 
9943368
857328d
 
9943368
857328d
 
 
 
 
 
 
9943368
857328d
9943368
 
 
 
 
 
 
 
 
 
 
857328d
9943368
 
 
 
857328d
 
9943368
857328d
 
 
 
 
 
 
 
 
 
 
 
 
9943368
 
 
 
 
 
 
 
 
 
 
857328d
9943368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
857328d
9943368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
857328d
 
 
 
 
 
9943368
857328d
 
9943368
 
 
857328d
 
 
 
9943368
857328d
 
 
 
9943368
 
 
857328d
 
 
9943368
857328d
9943368
857328d
 
 
9943368
857328d
9943368
 
857328d
 
 
 
9943368
 
 
 
857328d
 
 
 
9943368
 
 
857328d
 
 
 
 
9943368
857328d
9943368
857328d
9943368
 
857328d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9943368
 
 
 
 
857328d
 
9943368
857328d
9943368
857328d
 
 
9943368
857328d
9943368
857328d
9943368
857328d
9943368
 
 
 
857328d
9943368
 
 
 
 
 
 
857328d
9943368
 
 
857328d
 
 
 
9943368
 
 
 
 
 
 
857328d
9943368
 
857328d
 
 
 
 
 
 
 
9943368
 
 
 
 
 
857328d
 
 
9943368
857328d
 
 
 
9943368
 
857328d
 
 
9943368
857328d
9943368
857328d
9943368
857328d
 
 
9943368
857328d
 
 
 
 
 
 
 
 
9943368
 
857328d
 
9943368
857328d
 
 
 
9943368
 
857328d
 
9943368
857328d
 
 
 
9943368
 
 
 
857328d
 
9943368
 
 
857328d
9943368
857328d
 
9943368
 
 
 
857328d
 
 
9943368
857328d
9943368
 
 
 
857328d
9943368
 
 
857328d
9943368
 
 
857328d
 
 
 
 
9943368
 
 
 
 
91a9da3
9943368
 
 
 
91a9da3
9943368
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
import gradio as gr
import json
import asyncio
import os
from main import StructuredExtractionSystem
import time

# Fail fast at import time when the required OpenAI credential is absent,
# so the app never starts half-configured.
_openai_key = os.getenv("OPENAI_API_KEY")
if not _openai_key:
    raise ValueError("OPENAI_API_KEY environment variable is required")

# Single shared extraction pipeline used by every request.
system = StructuredExtractionSystem(_openai_key)

async def extract_data(content, schema_text, progress=gr.Progress()):
    """Parse the schema, run the extraction pipeline, and summarize the result.

    Returns a 3-tuple for the UI: (pretty-printed JSON, analysis text,
    status label). Schema-parse failures and pipeline errors are caught and
    reported as ("", message, error-status) rather than raised.
    """
    try:
        progress(0.1, desc="Parsing schema...")
        schema = json.loads(schema_text)

        progress(0.5, desc="Processing...")
        result = await system.extract_structured_data(content, schema)

        data = result["data"]
        meta = result["extraction_metadata"]
        pretty_json = json.dumps(data, indent=2)

        # Completeness = fraction of top-level schema properties that came
        # back non-empty (None and "" count as missing).
        expected = len(schema.get('properties', {}))
        filled = sum(1 for v in data.values() if v is not None and v != "")
        ratio = filled / expected if expected > 0 else 0

        analysis = (
            f"Fields Extracted: {filled}/{expected} ({ratio:.1%})\n"
            f"Complexity Tier: {meta['complexity_tier']}\n"
            f"Processing Stages: {meta['stages_executed']}\n"
            f"Processing Time: {meta['actual_processing_time']:.2f}s\n"
            f"Schema Compliance: {meta['schema_compliance']:.1%}"
        )

        flags = result.get("review_flags", [])
        # Any of these flags downgrades an otherwise-complete run.
        blocking = {"incomplete_extraction", "low_quality", "schema_violations"}
        if ratio >= 0.8 and not any(f in blocking for f in flags):
            status = "Success"
        elif ratio >= 0.5:
            status = "Partial Success"
        else:
            status = "Incomplete"

        if flags:
            analysis += f"\nIssues: {', '.join(flags)}"

        progress(1.0, desc="Complete")
        return pretty_json, analysis, status

    except json.JSONDecodeError as e:
        return "", f"Invalid JSON Schema: {str(e)}", "Schema Error"
    except Exception as e:
        return "", f"Extraction Error: {str(e)}", "Error"

def extract_wrapper(content, schema_text):
    """Synchronous bridge so the Gradio click handler can run the async extractor."""
    coro = extract_data(content, schema_text)
    return asyncio.run(coro)

# Example 1 — GitHub composite-action metadata: a JSON Schema with
# pattern-keyed inputs/outputs maps, a nested array of run steps, and
# marketplace branding. "name", "description", and "runs" are required.
github_schema = """{
  "type": "object",
  "properties": {
    "name": {"type": "string"},
    "description": {"type": "string"},
    "author": {"type": "string"},
    "inputs": {
      "type": "object",
      "patternProperties": {
        "^[_a-zA-Z][a-zA-Z0-9_-]*$": {
          "type": "object",
          "properties": {
            "description": {"type": "string"},
            "required": {"type": "boolean"},
            "default": {"type": "string"}
          }
        }
      }
    },
    "outputs": {
      "type": "object",
      "patternProperties": {
        "^[_a-zA-Z][a-zA-Z0-9_-]*$": {
          "type": "object",
          "properties": {
            "description": {"type": "string"},
            "value": {"type": "string"}
          }
        }
      }
    },
    "runs": {
      "type": "object",
      "properties": {
        "using": {"type": "string"},
        "steps": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "name": {"type": "string"},
              "uses": {"type": "string"},
              "run": {"type": "string"},
              "shell": {"type": "string"}
            }
          }
        }
      }
    },
    "branding": {
      "type": "object",
      "properties": {
        "color": {"type": "string"},
        "icon": {"type": "string"}
      }
    }
  },
  "required": ["name", "description", "runs"]
}"""

# Matching sample input: a free-form engineering request describing the
# desired MkDocs deploy action in prose.
github_content = """MkDocs Publisher Action

I keep repeating the steps to build and deploy our MkDocs documentation sites to GitHub Pages across different repos. Let's create a reusable composite action to handle this.

Action Name: MkDocs Publisher
Purpose: A simple action to build an MkDocs site and push it to the gh-pages branch. Should be easy to use. Author should be listed as 'DevRel Team'.

Inputs Needed:
python-version: We need to specify the Python version for setting up the environment. Users should be able to choose. Let's make this optional and default it to 3.11. Description: 'The version of Python to set up for building.'
requirements-file: Users might have different requirements files (e.g., requirements.txt, docs/requirements.txt). This input should specify the path. It's required. Description: 'Path to the Python requirements file'.
gh-token: The GitHub token for pushing to gh-pages. This is absolutely required. Description: 'GitHub token for deployment.'

Outputs:
The action needs to output the URL where the site was deployed. Let's call this output page-url. Its description should be 'The URL of the deployed GitHub Pages site.'

How it Runs:
This will be a composite action (using: composite). Here are the steps involved:
Checkout Code: First, we need the repository code. Use the standard actions/checkout@v4.
Setup Python: Next, set up the Python environment. Use actions/setup-python@v5.
Install Dependencies: Run a command to install the Python packages. The command is pip install -r requirements.txt. Execute this using the bash shell.
Build Site: Run the command mkdocs build. Use bash for this too.
Deploy to Pages: Use peaceiris/actions-gh-pages@v3 for deployment.

Branding: For the marketplace look, let's use the color blue and the book-open icon."""

# Example 2 — JSON-Resume-style schema: nested "basics" (with location and
# profiles), plus arrays for work, education, and skills. No required fields.
resume_schema = """{
  "type": "object",
  "properties": {
    "basics": {
      "type": "object",
      "properties": {
        "name": {"type": "string"},
        "label": {"type": "string"},
        "email": {"type": "string"},
        "phone": {"type": "string"},
        "url": {"type": "string"},
        "summary": {"type": "string"},
        "location": {
          "type": "object",
          "properties": {
            "address": {"type": "string"},
            "postalCode": {"type": "string"},
            "city": {"type": "string"},
            "countryCode": {"type": "string"},
            "region": {"type": "string"}
          }
        },
        "profiles": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "network": {"type": "string"},
              "username": {"type": "string"},
              "url": {"type": "string"}
            }
          }
        }
      }
    },
    "work": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "location": {"type": "string"},
          "position": {"type": "string"},
          "startDate": {"type": "string"},
          "endDate": {"type": "string"},
          "highlights": {
            "type": "array",
            "items": {"type": "string"}
          }
        }
      }
    },
    "education": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "institution": {"type": "string"},
          "area": {"type": "string"},
          "studyType": {"type": "string"},
          "startDate": {"type": "string"},
          "endDate": {"type": "string"}
        }
      }
    },
    "skills": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "keywords": {
            "type": "array",
            "items": {"type": "string"}
          }
        }
      }
    }
  }
}"""

# Matching sample input: a plain-text resume with contact details,
# work history, education, and a skills section.
resume_content = """John Doe
Software Engineer
Email: [email protected]
Phone: +1-555-0123
Address: 123 Main St, San Francisco, CA 94105, US
Website: https://johndoe.dev

PROFESSIONAL SUMMARY
Experienced software engineer with 5+ years developing web applications and distributed systems. Skilled in full-stack development with expertise in Python, JavaScript, and cloud technologies.

WORK EXPERIENCE

Senior Software Engineer | TechCorp Inc | San Francisco, CA | 2022 - Present
- Designed and implemented microservices architecture serving 1M+ users
- Led development of real-time data processing pipeline using Apache Kafka
- Reduced system latency by 40% through performance optimization

Software Engineer | StartupXYZ | Palo Alto, CA | 2020 - 2022  
- Built responsive web applications using React and Node.js
- Implemented CI/CD pipelines resulting in 50% faster deployment cycles
- Collaborated with cross-functional teams on product development

EDUCATION

Bachelor of Science in Computer Science | Stanford University | 2016 - 2020
- Relevant Coursework: Data Structures, Algorithms, Database Systems
- Senior Project: Machine Learning Platform for Predictive Analytics

TECHNICAL SKILLS

Programming Languages: Python, JavaScript, Java, Go, SQL
Web Technologies: React, Node.js, HTML/CSS, REST APIs, GraphQL
Cloud & DevOps: AWS, Docker, Kubernetes, Jenkins, Git"""

# Example 3 — Citation File Format (CFF)-style schema: top-level citation
# metadata plus a nested "preferred-citation" object with its own author list
# and integer volume/year fields.
citation_schema = """{
  "type": "object",
  "properties": {
    "cff-version": {"type": "string"},
    "message": {"type": "string"},
    "title": {"type": "string"},
    "authors": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "given-names": {"type": "string"},
          "family-names": {"type": "string"},
          "affiliation": {"type": "string"},
          "orcid": {"type": "string"}
        }
      }
    },
    "type": {"type": "string"},
    "date-published": {"type": "string"},
    "url": {"type": "string"},
    "abstract": {"type": "string"},
    "keywords": {
      "type": "array",
      "items": {"type": "string"}
    },
    "preferred-citation": {
      "type": "object",
      "properties": {
        "type": {"type": "string"},
        "title": {"type": "string"},
        "authors": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "given-names": {"type": "string"},
              "family-names": {"type": "string"}
            }
          }
        },
        "collection-title": {"type": "string"},
        "volume": {"type": "integer"},
        "year": {"type": "integer"},
        "publisher": {
          "type": "object",
          "properties": {
            "name": {"type": "string"}
          }
        }
      }
    }
  }
}"""

# Matching sample input: a prose description of the "Attention Is All You
# Need" paper with authors, venue, URL, and keywords.
citation_content = """Title: Attention Is All You Need
Authors: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Łukasz Kaiser, Illia Polosukhin

This paper introduces the Transformer, a novel neural network architecture based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.

Published in: Advances in Neural Information Processing Systems, Volume 30, 2017
Publisher: Curran Associates, Inc.
URL: https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf

The Transformer uses multi-head self-attention to draw global dependencies between input and output. The model achieves state-of-the-art results on machine translation tasks while being more parallelizable and requiring significantly less time to train.

Keywords: attention mechanism, transformer, neural networks, machine translation, self-attention

This work has become foundational for modern NLP models including BERT, GPT, and T5."""

# Example 4 — contract-extraction schema: parties and deliverables as object
# arrays plus a nested "key_terms" object for legal clauses.
contract_schema = """{
  "type": "object",
  "properties": {
    "contract_type": {"type": "string"},
    "parties": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "type": {"type": "string"},
          "address": {"type": "string"}
        }
      }
    },
    "contract_value": {"type": "string"},
    "payment_terms": {"type": "string"},
    "duration": {"type": "string"},
    "start_date": {"type": "string"},
    "deliverables": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "deadline": {"type": "string"},
          "description": {"type": "string"}
        }
      }
    },
    "key_terms": {
      "type": "object",
      "properties": {
        "liability_cap": {"type": "string"},
        "termination_notice": {"type": "string"},
        "intellectual_property": {"type": "string"}
      }
    }
  }
}"""

# Matching sample input: a plain-text software development agreement with
# parties, payment terms, deliverables, and key legal terms.
contract_content = """SOFTWARE DEVELOPMENT AGREEMENT

This Agreement is made between:
Client: TechCorp Inc., 123 Business Ave, San Francisco, CA 94105
Contractor: DevStudio LLC, 456 Developer St, Austin, TX 78701

Contract Value: $150,000
Payment Terms: 50% upfront, 50% upon completion
Duration: 6 months
Start Date: January 1, 2024

DELIVERABLES:
1. Web Application Development
   - Complete e-commerce platform with user authentication
   - Deadline: March 15, 2024

2. Mobile App Development  
   - iOS and Android applications
   - Deadline: May 1, 2024

3. API Integration
   - Third-party payment processing integration
   - Deadline: April 15, 2024

KEY TERMS:
- Liability is capped at the total contract value
- Either party may terminate with 30 days written notice
- All intellectual property developed under this agreement belongs to the Client
- Contractor agrees to maintain confidentiality of all proprietary information"""

# Gradio UI definition. Layout: two input columns (content + schema), an
# action row, two output columns (JSON + analysis/status), example test
# cases, and a feature blurb. Built at import time; launched in __main__.
with gr.Blocks(title="Unstructured to Structured Converter", theme=gr.themes.Default()) as app:
    gr.Markdown("# Unstructured to Structured JSON Converter")
    gr.Markdown("Convert any unstructured text into structured JSON following complex schemas")
    
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input Content")
            content_input = gr.Textbox(
                label="Document Content",
                placeholder="Enter your unstructured text here...",
                lines=12,
                max_lines=20
            )
            
        with gr.Column():
            gr.Markdown("### JSON Schema")
            # Pre-filled with the GitHub-action schema as a starting point.
            schema_input = gr.Textbox(
                label="Target Schema",
                placeholder="Enter your JSON schema here...",
                lines=12,
                max_lines=20,
                value=github_schema
            )
    
    with gr.Row():
        extract_btn = gr.Button("Extract Data", variant="primary")
        clear_btn = gr.Button("Clear")
    
    with gr.Row():
        # Wider column for the JSON payload, narrower one for the summary.
        with gr.Column(scale=2):
            gr.Markdown("### Extracted Data")
            output_json = gr.Textbox(
                label="JSON Output",
                lines=15,
                show_copy_button=True
            )
            
        with gr.Column(scale=1):
            gr.Markdown("### Results")
            metadata_output = gr.Textbox(label="Analysis", lines=8)
            status_output = gr.Textbox(label="Status")
    
    gr.Markdown("### Test Cases")
    # Clicking an example fills both input boxes with a content/schema pair.
    gr.Examples(
        examples=[
            [github_content, github_schema],
            [resume_content, resume_schema],
            [citation_content, citation_schema],
            [contract_content, contract_schema]
        ],
        inputs=[content_input, schema_input],
        label="Select a test case:"
    )
    
    gr.Markdown("""
    ### System Features
    - **Schema Complexity**: Supports 6+ levels nesting, 250+ fields, unlimited enums
    - **Document Size**: Handles 50+ page documents and 10MB+ files
    - **Dynamic Scaling**: Cost ranges from $0.01 to $5.00 based on complexity
    - **Quality Assurance**: Confidence scoring with human review routing
    """)
    
    extract_btn.click(
        fn=extract_wrapper,
        inputs=[content_input, schema_input],
        outputs=[output_json, metadata_output, status_output]
    )
    
    # Clear resets both inputs and all three outputs. Note this also blanks
    # the schema box's github_schema default rather than restoring it.
    clear_btn.click(
        lambda: ("", "", "", "", ""),
        outputs=[content_input, schema_input, output_json, metadata_output, status_output]
    )

# Start the Gradio server only when run as a script, not on import.
if __name__ == "__main__":
    app.launch()