Tonic committed on
Commit
09a71ed
·
unverified ·
1 Parent(s): 4df882a

adds docstrings, schemas passed to the system prompt, and improved examples

Browse files
Files changed (1) hide show
  1. app.py +94 -16
app.py CHANGED
@@ -17,7 +17,6 @@ description = """
17
  - **Architecture**: Qwen3 (specialized for structured data)
18
  - **Purpose**: Converting unstructured text to structured JSON format
19
  - **Optimizations**: Fine-tuned for data extraction and format conversion tasks
20
- - **Access**: Requires HF authentication token for gated repository
21
 
22
  The model automatically identifies key information in your text and organizes it into logical JSON structures.
23
  """
@@ -34,7 +33,22 @@ model = None
34
  tokenizer = None
35
 
36
  def load_model():
37
- """Load the Osmosis Structure model and tokenizer with HF token for gated repos"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  global model, tokenizer
39
 
40
  try:
@@ -88,8 +102,33 @@ def load_model():
88
  return False
89
 
90
  @spaces.GPU
91
- def text_to_json(input_text, max_tokens=512, temperature=0.6, top_p=0.95, top_k=20):
92
- """Convert plain text to structured JSON using Osmosis Structure model"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  global model, tokenizer
94
 
95
  if model is None or tokenizer is None:
@@ -97,10 +136,15 @@ def text_to_json(input_text, max_tokens=512, temperature=0.6, top_p=0.95, top_k=
97
 
98
  try:
99
  # Create a structured prompt for JSON conversion
 
 
 
 
 
100
  messages = [
101
  {
102
  "role": "system",
103
- "content": "You are a helpful assistant that converts unstructured text into well-formatted JSON. Extract key information and organize it into a logical, structured format. Always respond with valid JSON."
104
  },
105
  {
106
  "role": "user",
@@ -187,6 +231,22 @@ def text_to_json(input_text, max_tokens=512, temperature=0.6, top_p=0.95, top_k=
187
  return f"❌ Error generating JSON: {str(e)}"
188
 
189
  def create_demo():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  # Fixed: Remove duplicate with gr.Blocks declaration
191
  with gr.Blocks(
192
  title=title,
@@ -211,7 +271,14 @@ def create_demo():
211
  with gr.Column(scale=1):
212
  input_text = gr.Textbox(
213
  label="πŸ“ Input Text",
214
- placeholder="Enter your unstructured text here...\n\nExample: 'John Smith is a 30-year-old software engineer from New York. He works at Tech Corp and has 5 years of experience in Python development.'",
 
 
 
 
 
 
 
215
  lines=8,
216
  max_lines=15
217
  )
@@ -268,16 +335,27 @@ def create_demo():
268
  show_copy_button=True
269
  )
270
 
271
- # Example inputs
272
- gr.Markdown("### πŸ“š Example Inputs")
273
- examples = gr.Examples(
274
  examples=[
275
- ["John Smith is a 30-year-old software engineer from New York. He works at Tech Corp and has 5 years of experience in Python development. His email is [email protected] and he graduated from MIT in 2018."],
276
- ["Order #12345 was placed on March 15, 2024. Customer: Sarah Johnson, Address: 123 Main St, Boston MA 02101. Items: 2x Laptop ($999 each), 1x Mouse ($25). Total: $2023. Status: Shipped via FedEx, tracking: 1234567890."],
277
- ["The conference will be held on June 10-12, 2024 at the Grand Hotel in San Francisco. Registration fee is $500 for early bird (before May 1) and $650 for regular registration. Contact [email protected] for questions."],
278
- ["Product: Wireless Headphones Model XYZ-100. Price: $199.99. Features: Bluetooth 5.0, 30-hour battery, noise cancellation, wireless charging case. Colors available: Black, White, Blue. Warranty: 2 years. Rating: 4.5/5 stars (324 reviews)."]
 
 
 
 
 
 
 
 
 
 
 
 
279
  ],
280
- inputs=input_text,
281
  label="Click on any example to try it"
282
  )
283
 
@@ -285,7 +363,7 @@ def create_demo():
285
  # Event handlers
286
  convert_btn.click(
287
  fn=text_to_json,
288
- inputs=[input_text, max_tokens, temperature, top_p, top_k],
289
  outputs=output_json,
290
  show_progress=True
291
  )
@@ -293,7 +371,7 @@ def create_demo():
293
  # Allow Enter key to trigger conversion
294
  input_text.submit(
295
  fn=text_to_json,
296
- inputs=[input_text, max_tokens, temperature, top_p, top_k],
297
  outputs=output_json,
298
  show_progress=True
299
  )
 
17
  - **Architecture**: Qwen3 (specialized for structured data)
18
  - **Purpose**: Converting unstructured text to structured JSON format
19
  - **Optimizations**: Fine-tuned for data extraction and format conversion tasks
 
20
 
21
  The model automatically identifies key information in your text and organizes it into logical JSON structures.
22
  """
 
33
  tokenizer = None
34
 
35
  def load_model():
36
+ """Load the Osmosis Structure model and tokenizer with HF token for gated repos.
37
+
38
+ This function initializes the global model and tokenizer variables by loading them from Hugging Face.
39
+ It handles authentication using the HF_KEY environment variable and provides helpful error messages
40
+ for common issues like authentication failures or model not found errors.
41
+
42
+ Returns:
43
+ bool: True if model and tokenizer were loaded successfully, False otherwise.
44
+
45
+ Example:
46
+ >>> success = load_model()
47
+ >>> if success:
48
+ ... print("Model loaded successfully!")
49
+ ... else:
50
+ ... print("Failed to load model")
51
+ """
52
  global model, tokenizer
53
 
54
  try:
 
102
  return False
103
 
104
  @spaces.GPU
105
+ def text_to_json(input_text, schema_text, max_tokens=512, temperature=0.6, top_p=0.95, top_k=20):
106
+ """Convert plain text to structured JSON using Osmosis Structure model.
107
+
108
+ This function takes unstructured text and optionally a JSON schema, then uses the Osmosis Structure
109
+ model to convert it into well-formatted JSON. The output will follow the provided schema if one is
110
+ given, otherwise it will create a logical structure based on the input text.
111
+
112
+ Args:
113
+ input_text (str): The unstructured text to convert to JSON.
114
+ schema_text (str): Optional JSON schema that defines the desired output structure.
115
+ max_tokens (int, optional): Maximum number of tokens to generate. Defaults to 512.
116
+ temperature (float, optional): Controls randomness in generation. Defaults to 0.6.
117
+ top_p (float, optional): Nucleus sampling parameter. Defaults to 0.95.
118
+ top_k (int, optional): Number of highest probability tokens to consider. Defaults to 20.
119
+
120
+ Returns:
121
+ str: A JSON string containing the structured data, or an error message if something went wrong.
122
+
123
+ Example:
124
+ >>> input_text = "The conference will be held on June 10-12, 2024 at the Grand Hotel."
125
+ >>> schema = '{"type": "object", "properties": {"event_start_date": {"type": "string", "format": "date"}}}'
126
+ >>> result = text_to_json(input_text, schema)
127
+ >>> print(result)
128
+ {
129
+ "event_start_date": "2024-06-10"
130
+ }
131
+ """
132
  global model, tokenizer
133
 
134
  if model is None or tokenizer is None:
 
136
 
137
  try:
138
  # Create a structured prompt for JSON conversion
139
+ system_prompt = "You are a helpful assistant that converts unstructured text into well-formatted JSON. Extract key information and organize it into a logical, structured format. Always respond with valid JSON."
140
+
141
+ if schema_text and schema_text.strip():
142
+ system_prompt = f"You are a helpful assistant that understands and translates text to JSON format according to the following schema. {schema_text}"
143
+
144
  messages = [
145
  {
146
  "role": "system",
147
+ "content": system_prompt
148
  },
149
  {
150
  "role": "user",
 
231
  return f"❌ Error generating JSON: {str(e)}"
232
 
233
  def create_demo():
234
+ """Create and configure the Gradio demo interface.
235
+
236
+ This function sets up the Gradio interface with all necessary components:
237
+ - Input text area for unstructured text
238
+ - Schema input area for JSON schema
239
+ - Generation settings controls
240
+ - Output display area
241
+ - Example inputs with corresponding schemas
242
+
243
+ Returns:
244
+ gr.Blocks: A configured Gradio interface ready to be launched.
245
+
246
+ Example:
247
+ >>> demo = create_demo()
248
+ >>> demo.launch()
249
+ """
250
  # Fixed: Remove duplicate with gr.Blocks declaration
251
  with gr.Blocks(
252
  title=title,
 
271
  with gr.Column(scale=1):
272
  input_text = gr.Textbox(
273
  label="πŸ“ Input Text",
274
+ placeholder="Enter your unstructured text here...\n\nExample: 'The conference will be held on June 10-12, 2024 at the Grand Hotel in San Francisco. Registration fee is $500 for early bird (before May 1) and $650 for regular registration. Contact [email protected] for questions.'",
275
+ lines=8,
276
+ max_lines=15
277
+ )
278
+
279
+ schema_text = gr.Textbox(
280
+ label="πŸ“‹ JSON Schema (Optional)",
281
+ placeholder="Enter your JSON schema here...\n\nExample: {\"type\": \"object\", \"properties\": {\"event_start_date\": {\"type\": \"string\", \"format\": \"date\"}, \"event_end_date\": {\"type\": \"string\", \"format\": \"date\"}, \"location\": {\"type\": \"string\"}, \"registration_fees\": {\"type\": \"object\", \"properties\": {\"early_bird_price\": {\"type\": \"number\"}, \"regular_price\": {\"type\": \"number\"}, \"early_bird_deadline\": {\"type\": \"string\", \"format\": \"date\"}}}, \"contact_email\": {\"type\": \"string\"}}}",
282
  lines=8,
283
  max_lines=15
284
  )
 
335
  show_copy_button=True
336
  )
337
 
338
+ # Examples section
339
+ gr.Examples(
 
340
  examples=[
341
+ [
342
+ "The conference will be held on June 10-12, 2024 at the Grand Hotel in San Francisco. Registration fee is $500 for early bird (before May 1) and $650 for regular registration. Contact [email protected] for questions.",
343
+ '{"type": "object", "properties": {"event_start_date": {"type": "string", "format": "date"}, "event_end_date": {"type": "string", "format": "date"}, "location": {"type": "string"}, "registration_fees": {"type": "object", "properties": {"early_bird_price": {"type": "number"}, "regular_price": {"type": "number"}, "early_bird_deadline": {"type": "string", "format": "date"}}}, "contact_email": {"type": "string"}}}'
344
+ ],
345
+ [
346
+ "The workshop is scheduled for March 15-16, 2024 at Tech Hub in Seattle. Early bird tickets cost $299 until February 15, after which regular tickets will be $399. For inquiries, email [email protected]",
347
+ '{"type": "object", "properties": {"event_start_date": {"type": "string", "format": "date"}, "event_end_date": {"type": "string", "format": "date"}, "location": {"type": "string"}, "registration_fees": {"type": "object", "properties": {"early_bird_price": {"type": "number"}, "regular_price": {"type": "number"}, "early_bird_deadline": {"type": "string", "format": "date"}}}, "contact_email": {"type": "string"}}}'
348
+ ],
349
+ [
350
+ "Product: Wireless Headphones Model XYZ-100. Price: $199.99. Features: Bluetooth 5.0, 30-hour battery, noise cancellation, wireless charging case. Colors available: Black, White, Blue. Warranty: 2 years. Rating: 4.5/5 stars (324 reviews).",
351
+ '{"type": "object", "properties": {"product_name": {"type": "string"}, "price": {"type": "number"}, "features": {"type": "array", "items": {"type": "string"}}, "colors": {"type": "array", "items": {"type": "string"}}, "warranty_years": {"type": "number"}, "rating": {"type": "object", "properties": {"score": {"type": "number"}, "reviews": {"type": "number"}}}}}'
352
+ ],
353
+ [
354
+ "The summer festival runs from July 1-5, 2024 at Central Park. VIP passes are $150 until June 1, then $200. General admission is $75 early bird (until June 15) and $100 regular. Contact [email protected]",
355
+ '{"type": "object", "properties": {"event_start_date": {"type": "string", "format": "date"}, "event_end_date": {"type": "string", "format": "date"}, "location": {"type": "string"}, "ticket_prices": {"type": "object", "properties": {"vip": {"type": "object", "properties": {"early_bird": {"type": "number"}, "regular": {"type": "number"}, "early_bird_deadline": {"type": "string", "format": "date"}}}, "general": {"type": "object", "properties": {"early_bird": {"type": "number"}, "regular": {"type": "number"}, "early_bird_deadline": {"type": "string", "format": "date"}}}}}, "contact_email": {"type": "string"}}}'
356
+ ]
357
  ],
358
+ inputs=[input_text, schema_text],
359
  label="Click on any example to try it"
360
  )
361
 
 
363
  # Event handlers
364
  convert_btn.click(
365
  fn=text_to_json,
366
+ inputs=[input_text, schema_text, max_tokens, temperature, top_p, top_k],
367
  outputs=output_json,
368
  show_progress=True
369
  )
 
371
  # Allow Enter key to trigger conversion
372
  input_text.submit(
373
  fn=text_to_json,
374
+ inputs=[input_text, schema_text, max_tokens, temperature, top_p, top_k],
375
  outputs=output_json,
376
  show_progress=True
377
  )