Spaces:

Tonic
/

Convert-to-Json

Running on Zero

App Files Community

Tonic committed 1 day ago

Commit

09a71ed

unverified ·

1 Parent(s): 4df882a

adds docstrings, schemas passed to the system prompt, and improved examples

Browse files

Files changed (1) hide show

app.py +94 -16

app.py CHANGED Viewed

@@ -17,7 +17,6 @@ description = """
         - **Architecture**: Qwen3 (specialized for structured data)
         - **Purpose**: Converting unstructured text to structured JSON format
         - **Optimizations**: Fine-tuned for data extraction and format conversion tasks
-        - **Access**: Requires HF authentication token for gated repository
         The model automatically identifies key information in your text and organizes it into logical JSON structures.
         """
@@ -34,7 +33,22 @@ model = None
 tokenizer = None
 def load_model():
-    """Load the Osmosis Structure model and tokenizer with HF token for gated repos"""
     global model, tokenizer
     try:
@@ -88,8 +102,33 @@ def load_model():
         return False
 @spaces.GPU
-def text_to_json(input_text, max_tokens=512, temperature=0.6, top_p=0.95, top_k=20):
-    """Convert plain text to structured JSON using Osmosis Structure model"""
     global model, tokenizer
     if model is None or tokenizer is None:
@@ -97,10 +136,15 @@ def text_to_json(input_text, max_tokens=512, temperature=0.6, top_p=0.95, top_k=
     try:
         # Create a structured prompt for JSON conversion
         messages = [
             {
                 "role": "system",
-                "content": "You are a helpful assistant that converts unstructured text into well-formatted JSON. Extract key information and organize it into a logical, structured format. Always respond with valid JSON."
             },
             {
                 "role": "user",
@@ -187,6 +231,22 @@ def text_to_json(input_text, max_tokens=512, temperature=0.6, top_p=0.95, top_k=
         return f"❌ Error generating JSON: {str(e)}"
 def create_demo():
     # Fixed: Remove duplicate with gr.Blocks declaration
     with gr.Blocks(
         title=title,
@@ -211,7 +271,14 @@ def create_demo():
             with gr.Column(scale=1):
                 input_text = gr.Textbox(
                     label="📝 Input Text",
-                    placeholder="Enter your unstructured text here...\n\nExample: 'John Smith is a 30-year-old software engineer from New York. He works at Tech Corp and has 5 years of experience in Python development.'",
                     lines=8,
                     max_lines=15
                 )
@@ -268,16 +335,27 @@ def create_demo():
                     show_copy_button=True
                 )
-        # Example inputs
-        gr.Markdown("### 📚 Example Inputs")
-        examples = gr.Examples(
             examples=[
-                ["John Smith is a 30-year-old software engineer from New York. He works at Tech Corp and has 5 years of experience in Python development. His email is [email protected] and he graduated from MIT in 2018."],
-                ["Order #12345 was placed on March 15, 2024. Customer: Sarah Johnson, Address: 123 Main St, Boston MA 02101. Items: 2x Laptop ($999 each), 1x Mouse ($25). Total: $2023. Status: Shipped via FedEx, tracking: 1234567890."],
-                ["The conference will be held on June 10-12, 2024 at the Grand Hotel in San Francisco. Registration fee is $500 for early bird (before May 1) and $650 for regular registration. Contact [email protected] for questions."],
-                ["Product: Wireless Headphones Model XYZ-100. Price: $199.99. Features: Bluetooth 5.0, 30-hour battery, noise cancellation, wireless charging case. Colors available: Black, White, Blue. Warranty: 2 years. Rating: 4.5/5 stars (324 reviews)."]
             ],
-            inputs=input_text,
             label="Click on any example to try it"
         )
@@ -285,7 +363,7 @@ def create_demo():
         # Event handlers
         convert_btn.click(
             fn=text_to_json,
-            inputs=[input_text, max_tokens, temperature, top_p, top_k],
             outputs=output_json,
             show_progress=True
         )
@@ -293,7 +371,7 @@ def create_demo():
         # Allow Enter key to trigger conversion
         input_text.submit(
             fn=text_to_json,
-            inputs=[input_text, max_tokens, temperature, top_p, top_k],
             outputs=output_json,
             show_progress=True
         )

         - **Architecture**: Qwen3 (specialized for structured data)
         - **Purpose**: Converting unstructured text to structured JSON format
         - **Optimizations**: Fine-tuned for data extraction and format conversion tasks
         The model automatically identifies key information in your text and organizes it into logical JSON structures.
         """
 tokenizer = None
 def load_model():
+    """Load the Osmosis Structure model and tokenizer with HF token for gated repos.
+    This function initializes the global model and tokenizer variables by loading them from Hugging Face.
+    It handles authentication using the HF_KEY environment variable and provides helpful error messages
+    for common issues like authentication failures or model not found errors.
+    Returns:
+        bool: True if model and tokenizer were loaded successfully, False otherwise.
+    Example:
+        >>> success = load_model()
+        >>> if success:
+        ...     print("Model loaded successfully!")
+        ... else:
+        ...     print("Failed to load model")
+    """
     global model, tokenizer
     try:
         return False
 @spaces.GPU
+def text_to_json(input_text, schema_text, max_tokens=512, temperature=0.6, top_p=0.95, top_k=20):
+    """Convert plain text to structured JSON using Osmosis Structure model.
+    This function takes unstructured text and optionally a JSON schema, then uses the Osmosis Structure
+    model to convert it into well-formatted JSON. The output will follow the provided schema if one is
+    given, otherwise it will create a logical structure based on the input text.
+    Args:
+        input_text (str): The unstructured text to convert to JSON.
+        schema_text (str): Optional JSON schema that defines the desired output structure.
+        max_tokens (int, optional): Maximum number of tokens to generate. Defaults to 512.
+        temperature (float, optional): Controls randomness in generation. Defaults to 0.6.
+        top_p (float, optional): Nucleus sampling parameter. Defaults to 0.95.
+        top_k (int, optional): Number of highest probability tokens to consider. Defaults to 20.
+    Returns:
+        str: A JSON string containing the structured data, or an error message if something went wrong.
+    Example:
+        >>> input_text = "The conference will be held on June 10-12, 2024 at the Grand Hotel."
+        >>> schema = '{"type": "object", "properties": {"event_start_date": {"type": "string", "format": "date"}}}'
+        >>> result = text_to_json(input_text, schema)
+        >>> print(result)
+        {
+          "event_start_date": "2024-06-10"
+        }
+    """
     global model, tokenizer
     if model is None or tokenizer is None:
     try:
         # Create a structured prompt for JSON conversion
+        system_prompt = "You are a helpful assistant that converts unstructured text into well-formatted JSON. Extract key information and organize it into a logical, structured format. Always respond with valid JSON."
+        if schema_text and schema_text.strip():
+            system_prompt = f"You are a helpful assistant that understands and translates text to JSON format according to the following schema. {schema_text}"
         messages = [
             {
                 "role": "system",
+                "content": system_prompt
             },
             {
                 "role": "user",
         return f"❌ Error generating JSON: {str(e)}"
 def create_demo():
+    """Create and configure the Gradio demo interface.
+    This function sets up the Gradio interface with all necessary components:
+    - Input text area for unstructured text
+    - Schema input area for JSON schema
+    - Generation settings controls
+    - Output display area
+    - Example inputs with corresponding schemas
+    Returns:
+        gr.Blocks: A configured Gradio interface ready to be launched.
+    Example:
+        >>> demo = create_demo()
+        >>> demo.launch()
+    """
     # Fixed: Remove duplicate with gr.Blocks declaration
     with gr.Blocks(
         title=title,
             with gr.Column(scale=1):
                 input_text = gr.Textbox(
                     label="📝 Input Text",
+                    placeholder="Enter your unstructured text here...\n\nExample: 'The conference will be held on June 10-12, 2024 at the Grand Hotel in San Francisco. Registration fee is $500 for early bird (before May 1) and $650 for regular registration. Contact [email protected] for questions.'",
+                    lines=8,
+                    max_lines=15
+                )
+                schema_text = gr.Textbox(
+                    label="📋 JSON Schema (Optional)",
+                    placeholder="Enter your JSON schema here...\n\nExample: {\"type\": \"object\", \"properties\": {\"event_start_date\": {\"type\": \"string\", \"format\": \"date\"}, \"event_end_date\": {\"type\": \"string\", \"format\": \"date\"}, \"location\": {\"type\": \"string\"}, \"registration_fees\": {\"type\": \"object\", \"properties\": {\"early_bird_price\": {\"type\": \"number\"}, \"regular_price\": {\"type\": \"number\"}, \"early_bird_deadline\": {\"type\": \"string\", \"format\": \"date\"}}}, \"contact_email\": {\"type\": \"string\"}}}",
                     lines=8,
                     max_lines=15
                 )
                     show_copy_button=True
                 )
+        # Examples section
+        gr.Examples(
             examples=[
+                [
+                    "The conference will be held on June 10-12, 2024 at the Grand Hotel in San Francisco. Registration fee is $500 for early bird (before May 1) and $650 for regular registration. Contact [email protected] for questions.",
+                    '{"type": "object", "properties": {"event_start_date": {"type": "string", "format": "date"}, "event_end_date": {"type": "string", "format": "date"}, "location": {"type": "string"}, "registration_fees": {"type": "object", "properties": {"early_bird_price": {"type": "number"}, "regular_price": {"type": "number"}, "early_bird_deadline": {"type": "string", "format": "date"}}}, "contact_email": {"type": "string"}}}'
+                ],
+                [
+                    "The workshop is scheduled for March 15-16, 2024 at Tech Hub in Seattle. Early bird tickets cost $299 until February 15, after which regular tickets will be $399. For inquiries, email [email protected]",
+                    '{"type": "object", "properties": {"event_start_date": {"type": "string", "format": "date"}, "event_end_date": {"type": "string", "format": "date"}, "location": {"type": "string"}, "registration_fees": {"type": "object", "properties": {"early_bird_price": {"type": "number"}, "regular_price": {"type": "number"}, "early_bird_deadline": {"type": "string", "format": "date"}}}, "contact_email": {"type": "string"}}}'
+                ],
+                [
+                    "Product: Wireless Headphones Model XYZ-100. Price: $199.99. Features: Bluetooth 5.0, 30-hour battery, noise cancellation, wireless charging case. Colors available: Black, White, Blue. Warranty: 2 years. Rating: 4.5/5 stars (324 reviews).",
+                    '{"type": "object", "properties": {"product_name": {"type": "string"}, "price": {"type": "number"}, "features": {"type": "array", "items": {"type": "string"}}, "colors": {"type": "array", "items": {"type": "string"}}, "warranty_years": {"type": "number"}, "rating": {"type": "object", "properties": {"score": {"type": "number"}, "reviews": {"type": "number"}}}}}'
+                ],
+                [
+                    "The summer festival runs from July 1-5, 2024 at Central Park. VIP passes are $150 until June 1, then $200. General admission is $75 early bird (until June 15) and $100 regular. Contact [email protected]",
+                    '{"type": "object", "properties": {"event_start_date": {"type": "string", "format": "date"}, "event_end_date": {"type": "string", "format": "date"}, "location": {"type": "string"}, "ticket_prices": {"type": "object", "properties": {"vip": {"type": "object", "properties": {"early_bird": {"type": "number"}, "regular": {"type": "number"}, "early_bird_deadline": {"type": "string", "format": "date"}}}, "general": {"type": "object", "properties": {"early_bird": {"type": "number"}, "regular": {"type": "number"}, "early_bird_deadline": {"type": "string", "format": "date"}}}}}, "contact_email": {"type": "string"}}}'
+                ]
             ],
+            inputs=[input_text, schema_text],
             label="Click on any example to try it"
         )
         # Event handlers
         convert_btn.click(
             fn=text_to_json,
+            inputs=[input_text, schema_text, max_tokens, temperature, top_p, top_k],
             outputs=output_json,
             show_progress=True
         )
         # Allow Enter key to trigger conversion
         input_text.submit(
             fn=text_to_json,
+            inputs=[input_text, schema_text, max_tokens, temperature, top_p, top_k],
             outputs=output_json,
             show_progress=True
         )