Convert-to-Json / app.py
Tonic's picture
add demo with mcp enabled
ab48ce6 unverified
raw
history blame
10.4 kB
import gradio as gr
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces
# Hugging Face Hub identifier of the checkpoint this demo serves.
MODEL_NAME = "osmosis-ai/Osmosis-Structure-0.6B"

# Shared handles for the loaded model and tokenizer; both stay None until
# load_model() assigns them.
model, tokenizer = None, None
def load_model():
    """Load the Osmosis Structure tokenizer and model into the module globals.

    The checkpoint named by ``MODEL_NAME`` is loaded in float16 with
    ``device_map="auto"`` when CUDA is available, otherwise in float32 with
    no device map. The ``model`` and ``tokenizer`` globals are only assigned
    once *both* objects loaded successfully, so a failure never leaves them
    half-initialised.

    Returns:
        bool: True when the model and tokenizer were loaded, False when any
        exception occurred (the error is printed, not raised).
    """
    global model, tokenizer

    try:
        print("Loading Osmosis Structure model...")
        use_cuda = torch.cuda.is_available()

        # NOTE(security): trust_remote_code=True allows Python code shipped
        # in the Hub repository to run locally; keep MODEL_NAME pointed at a
        # repository you trust.
        loaded_tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
        )
        loaded_model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device_map="auto" if use_cuda else None,
            trust_remote_code=True,
        )
    except Exception as e:
        # Startup boundary: report the problem and let the caller decide.
        print(f"❌ Error loading model: {e}")
        return False

    # Publish both handles together, only after everything succeeded.
    model, tokenizer = loaded_model, loaded_tokenizer
    print("✅ Osmosis Structure model loaded successfully!")
    return True
def _strip_json_wrappers(text):
    """Strip markdown code fences and common lead-in phrases around *text*."""
    text = text.strip()
    for prefix in ("```json", "```", "Here's the JSON:", "JSON:"):
        if text.startswith(prefix):
            text = text[len(prefix):].strip()
    if text.endswith("```"):
        text = text[: -len("```")].strip()
    return text


def _bracketed_span(text, opener, closer):
    """Return (start, substring) from the first *opener* to the last *closer*, or None."""
    start = text.find(opener)
    end = text.rfind(closer)
    if start == -1 or end <= start:
        return None
    return start, text[start:end + 1]


def _format_model_output(generated_text):
    """Pretty-print the JSON found in *generated_text*, or return the text with a note."""
    cleaned = _strip_json_wrappers(generated_text)

    # Candidate snippets, tried in order: the whole cleaned reply, then the
    # outermost {...} and [...] spans, whichever opens first. Considering
    # arrays as well keeps a reply such as [{"a": 1}, {"b": 2}] intact
    # instead of slicing it into the invalid '{"a": 1}, {"b": 2}'.
    spans = [
        span
        for span in (
            _bracketed_span(cleaned, "{", "}"),
            _bracketed_span(cleaned, "[", "]"),
        )
        if span is not None
    ]
    spans.sort(key=lambda span: span[0])
    candidates = [cleaned] + [snippet for _, snippet in spans]

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        return json.dumps(parsed, indent=2, ensure_ascii=False)

    # Nothing parsed: hand back the most JSON-looking snippet for manual repair.
    fallback = candidates[1] if spans else cleaned
    return f"Generated response (may need manual cleanup):\n\n{fallback}"


# NOTE(review): spaces.GPU presumably requests a GPU for the duration of each
# call when running on Hugging Face Spaces -- confirm against the `spaces` docs.
@spaces.GPU
def text_to_json(input_text, max_tokens=512, temperature=0.6, top_p=0.95, top_k=20):
    """Convert plain text to structured JSON using the Osmosis Structure model.

    The text is wrapped in a system + user chat prompt, the model generates a
    reply, and the JSON found in that reply is validated and pretty-printed.

    Args:
        input_text: Unstructured text to convert.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature; 0 (or less) switches to greedy
            decoding instead of sampling.
        top_p: Nucleus-sampling probability mass (used only when sampling).
        top_k: Number of highest-probability tokens kept (used only when
            sampling).

    Returns:
        str: Indented JSON on success. Otherwise a human-readable message:
        the raw model reply prefixed with a cleanup note when it is not valid
        JSON, or a line starting with "❌" when the model is not loaded, the
        input is empty, or generation raised an exception.
    """
    if model is None or tokenizer is None:
        return "❌ Model not loaded. Please wait for model initialization."
    if not input_text or not input_text.strip():
        # Nothing to convert; skip the model call entirely.
        return "❌ Please enter some text to convert."

    try:
        # Create a structured prompt for JSON conversion
        messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant that converts unstructured text into well-formatted JSON. Extract key information and organize it into a logical, structured format. Always respond with valid JSON."
            },
            {
                "role": "user",
                "content": f"Convert this text to JSON format:\n\n{input_text}"
            },
        ]
        formatted_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = tokenizer(
            formatted_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=2048,
        )
        # Always follow the model's device (a no-op when it lives on the CPU).
        inputs = inputs.to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        # Some tokenizers define no pad token; fall back to EOS for padding.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = tokenizer.eos_token_id

        # UI sliders and API/MCP callers may deliver floats for integer
        # settings, and sampling requires a strictly positive temperature.
        use_sampling = temperature is not None and float(temperature) > 0
        generation_config = {
            "max_new_tokens": int(max_tokens),
            "do_sample": use_sampling,
            "pad_token_id": pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "repetition_penalty": 1.1,
        }
        if use_sampling:
            generation_config.update(
                temperature=float(temperature),
                top_p=float(top_p),
                top_k=int(top_k),
            )

        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_config)

        # Keep only the newly generated tokens (drop the echoed prompt).
        generated_tokens = outputs[0][prompt_length:]
        generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

        return _format_model_output(generated_text)
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"❌ Error generating JSON: {e}"
# Create Gradio interface
def create_demo():
    """Build the Gradio Blocks UI for the text-to-JSON converter.

    The layout has an input column (text box, collapsible generation
    settings, convert button) next to a read-only output column, followed by
    clickable example inputs. Both the button click and the text box submit
    event call ``text_to_json`` with the current settings.

    Returns:
        gr.Blocks: The assembled (not yet launched) demo.
    """
    with gr.Blocks(
        title="Osmosis Structure - Text to JSON Converter",
        theme=gr.themes.Soft(),
    ) as demo:
        gr.Markdown("""
        # 🌊 Osmosis Structure - Text to JSON Converter
        Convert unstructured text into well-formatted JSON using the Osmosis Structure 0.6B model.
        This model is specifically trained for structured data extraction and format conversion.
        """)
        gr.Markdown("""
        ### ℹ️ About Osmosis Structure
        - **Model**: Osmosis Structure 0.6B parameters
        - **Architecture**: Qwen3 (specialized for structured data)
        - **Purpose**: Converting unstructured text to structured JSON format
        - **Optimizations**: Fine-tuned for data extraction and format conversion tasks
        The model automatically identifies key information in your text and organizes it into logical JSON structures.
        """)

        with gr.Row():
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    label="📝 Input Text",
                    placeholder="Enter your unstructured text here...\n\nExample: 'John Smith is a 30-year-old software engineer from New York. He works at Tech Corp and has 5 years of experience in Python development.'",
                    lines=8,
                    max_lines=15,
                )
                with gr.Accordion("⚙️ Generation Settings", open=False):
                    max_tokens = gr.Slider(
                        minimum=50,
                        maximum=1000,
                        value=512,
                        step=10,
                        label="Max Tokens",
                        info="Maximum number of tokens to generate",
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.6,
                        step=0.1,
                        label="Temperature",
                        info="Controls randomness (lower = more focused)",
                    )
                    top_p = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                        label="Top-p",
                        info="Nucleus sampling parameter",
                    )
                    top_k = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=20,
                        step=1,
                        label="Top-k",
                        info="Limits vocabulary for generation",
                    )
                convert_btn = gr.Button(
                    "🔄 Convert to JSON",
                    variant="primary",
                    size="lg",
                )
            with gr.Column(scale=1):
                output_json = gr.Textbox(
                    label="📋 Generated JSON",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    show_copy_button=True,
                )

        # Example inputs (clicking one fills the input text box)
        gr.Markdown("### 📚 Example Inputs")
        gr.Examples(
            examples=[
                ["John Smith is a 30-year-old software engineer from New York. He works at Tech Corp and has 5 years of experience in Python development. His email is john.smith@example.com and he graduated from MIT in 2018."],
                ["Order #12345 was placed on March 15, 2024. Customer: Sarah Johnson, Address: 123 Main St, Boston MA 02101. Items: 2x Laptop ($999 each), 1x Mouse ($25). Total: $2023. Status: Shipped via FedEx, tracking: 1234567890."],
                ["The conference will be held on June 10-12, 2024 at the Grand Hotel in San Francisco. Registration fee is $500 for early bird (before May 1) and $650 for regular registration. Contact info@example.com for questions."],
                ["Product: Wireless Headphones Model XYZ-100. Price: $199.99. Features: Bluetooth 5.0, 30-hour battery, noise cancellation, wireless charging case. Colors available: Black, White, Blue. Warranty: 2 years. Rating: 4.5/5 stars (324 reviews)."],
            ],
            inputs=input_text,
            label="Click on any example to try it",
        )

        # Event handlers: the button click and the text box submit event run
        # the same conversion, so the wiring is defined once and reused.
        conversion = dict(
            fn=text_to_json,
            inputs=[input_text, max_tokens, temperature, top_p, top_k],
            outputs=output_json,
            show_progress=True,
        )
        convert_btn.click(**conversion)
        # NOTE(review): for a multi-line Textbox, Gradio is expected to fire
        # `submit` on Shift+Enter (plain Enter inserts a newline) -- confirm
        # for the installed Gradio version.
        input_text.submit(**conversion)

    return demo
# Initialize the demo
if __name__ == "__main__":
    print("🌊 Initializing Osmosis Structure Demo...")

    # Load the model at startup; without it the UI cannot do anything useful,
    # so stop with a non-zero exit status instead of ending silently.
    if not load_model():
        print("❌ Failed to load model. Please check your setup.")
        raise SystemExit(1)

    print("🚀 Creating Gradio interface...")
    demo = create_demo()
    # `show_tips` and `enable_queue` are no longer accepted by launch() in the
    # Gradio versions that provide `mcp_server`/`ssr_mode` (passing them raises
    # TypeError); queuing is enabled by default there.
    demo.launch(
        share=True,
        show_error=True,
        ssr_mode=False,
        mcp_server=True,
    )