oberbics committed
Commit 39ee1aa · verified · 1 Parent(s): be096d1

Update app.py

Files changed (1)
  1. app.py +67 -122
app.py CHANGED
@@ -1,161 +1,106 @@
 import gradio as gr
 import torch
 import json
-import re
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from itertools import cycle
-from urllib.parse import unquote

-# Load model
-model_name = "numind/NuExtract-1.5"
-try:
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        device_map="auto",
-        torch_dtype=torch.float16,
-        trust_remote_code=True
-    )
-    MODEL_LOADED = True
-except Exception as e:
-    MODEL_LOADED = False
-    print(f"Model loading failed: {e}")
-
-# Extract leaf values from JSON (simplified)
-def extract_leaves(json_data):
-    leaves = []
-
-    def _extract(data, path=None):
-        if path is None:
-            path = []
-
-        if isinstance(data, dict):
-            for key, value in data.items():
-                new_path = path + [key]
-                if isinstance(value, (dict, list)):
-                    _extract(value, new_path)
-                elif value and isinstance(value, str) and len(value.strip()) > 0:
-                    leaves.append((new_path, value))
-        elif isinstance(data, list):
-            for i, item in enumerate(data):
-                new_path = path + [i]
-                if isinstance(item, (dict, list)):
-                    _extract(item, new_path)
-                elif item and isinstance(item, str) and len(item.strip()) > 0:
-                    leaves.append((new_path, item))
-
-    _extract(json_data)
-    return leaves
-
-# Highlight words in text
-def highlight_words(input_text, json_output):
-    colors = cycle(["#90ee90", "#add8e6", "#ffb6c1", "#ffff99", "#ffa07a"])
-    color_map = {}
-    highlighted_text = input_text
-
-    leaves = extract_leaves(json_output)
-    for path, value in leaves:
-        path_key = tuple(path)
-        if path_key not in color_map:
-            color_map[path_key] = next(colors)
-        color = color_map[path_key]
-
-        try:
-            escaped_value = re.escape(value).replace(r'\ ', r'\s+')
-            pattern = rf"(?<=[ \n\t]){escaped_value}(?=[ \n\t\.\,\?\:\;])"
-            replacement = f"<span style='background-color: {color};'>{unquote(value)}</span>"
-            highlighted_text = re.sub(pattern, replacement, highlighted_text, flags=re.IGNORECASE)
-        except:
-            # Skip highlighting if regex fails
-            pass
-
-    return highlighted_text
+# Simple test function to debug button clicks
+def test_function(template, text):
+    print(f"Function called with template: {template[:30]} and text: {text[:30]}")
+    return "Button clicked successfully", "Function was called"

-# Process function
-def extract_structure(template, text, size="4000"):
-    if not MODEL_LOADED:
-        return "❌ Model not loaded", "{}", "<p style='color:red'>Model failed to initialize</p>"
-
+# Real extraction function
+def extract_info(template, text):
     try:
-        # Get window size
-        window_size = 4000
-        if isinstance(size, str) and size.isdigit():
-            window_size = min(int(size), 10000) # Cap at 10k
-
-        # Format the input (simplified version without sliding window)
+        # Format prompt according to NuExtract-1.5 requirements
         prompt = f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"

-        # Generate prediction
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
+        # Tokenize
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+        # Generate
+        print("Generating output...")
         outputs = model.generate(
-            **inputs,
-            max_new_tokens=2000, # Reduced for testing
+            **inputs,
+            max_new_tokens=1000,
             do_sample=False
         )
+
+        # Decode and extract result
+        print("Decoding output...")
         result = tokenizer.decode(outputs[0], skip_special_tokens=True)

-        # Extract JSON result
+        # Split at output marker
         if "<|output|>" in result:
             json_text = result.split("<|output|>")[1].strip()
         else:
-            json_text = result.strip()
-
-        # Try to parse and format JSON
-        json_data = json.loads(json_text)
-        formatted_json = json.dumps(json_data, indent=2)
+            json_text = result

-        # Create highlighted version
-        html_content = highlight_words(text, json_data)
+        # Try to parse as JSON
+        print("Parsing JSON...")
+        extracted = json.loads(json_text)
+        formatted = json.dumps(extracted, indent=2)

-        return "✅ Success", formatted_json, html_content
+        return "✅ Success", formatted
     except Exception as e:
-        return f"Error: {str(e)}", "{}", f"<p style='color:red'>{str(e)}</p>"
+        print(f"Error: {str(e)}")
+        return f"❌ Error: {str(e)}", "{}"
+
+# Load model
+try:
+    print("Loading model...")
+    model_name = "numind/NuExtract-1.5"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype=torch.float16,
+        device_map="auto",
+        trust_remote_code=True
+    )
+    print("Model loaded successfully")
+except Exception as e:
+    print(f"Model loading error: {e}")
+    # Create dummy function for testing UI
+    def extract_info(template, text):
+        return "Model failed to load", "Cannot process request"

-# Create interface
+# Create a very simple interface
 with gr.Blocks() as demo:
-    gr.Markdown("# NuExtract-1.5 Structured Data Extractor")
+    gr.Markdown("# NuExtract-1.5 Extraction Tool")

     with gr.Row():
         with gr.Column():
             template = gr.Textbox(
-                label="Template (JSON)",
+                label="JSON Template",
                 value='{"name": "", "email": ""}',
                 lines=5
             )
-            text = gr.TextArea(
-                label="Input Text",
+            text = gr.Textbox(
+                label="Text to Extract From",
                 value="Contact: John Smith ([email protected])",
-                lines=10
+                lines=8
             )
-            size = gr.Textbox(
-                label="Window Size",
-                value="4000",
-                visible=True
-            )
-            btn = gr.Button("Extract", variant="primary")
+
+            # Two buttons for testing
+            test_btn = gr.Button("Test Click")
+            extract_btn = gr.Button("Extract Information", variant="primary")

         with gr.Column():
             status = gr.Textbox(label="Status")
-            json_out = gr.Textbox(label="Extracted JSON", lines=10)
-            html_out = gr.HTML(label="Highlighted Text")
+            output = gr.Textbox(label="Output", lines=10)

-    # Connect the button
-    btn.click(
-        fn=extract_structure,
-        inputs=[template, text, size],
-        outputs=[status, json_out, html_out]
+    # Connect both buttons to verify functionality
+    test_btn.click(
+        fn=test_function,
+        inputs=[template, text],
+        outputs=[status, output]
     )

-    # Add examples that match format
-    gr.Examples(
-        [
-            [
-                '{"name": "", "email": ""}',
-                'Contact: John Smith ([email protected])',
-                "4000"
-            ]
-        ],
-        [template, text, size]
+    extract_btn.click(
+        fn=extract_info,
+        inputs=[template, text],
+        outputs=[status, output]
     )

-demo.launch()
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()
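
For reference, the new extract_info path boils down to two steps: build the NuExtract-1.5 prompt (<|input|>, ### Template, ### Text, <|output|>) and parse whatever follows the <|output|> marker as JSON. Below is a minimal sketch of that flow with the model call stubbed out; build_prompt, parse_output, and the decoded string are illustrative and not part of app.py.

import json

# Same prompt layout that extract_info sends to the model.
def build_prompt(template, text):
    return f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"

# Same post-processing: keep what follows <|output|>, then pretty-print the JSON.
def parse_output(decoded):
    json_text = decoded.split("<|output|>")[1].strip() if "<|output|>" in decoded else decoded
    return json.dumps(json.loads(json_text), indent=2)

template = '{"name": "", "email": ""}'
text = "Contact: John Smith ([email protected])"
prompt = build_prompt(template, text)

# Hypothetical decoded model output: the app's split on <|output|> assumes the decoded text echoes the prompt.
decoded = prompt + '\n{"name": "John Smith", "email": "[email protected]"}'
print(parse_output(decoded))

With the model actually loaded, python app.py wires this same logic behind the "Extract Information" button, while "Test Click" only confirms that the Gradio callback fires.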