Update app.py
app.py CHANGED
@@ -1,181 +1,85 @@
-import os
-import re
-import time
 import json
-from itertools import cycle
-
 import torch
 import gradio as gr
-from
-from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
-
-from data import extract_leaves, split_document, handle_broken_output, clean_json_text, sync_empty_fields
-from examples import examples as input_examples
-from nuextract_logging import log_event
-
-
-MAX_INPUT_SIZE = 100_000
-MAX_NEW_TOKENS = 4_000
-MAX_WINDOW_SIZE = 10_000
-
-markdown_description = """
-<!DOCTYPE html>
-<html lang="en">
-<head>
-<meta charset="UTF-8">
-<meta name="viewport" content="width=device-width, initial-scale=1.0">
-</head>
-<body>
-<img src="https://cdn.prod.website-files.com/638364a4e52e440048a9529c/64188f405afcf42d0b85b926_logo_numind_final.png" alt="NuMind Logo" style="vertical-align: middle;width: 200px; height: 50px;">
-<br>
-<ul>
-<li>NuMind is a startup developing custom information extraction solutions.</li>
-<li>NuExtract is a zero-shot model. See the blog posts for more info (<a href="https://numind.ai/blog/nuextract-a-foundation-model-for-structured-extraction">NuExtract</a>, <a href="https://numind.ai/blog/nuextract-1-5---multilingual-infinite-context-still-small-and-better-than-gpt-4o">NuExtract-v1.5</a>).</li>
-<li>We have started to deploy NuMind Enterprise to customize, serve, and monitor NuExtract privately. If that interests you, let's chat 😊.</li>
-<li><strong>Website</strong>: <a href="https://www.numind.ai/">https://www.numind.ai/</a></li>
-</ul>
-<h1>NuExtract-v1.5</h1>
-<p>NuExtract-v1.5 is a fine-tuning of Phi-3.5-mini-instruct, trained on a private high-quality dataset for structured information extraction.
-It supports long documents and several languages (English, French, Spanish, German, Portuguese, and Italian).
-To use the model, provide an input text and a JSON template describing the information you need to extract.</p>
-<ul>
-<li><strong>Model</strong>: <a href="https://huggingface.co/numind/NuExtract-v1.5">numind/NuExtract-v1.5</a></li>
-</ul>
-<i>⚠️ In this space we restrict the model inputs to a maximum length of 10k tokens, with anything over 4k being processed in a sliding window. For full model performance, self-host the model or contact us.</i>
-<br>
-<i>⚠️ The model is trained to assume a valid JSON template. Attempts to use invalid JSON could lead to unpredictable results.</i>
-</body>
-</html>
-"""
-
-
-def highlight_words(input_text, json_output):
-    colors = cycle(["#90ee90", "#add8e6", "#ffb6c1", "#ffff99", "#ffa07a", "#20b2aa", "#87cefa", "#b0e0e6", "#dda0dd", "#ffdead"])
-    color_map = {}
-    highlighted_text = input_text
-
-    leaves = extract_leaves(json_output)
-    for path, value in leaves:
-        path_key = tuple(path)
-        if path_key not in color_map:
-            color_map[path_key] = next(colors)
-        color = color_map[path_key]
-
-        escaped_value = re.escape(value).replace(r'\ ', r'\s+')  # escape value and replace spaces with \s+
-        pattern = rf"(?<=[ \n\t]){escaped_value}(?=[ \n\t\.\,\?\:\;])"
-        replacement = f"<span style='background-color: {color};'>{unquote(value)}</span>"
-        highlighted_text = re.sub(pattern, replacement, highlighted_text, flags=re.IGNORECASE)
-
-    return highlighted_text
-
-def predict_chunk(text, template, current, model, tokenizer):
-    current = clean_json_text(current)
-
-    input_llm = f"<|input|>\n### Template:\n{template}\n### Current:\n{current}\n### Text:\n{text}\n\n<|output|>" + "{"
-    input_ids = tokenizer(input_llm, return_tensors="pt", truncation=True, max_length=MAX_INPUT_SIZE).to("cuda")
-    output = tokenizer.decode(model.generate(**input_ids, max_new_tokens=MAX_NEW_TOKENS, do_sample=False, use_cache=False)[0], skip_special_tokens=True)
-    print(output)
-    return clean_json_text(output.split("<|output|>")[1])
-
-def sliding_window_prediction(template, text, model, tokenizer, window_size=4000, overlap=128):
-    # Split text into chunks of n tokens
-    tokens = tokenizer.tokenize(text)
-    chunks = split_document(text, window_size, overlap, tokenizer)
-
-    # Iterate over text chunks
-    prev = template
-    full_pred = ""
-
-    for i, chunk in enumerate(chunks):
-        print(f"Processing chunk {i}...")
-        pred = predict_chunk(chunk, template, prev, model, tokenizer)

[…]

-        #
-
-
-        except:
-            highlighted_pred = text
-
-        # attempt json parsing
-        template_dict = None
-        pred_dict = None
-        try:
-            template_dict = json.loads(template)
-        except:
-            pass
-        try:
-            pred_dict = json.loads(pred)
-        except:
-            pass

-        #
-
-            synced_pred = sync_empty_fields(pred_dict, template_dict)
-            synced_pred = json.dumps(synced_pred, indent=4, ensure_ascii=False)
-        elif pred_dict:
-            synced_pred = json.dumps(pred_dict, indent=4, ensure_ascii=False)
-        else:
-            synced_pred = pred
-
-        # Return progress, current prediction, and updated HTML
-        yield f"Processed chunk {i+1}/{len(chunks)}", synced_pred, highlighted_pred
-
-        # Iterate
-        prev = pred
-
-
-######
-
-# Load the model and tokenizer
-model_name = "numind/NuExtract-v1.5"
-auth_token = os.environ.get("HF_TOKEN") or False
-model = AutoModelForCausalLM.from_pretrained(model_name,
-                                             trust_remote_code=True,
-                                             torch_dtype=torch.bfloat16,
-                                             device_map="auto", use_auth_token=auth_token)
-tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=auth_token)
-model.eval()
-
-def gradio_interface_function(template, text, size, is_example):
-    if len(tokenizer.tokenize(text)) > MAX_INPUT_SIZE:
-        yield "", "Input text too long for space. Download model to use unrestricted.", ""
-        return  # End the function since there was an error
-
-    # Initialize the sliding window prediction process
-    # Check if size is a boolean (from examples) and use a default if it is
-    if isinstance(size, bool) or size == 'True':
-        window_size = 4000  # Use default window size for examples
-    else:
-        window_size = int(size)

[…]

-)
-
 import json
 import torch
 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer

+# Simplified extraction function
+def extract_structure(template, text, progress=None):
+    try:
+        # Format the input
+        prompt = f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"

+        # Generate prediction
+        input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
+        output = tokenizer.decode(model.generate(**input_ids, max_new_tokens=2000)[0], skip_special_tokens=True)

+        # Extract result
+        result = output.split("<|output|>")[1]

+        # Highlight found items in text (simplified)
+        highlighted = f"<p>Processed text of length {len(text)} characters</p>"
+
+        return "Processing complete", result, highlighted
+    except Exception as e:
+        return f"Error: {str(e)}", "{}", "<p>Processing failed</p>"
+
+# Load model
+model_name = "numind/NuExtract-1.5"
+try:
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype=torch.float16,  # Using float16 instead of bfloat16 for better compatibility
+        trust_remote_code=True,
+        device_map="auto"
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model_loaded = True
+except Exception as e:
+    print(f"Model loading error: {e}")
+    model_loaded = False
+
+# Create interface
+with gr.Blocks() as demo:
+    gr.Markdown("# NuExtract-1.5 Demo")

+    if not model_loaded:
+        gr.Markdown("## ⚠️ Model failed to load. Using dummy mode.")
+
+    with gr.Row():
+        with gr.Column():
+            template_input = gr.Textbox(
+                label="Template (JSON)",
+                value='{"name": "", "email": ""}',
+                lines=5
+            )
+            text_input = gr.Textbox(
+                label="Input Text",
+                value="Contact: John Smith ([email protected])",
+                lines=10
+            )
+            submit_btn = gr.Button("Extract Information")
+
+        with gr.Column():
+            progress_output = gr.Textbox(label="Progress")
+            result_output = gr.Textbox(label="Extracted Information")
+            html_output = gr.HTML(label="Highlighted Text")
+
+    submit_btn.click(
+        fn=extract_structure,
+        inputs=[template_input, text_input],
+        outputs=[progress_output, result_output, html_output]
+    )
+
+    # Simple example
+    gr.Examples(
+        [
+            [
+                '{"name": "", "email": ""}',
+                'Contact: John Smith ([email protected])'
+            ]
+        ],
+        [template_input, text_input]
+    )
+
+if __name__ == "__main__":
+    demo.launch()
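
Two notes for readers of this diff. First, what the commit removes: the old sliding_window_prediction handled documents longer than one context window by chunking the input with split_document(text, window_size, overlap, tokenizer) and feeding each chunk to predict_chunk together with the JSON extracted so far, which the prompt carries in its "### Current:" section so later chunks can fill in fields earlier chunks missed. A condensed sketch of that loop follows; it reuses the Space's own data.py helper and the removed predict_chunk, so it is a restatement of the deleted code, not a standalone program:

```python
from data import split_document  # helper shipped with this Space, per the removed imports

def sliding_window_sketch(template, text, model, tokenizer,
                          window_size=4000, overlap=128):
    # Cut the document into overlapping token windows.
    chunks = split_document(text, window_size, overlap, tokenizer)

    # The first chunk starts from the bare template; every later chunk
    # sees the JSON predicted so far via the prompt's "### Current:" section.
    prev = template
    for chunk in chunks:
        prev = predict_chunk(chunk, template, prev, model, tokenizer)
    return prev
```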
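Second, the new code sends the whole input through a single <|input|>…<|output|> prompt. A minimal sketch of exercising that prompt outside Gradio, assuming numind/NuExtract-1.5 downloads and loads exactly as in the commit; the template and contact string are illustrative values, not part of the commit:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "numind/NuExtract-1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)

# Illustrative inputs; any JSON template and document work the same way.
template = '{"name": "", "email": ""}'
text = "Contact: John Smith (john@example.com)"

# Same single-pass prompt shape as the new extract_structure function.
prompt = f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"
inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
output = tokenizer.decode(
    model.generate(**inputs, max_new_tokens=2000)[0],
    skip_special_tokens=True,
)

# The model's JSON prediction follows the <|output|> marker.
print(output.split("<|output|>")[1])
```

Note the trade-off the diff makes: without the sliding window, anything beyond the model's context is simply truncated by the tokenizer, and the per-field highlighting from highlight_words is replaced by a plain length summary.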