File size: 4,067 Bytes
be196c4
 
095dbb9
 
be196c4
095dbb9
 
 
 
 
 
 
 
 
 
 
f764538
095dbb9
f764538
be196c4
095dbb9
 
 
 
 
 
 
 
 
6248a95
095dbb9
f764538
095dbb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f764538
095dbb9
f764538
 
 
 
be196c4
095dbb9
 
f764538
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
095dbb9
f764538
 
 
 
 
095dbb9
 
 
 
 
 
 
 
 
 
 
 
 
 
f764538
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import json
import gradio as gr
import requests
import os

# Hugging Face API details
# Serverless inference endpoint for the NuExtract-1.5 structured-extraction model.
API_URL = "https://api-inference.huggingface.co/models/numind/NuExtract-1.5"
api_token = os.environ.get("HF_TOKEN", "")  # Get token from environment variable
# NOTE: when HF_TOKEN is unset this is an empty string; the UI below surfaces a
# warning in that case, and the API will reject unauthenticated requests.

# Bearer-auth header sent with every inference request.
headers = {"Authorization": f"Bearer {api_token}"}

def query_api(payload):
    """POST *payload* to the HF inference endpoint and return the decoded JSON.

    Args:
        payload: JSON-serializable dict in the Inference API format
            (``{"inputs": ..., "parameters": {...}}``).

    Returns:
        The decoded JSON response — typically a list of generation dicts on
        success, or a dict containing an ``"error"`` key on failure.

    Raises:
        requests.RequestException: on network failure or timeout.
        ValueError: if the response body is not valid JSON.
    """
    # An explicit timeout is required: requests has no default timeout, so a
    # stalled endpoint would otherwise hang the Gradio worker indefinitely.
    # 120 s allows for cold-start model loading on the serverless API.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    return response.json()

def extract_structure(template, text):
    """Run NuExtract-1.5 on *text* using a JSON *template* and report results.

    Args:
        template: JSON template string describing the fields to extract.
        text: Free-form input text to extract from.

    Returns:
        A 3-tuple of strings for the Gradio outputs:
        (progress message, extracted JSON/result text, HTML status fragment).
        Errors are reported through the same tuple rather than raised, so the
        UI always receives something to display.
    """
    try:
        # Format the input following NuExtract's prompt convention: template
        # and text sections, then an output marker the model completes after.
        prompt = f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"

        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": 2000,
                "temperature": 0.01,  # Nearly deterministic as recommended
                "return_full_text": True
            }
        }

        response = query_api(payload)

        # The API signals failure as a dict with an "error" key.
        if isinstance(response, dict) and "error" in response:
            return f"API Error: {response['error']}", "{}", f"<p>Error occurred: {response['error']}</p>"

        # Success is a list of generations; return_full_text=True means the
        # prompt is echoed back, so keep only what follows the output marker.
        if isinstance(response, list) and len(response) > 0:
            output = response[0].get("generated_text", "")
            _, marker, tail = output.partition("<|output|>")
            result = tail if marker else output

            # Pretty-print when the model produced valid JSON; otherwise keep
            # the raw text so the user still sees what came back.
            try:
                parsed = json.loads(result)
                result = json.dumps(parsed, indent=2)
            except json.JSONDecodeError:
                pass

            # Create a simple highlight
            highlighted = f"<p>Successfully processed text of length {len(text)} characters</p>"

            return "Processing complete", result, highlighted
        else:
            return "Unexpected API response", str(response), "<p>Please check API token and try again</p>"

    except Exception as e:
        # Top-level UI boundary: convert any failure into displayable strings
        # instead of crashing the Gradio handler.
        return f"Error: {str(e)}", "{}", f"<p>Processing failed: {str(e)}</p>"

# Create interface
# Blocks layout: inputs (template + text + button) on the left, the three
# outputs of extract_structure on the right.
with gr.Blocks() as demo:
    gr.Markdown("# NuExtract-1.5 Demo")
    
    # Warn visibly when no HF_TOKEN was found at startup (see api_token above);
    # API calls will fail without it.
    if not api_token:
        gr.Markdown("## ⚠️ No API token found. Set HF_TOKEN in Space secrets.")
    
    with gr.Row():
        with gr.Column():
            # JSON template whose empty fields the model fills in.
            template_input = gr.Textbox(
                label="Template (JSON)", 
                value='{"name": "", "email": ""}',
                lines=5
            )
            # Free-form source text to extract from.
            text_input = gr.Textbox(
                label="Input Text", 
                value="Contact: John Smith ([email protected])",
                lines=10
            )
            submit_btn = gr.Button("Extract Information")
        
        with gr.Column():
            # These three map 1:1 to extract_structure's return tuple.
            progress_output = gr.Textbox(label="Progress")
            result_output = gr.Textbox(label="Extracted Information")
            html_output = gr.HTML(label="Highlighted Text")
    
    # Wire the button to the extraction handler.
    submit_btn.click(
        fn=extract_structure,
        inputs=[template_input, text_input],
        outputs=[progress_output, result_output, html_output]
    )

    # Examples
    # Each example is [template, text]; clicking one populates the inputs.
    gr.Examples(
        [
            [
                '{"name": "", "email": ""}',
                'Contact: John Smith ([email protected])'
            ],
            [
                '''{
    "Model": {
        "Name": "",
        "Number of parameters": "",
        "Architecture": []
    },
    "Usage": {
        "Use case": [],
        "License": ""
    }
}''',
                '''We introduce Mistral 7B, a 7-billion-parameter language model engineered for superior performance and efficiency. Mistral 7B outperforms the best open 13B model (Llama 2) across all evaluated benchmarks, and the best released 34B model (Llama 1) in reasoning, mathematics, and code generation. Our model is released under the Apache 2.0 license.'''
            ]
        ],
        [template_input, text_input]
    )

# Launch only when run as a script (e.g. on a Hugging Face Space).
if __name__ == "__main__":
    demo.launch()