File size: 4,831 Bytes
23c0ee1
be196c4
23c0ee1
be196c4
095dbb9
be196c4
095dbb9
 
 
 
 
26a1605
095dbb9
26a1605
f683120
 
 
26a1605
 
23c0ee1
26a1605
095dbb9
 
f764538
 
26a1605
095dbb9
 
 
 
26a1605
095dbb9
 
 
26a1605
095dbb9
26a1605
095dbb9
23c0ee1
26a1605
095dbb9
 
23c0ee1
26a1605
 
 
 
23c0ee1
 
26a1605
095dbb9
 
 
26a1605
095dbb9
26a1605
 
 
23c0ee1
 
26a1605
f764538
26a1605
f764538
26a1605
23c0ee1
f764538
26a1605
 
095dbb9
23c0ee1
d5698f0
f764538
 
 
26a1605
f764538
 
 
 
26a1605
f764538
 
 
 
26a1605
f764538
 
 
26a1605
 
f764538
 
 
 
 
 
 
 
 
 
 
095dbb9
 
 
 
 
 
 
 
 
 
 
 
 
 
f764538
 
 
 
 
d5698f0
23c0ee1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import os
import json
import re
import gradio as gr
import requests

# Hugging Face API details
API_URL = "https://api-inference.huggingface.co/models/numind/NuExtract-1.5"  # hosted inference endpoint for NuExtract-1.5
api_token = os.environ.get("HF_TOKEN", "")  # Get token from environment variable; empty string when unset
headers = {"Authorization": f"Bearer {api_token}"}  # bearer-token header sent with every API request


def query_api(payload):
    """POST *payload* to the Hugging Face inference API and return the parsed JSON.

    On success returns whatever the API sent back (typically a list of
    generation dicts). On any request or JSON-decoding failure returns a
    ``{"error": ...}`` dict instead, so callers never see an exception.
    """
    try:
        # timeout prevents the whole app from hanging on a stalled request;
        # the hosted model can be slow to cold-start, hence the generous value
        response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
        print("API STATUS CODE:", response.status_code)
        print("RAW RESPONSE:", response.text)
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers response.json() decode failures on non-JSON bodies
        return {"error": f"Request failed: {str(e)}"}


def extract_structure(template, text):
    """Extract structured data from *text* according to the JSON *template*.

    Builds a NuExtract prompt, sends it through ``query_api`` and returns a
    3-tuple for the Gradio outputs: (status message, extracted JSON string,
    HTML info snippet). Any failure is reported in the tuple, never raised.
    """
    try:
        prompt = f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"

        api_response = query_api({
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": 2000,
                "temperature": 0.01,
                "return_full_text": True
            }
        })

        # API-level failure reported by query_api
        if isinstance(api_response, dict) and "error" in api_response:
            msg = api_response["error"]
            return f"API Error: {msg}", "{}", f"<p>Error: {msg}</p>"

        # Anything other than a non-empty list is an unexpected shape
        if not (isinstance(api_response, list) and api_response):
            return "⚠️ Unexpected API Response", json.dumps(api_response, indent=2), "<p>Unexpected format.</p>"

        generated = api_response[0].get("generated_text", "")
        print("Generated Text:", generated)

        # Prefer the text after the output marker; otherwise grab the first
        # JSON-looking braced span, falling back to the whole generation.
        if "<|output|>" in generated:
            extracted = generated.split("<|output|>")[-1].strip()
        else:
            braced = re.search(r'({[\s\S]+})', generated)
            extracted = braced.group(1) if braced else generated.strip()

        # Pretty-print when the result parses as JSON; keep raw text otherwise
        try:
            extracted = json.dumps(json.loads(extracted), indent=2)
        except Exception:
            pass

        info_html = f"<p>βœ… Successfully processed input of length {len(text)} characters.</p>"
        return "βœ… Extraction Complete", extracted, info_html

    except Exception as e:
        # Catch-all boundary so the UI always gets a displayable tuple
        return f"❌ Error: {str(e)}", "{}", f"<p>Processing failed: {str(e)}</p>"


# Gradio App — two-column layout: inputs (template + text) on the left,
# results (status, extracted JSON, HTML info) on the right.
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 NuExtract-1.5 Information Extractor")

    # Surface a visible warning in the UI when no API token was configured
    if not api_token:
        gr.Markdown("## ⚠️ No API token found. Please set `HF_TOKEN` in environment variables.")

    with gr.Row():
        with gr.Column():
            # JSON template describing the fields to extract
            template_input = gr.Textbox(
                label="Template (JSON)",
                value='{"name": "", "email": ""}',
                lines=5
            )
            # Free text the model extracts from
            text_input = gr.Textbox(
                label="Input Text",
                value="Contact: John Smith ([email protected])",
                lines=10
            )
            submit_btn = gr.Button("Extract Information")

        with gr.Column():
            # Outputs map 1:1 onto extract_structure's 3-tuple return
            progress_output = gr.Textbox(label="Progress")
            result_output = gr.Textbox(label="Extracted Information")
            html_output = gr.HTML(label="Info")

    submit_btn.click(
        fn=extract_structure,
        inputs=[template_input, text_input],
        outputs=[progress_output, result_output, html_output]
    )

    # Clickable examples: a simple contact card and a model-card extraction
    gr.Examples(
        [
            [
                '{"name": "", "email": ""}',
                'Contact: John Smith ([email protected])'
            ],
            [
                '''{
    "Model": {
        "Name": "",
        "Number of parameters": "",
        "Architecture": []
    },
    "Usage": {
        "Use case": [],
        "License": ""
    }
}''',
                '''We introduce Mistral 7B, a 7-billion-parameter language model engineered for superior performance and efficiency. Mistral 7B outperforms the best open 13B model (Llama 2) across all evaluated benchmarks, and the best released 34B model (Llama 1) in reasoning, mathematics, and code generation. Our model is released under the Apache 2.0 license.'''
            ]
        ],
        [template_input, text_input]
    )


def test_api_connection():
    """Print a startup banner and smoke-test connectivity to the HF API.

    Warns when ``HF_TOKEN`` is missing; otherwise sends a minimal prompt
    and reports whether the response has the expected list shape.
    """
    print("===== Application Startup =====")

    # Guard clause: without a token there is nothing to probe
    if not api_token:
        print("❌ HF_TOKEN not set. Please set your API token.")
        return

    probe = {
        "inputs": "<|input|>\n### Template:\n{\"test\": \"\"}\n### Text:\nHello world\n\n<|output|>",
        "parameters": {"max_new_tokens": 100, "temperature": 0.01},
    }
    reply = query_api(probe)

    # A successful generation comes back as a list of dicts
    if isinstance(reply, list):
        print("βœ… Connection to Hugging Face API successful!")
    else:
        print("⚠️ API may not be returning expected format:", reply)


if __name__ == "__main__":
    test_api_connection()
    demo.launch(debug=True)  # You can add share=True or server_name/port if needed