File size: 5,201 Bytes
be196c4
 
095dbb9
 
26a1605
be196c4
095dbb9
 
 
 
 
26a1605
d5698f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
095dbb9
26a1605
f683120
 
 
 
 
26a1605
 
f683120
26a1605
 
095dbb9
d5698f0
095dbb9
f764538
26a1605
f764538
26a1605
095dbb9
 
 
 
26a1605
095dbb9
 
 
26a1605
095dbb9
26a1605
 
095dbb9
f683120
26a1605
 
095dbb9
 
26a1605
 
 
 
 
 
 
 
 
 
 
095dbb9
 
 
26a1605
095dbb9
26a1605
 
 
095dbb9
26a1605
 
f764538
26a1605
f764538
26a1605
 
f764538
26a1605
 
095dbb9
26a1605
 
d5698f0
 
 
f764538
 
 
26a1605
f764538
 
 
 
26a1605
f764538
 
 
 
26a1605
f764538
 
 
26a1605
 
f764538
 
 
 
 
 
 
 
 
 
 
095dbb9
 
 
 
 
 
 
 
 
 
 
 
 
 
f764538
 
 
 
 
 
26a1605
d5698f0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import json
import gradio as gr
import requests
import os
import re

# Hugging Face Inference API configuration.
API_URL = "https://api-inference.huggingface.co/models/numind/NuExtract-1.5"
# The token comes from the HF_TOKEN environment variable. Surrounding
# whitespace (e.g. a trailing newline from a pasted secret) is stripped so it
# cannot end up inside the Authorization header value.
api_token = os.environ.get("HF_TOKEN", "").strip()
# Only send an Authorization header when a token is configured; otherwise the
# request would carry the malformed credential "Bearer " with no token.
headers = {"Authorization": f"Bearer {api_token}"} if api_token else {}


# Test API Connection
def test_api_connection(timeout=10):
    """Probe the Hugging Face model endpoint and print the outcome.

    Sends a single GET request (without credentials) to ``API_URL`` and
    prints whether it succeeded, returned a non-200 status, or failed at the
    network level.

    Args:
        timeout: Seconds to wait for the endpoint before giving up.

    Returns:
        None. The result is only reported on stdout; request errors are
        caught and printed rather than raised.
    """
    try:
        # Without a timeout an unreachable endpoint would block forever.
        # requests' Timeout derives from RequestException, so it is reported
        # by the handler below like any other connection failure.
        response = requests.get(API_URL, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"❌ Connection failed: {str(e)}")
        return

    if response.status_code == 200:
        print("✅ Connection to Hugging Face API successful!")
    else:
        print(f"⚠️ API returned status code {response.status_code}: {response.text}")


# Make the API request
def query_api(payload, timeout=120):
    """POST *payload* to the inference endpoint and return the decoded JSON.

    Args:
        payload: JSON-serialisable request body sent to ``API_URL``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response on success. On failure, a dict of the form
        ``{"error": "<message>"}`` so callers can test for the ``"error"``
        key instead of handling exceptions.
    """
    # Transport problems (DNS, connection refused, timeout, ...) are reported
    # separately from decode problems so the message names the real cause.
    try:
        response = requests.post(
            API_URL, headers=headers, json=payload, timeout=timeout
        )
    except requests.exceptions.RequestException as e:
        print("Error during API call:", e)
        return {"error": f"Request failed: {str(e)}"}

    # Debug logs
    print("API STATUS CODE:", response.status_code)
    print("RAW RESPONSE:", response.text)

    try:
        return response.json()
    except ValueError as e:
        # JSON decode errors raised by response.json() are ValueError
        # subclasses (json.JSONDecodeError / requests' JSONDecodeError).
        print("Error during API call:", e)
        return {"error": f"Could not decode JSON: {str(e)}"}


# Extract structure from the template and text
def extract_structure(template, text):
    """Run NuExtract on *text* with *template* and format the result for the UI.

    Args:
        template: JSON template string describing the fields to extract.
        text: Free text the information should be extracted from.

    Returns:
        A ``(status, result, info_html)`` tuple of strings: a short status
        message, the extracted content (pretty-printed when it parses as
        JSON, otherwise the raw model text), and an HTML snippet with extra
        details. Failures are reported through the same tuple, not raised.
    """
    import html  # local import: only used to escape text embedded in HTML

    try:
        # Format the input the way NuExtract expects it.
        prompt = f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"

        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": 2000,
                "temperature": 0.01,
                "return_full_text": True,
            },
        }

        response = query_api(payload)

        # The API (and query_api itself) signal failure with an "error" key.
        if isinstance(response, dict) and "error" in response:
            error = response["error"]
            return (
                f"API Error: {error}",
                "{}",
                # Escaped because the message originates outside this app and
                # is rendered as HTML.
                f"<p>Error occurred: {html.escape(str(error))}</p>",
            )

        # Anything other than a non-empty list is not a generation result.
        if not (isinstance(response, list) and response):
            return (
                "⚠️ Unexpected API Response",
                json.dumps(response, indent=2),
                "<p>Please check the API response format.</p>",
            )

        output = response[0].get("generated_text", "")
        print("Generated Text:", output)  # Optional debugging

        if "<|output|>" in output:
            # The full text is requested, so the answer is whatever follows
            # the last output marker.
            result = output.split("<|output|>")[-1].strip()
        else:
            # No marker: fall back to the outermost {...} span, if any.
            json_match = re.search(r'({[\s\S]+})', output)
            result = json_match.group(1) if json_match else output.strip()

        # Pretty-print when the model produced valid JSON; otherwise show the
        # raw text unchanged.
        try:
            parsed = json.loads(result)
        except ValueError:
            pass
        else:
            # ensure_ascii=False keeps non-ASCII characters in extracted
            # values readable instead of \uXXXX escapes.
            result = json.dumps(parsed, indent=2, ensure_ascii=False)

        info_html = f"<p>✅ Successfully processed input of length {len(text)} characters.</p>"
        return "✅ Extraction Complete", result, info_html

    except Exception as e:
        # UI boundary: report any unexpected failure to the user instead of
        # letting the callback raise.
        return (
            f"❌ Error: {str(e)}",
            "{}",
            f"<p>Processing failed: {html.escape(str(e))}</p>",
        )


# Gradio Interface
# Default example shown in the input boxes and reused as the first entry of
# gr.Examples. The address uses example.com, a domain reserved for
# documentation, so the "email" field of the template has something to match.
EXAMPLE_TEMPLATE = '{"name": "", "email": ""}'
EXAMPLE_TEXT = "Contact: John Smith (john.smith@example.com)"

with gr.Blocks() as demo:
    gr.Markdown("# 🧠 NuExtract-1.5 Information Extractor")

    # Warn in the page itself when no token was configured.
    if not api_token:
        gr.Markdown("## ⚠️ No API token found. Set `HF_TOKEN` in the Space secrets.")

    # Probe the API once while the UI is being built (this runs when the
    # module is loaded); the outcome is only printed to the console.
    test_api_connection()

    with gr.Row():
        # Left column: inputs.
        with gr.Column():
            template_input = gr.Textbox(
                label="Template (JSON)",
                value=EXAMPLE_TEMPLATE,
                lines=5
            )
            text_input = gr.Textbox(
                label="Input Text",
                value=EXAMPLE_TEXT,
                lines=10
            )
            submit_btn = gr.Button("Extract Information")

        # Right column: the three values returned by extract_structure.
        with gr.Column():
            progress_output = gr.Textbox(label="Progress")
            result_output = gr.Textbox(label="Extracted Information")
            html_output = gr.HTML(label="Info")

    # Output order must match the (status, result, info_html) tuple returned
    # by extract_structure.
    submit_btn.click(
        fn=extract_structure,
        inputs=[template_input, text_input],
        outputs=[progress_output, result_output, html_output]
    )

    gr.Examples(
        [
            [
                EXAMPLE_TEMPLATE,
                EXAMPLE_TEXT
            ],
            [
                '''{
    "Model": {
        "Name": "",
        "Number of parameters": "",
        "Architecture": []
    },
    "Usage": {
        "Use case": [],
        "License": ""
    }
}''',
                '''We introduce Mistral 7B, a 7-billion-parameter language model engineered for superior performance and efficiency. Mistral 7B outperforms the best open 13B model (Llama 2) across all evaluated benchmarks, and the best released 34B model (Llama 1) in reasoning, mathematics, and code generation. Our model is released under the Apache 2.0 license.'''
            ]
        ],
        [template_input, text_input]
    )

# Start the Gradio server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()