File size: 5,712 Bytes
be196c4
cede142
 
 
 
 
095dbb9
cede142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26a1605
cede142
 
 
 
 
 
 
 
26a1605
cede142
 
 
 
 
 
 
 
 
 
 
 
26a1605
cede142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26a1605
cede142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f764538
 
 
cede142
 
 
 
 
 
 
f764538
23c0ee1
cede142
23c0ee1
cede142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import html
import json
import threading
import time
from functools import lru_cache

import gradio as gr
import torch
from transformers import pipeline

# 1. Model Loading with Health Checks
@lru_cache(maxsize=1)
def load_model():
    """Build the NuExtract-1.5 generation pipeline, memoized for the process.

    Runs on CUDA in float16 when a GPU is available, otherwise on CPU with
    the library's default dtype.

    Returns:
        The ``transformers`` pipeline on success, or ``None`` if construction
        raised (the error is printed). Because of ``lru_cache`` the outcome,
        including a ``None`` failure, is computed once and reused; call
        ``load_model.cache_clear()`` to force a retry.
    """
    try:
        print("⚙️ Initializing NuExtract-1.5 model...")
        start_time = time.perf_counter()
        use_cuda = torch.cuda.is_available()

        # NuExtract-1.5 is a decoder-only (causal) model, so it has to be
        # loaded with the "text-generation" task; "text2text-generation"
        # only accepts encoder-decoder architectures and fails to load it.
        model = pipeline(
            "text-generation",
            model="numind/NuExtract-1.5",
            device="cuda" if use_cuda else "cpu",
            torch_dtype=torch.float16 if use_cuda else None
        )

        load_time = round(time.perf_counter() - start_time, 2)
        print(f"✅ Model loaded successfully in {load_time}s")
        return model
    except Exception as e:
        # Top-level boundary: report and degrade to "no model" so the UI can
        # still start and show a clear error instead of crashing on launch.
        print(f"❌ Model loading failed: {e}")
        return None

# 2. Warm Start Mechanism
def keep_model_warm():
    """Send one tiny request to the loaded pipeline so it stays warm.

    Best-effort: does nothing when no model is loaded, and a failed ping is
    reported on stdout instead of being raised, so a caller that invokes this
    repeatedly is not killed by a transient inference error.
    """
    # ``extractor`` is only bound when the file runs as a script; look it up
    # dynamically so importing this module and calling the function is safe.
    model = globals().get("extractor")
    if not model:
        return
    try:
        # Ask for a single new token: cheapest possible forward pass.
        # (``max_length=1`` would be shorter than the prompt itself.)
        model("ping", max_new_tokens=1)
    except Exception as e:
        print(f"⚠️ Keep-warm ping failed: {e}")

# 3. Processing Function with Streamed Output
def extract_structure(template, text):
    """Stream extraction progress and the final result to the Gradio UI.

    Generator used as a Gradio event handler. Every yielded item is a
    3-tuple ``(status_text, json_value, html_fragment)`` matching the
    status / JSON / HTML output components. ``json_value`` is ``None`` until
    a result exists, because the JSON component treats strings as JSON text
    and an empty string is not parseable.

    Args:
        template: JSON object (as text) describing the fields to extract.
            Blank means an empty template.
        text: Document text to extract from.

    Yields:
        A single error tuple when the input is empty, the template is not a
        JSON object, or no model is loaded; otherwise the progress messages
        followed by either the formatted extraction or a processing error.
    """
    # Input validation
    if not (text or "").strip():
        yield "❌ Error: Empty input text", None, "<p style='color:red'>Please enter text to analyze</p>"
        return

    try:
        template_data = json.loads(template) if (template or "").strip() else {}
    except json.JSONDecodeError:
        yield "❌ Error: Invalid JSON template", None, "<p style='color:red'>Malformed JSON template</p>"
        return
    if not isinstance(template_data, dict):
        yield "❌ Error: Invalid JSON template", None, "<p style='color:red'>Template must be a JSON object</p>"
        return

    # Fail fast, before the progress messages, when there is nothing to run.
    # ``extractor`` is only bound when the file runs as a script.
    model = globals().get("extractor")
    if model is None:
        yield "❌ Error: Model is not loaded", None, "<p style='color:red'>Model is unavailable; check the server logs</p>"
        return

    # Processing stages (cosmetic progress messages shown before inference)
    stages = (
        ("🔍 Initializing model...", 0.5),
        ("📖 Parsing document structure...", 1.2),
        ("🔄 Matching template fields...", 0.8),
        ("✨ Finalizing extraction...", 0.3),
    )
    for msg, delay in stages:
        yield msg, None, ""
        time.sleep(delay)

    try:
        # The template belongs in the prompt, not in the generation kwargs:
        # splatting it into the call hands unknown arguments to ``generate``.
        # Prompt layout follows the NuExtract-1.5 model card.
        prompt = (
            "<|input|>\n### Template:\n"
            f"{json.dumps(template_data, indent=4)}\n"
            f"### Text:\n{text}\n\n<|output|>"
        )

        # Actual inference. Greedy decoding keeps extraction deterministic;
        # ``max_new_tokens`` bounds the answer without counting the prompt.
        generated = model(
            prompt,
            max_new_tokens=512,
            num_return_sequences=1,
            do_sample=False,
            return_full_text=False
        )[0]['generated_text']

        # Belt and braces: drop an echoed prompt if the pipeline returned one.
        answer = generated.split("<|output|>")[-1].strip()

        # Format output
        formatted_json = json.dumps(json.loads(answer), indent=2, ensure_ascii=False)
        # The JSON originates from untrusted document text: escape it before
        # embedding it in markup.
        html_output = f"""
        <div style='
            padding: 15px;
            background: #f8f9fa;
            border-radius: 8px;
            border-left: 4px solid #4CAF50;
            margin-top: 10px;
        '>
            <h3 style='margin-top:0'>Extracted Data</h3>
            <pre style='white-space: pre-wrap'>{html.escape(formatted_json)}</pre>
        </div>
        """

        yield "✅ Extraction complete", formatted_json, html_output

    except Exception as e:
        # UI boundary: surface any inference / parsing failure to the user.
        error_msg = f"❌ Processing error: {e}"
        yield error_msg, None, f"<p style='color:red'>{html.escape(error_msg)}</p>"

# 4. Gradio Interface
# Two-column layout: template + document inputs on the left, status / JSON /
# HTML results on the right, with Extract and Clear buttons underneath.
with gr.Blocks(theme=gr.themes.Soft(), title="NuExtract 1.5") as demo:
    # Header
    gr.Markdown("""
    <div style='text-align:center'>
        <h1>🧠 NuExtract-1.5</h1>
        <p>Advanced Information Extraction System</p>
    </div>
    """)
    
    # Main layout
    with gr.Row():
        # Input Column
        with gr.Column(scale=1, min_width=400):
            gr.Markdown("### 📥 Input")
            template_input = gr.Textbox(
                label="Extraction Template (JSON)",
                value='{"fields": ["name", "email", "phone"]}',
                lines=5
            )
            text_input = gr.TextArea(
                label="Document Text",
                placeholder="John Smith (john.smith@example.com) called regarding order #12345...",
                lines=12
            )
            # Each example is [template, document text]; the first one must
            # actually contain an e-mail address for the "email" field.
            gr.Examples(
                examples=[
                    [
                        '{"fields": ["name", "email"]}',
                        "Please contact Dr. Sarah Johnson at sarah.johnson@example.com"
                    ],
                    [
                        '{"fields": ["product", "price"]}',
                        "The new MacBook Pro costs $1,299 at our store"
                    ]
                ],
                inputs=[template_input, text_input],
                label="Try Examples:"
            )
        
        # Output Column
        with gr.Column(scale=1, min_width=500):
            gr.Markdown("### 📤 Results")
            status = gr.Textbox(
                label="Status",
                value="🟢 System Ready",
                interactive=False
            )
            json_output = gr.JSON(
                label="Structured Output",
                interactive=False
            )
            html_output = gr.HTML(
                label="Formatted View",
                value="<div style='min-height:200px'></div>"
            )
    
    # Controls
    submit_btn = gr.Button("Extract Information", variant="primary")
    clear_btn = gr.Button("Clear")
    
    # Event handlers
    # extract_structure is a generator, so each yielded tuple updates the
    # three outputs in order.
    submit_btn.click(
        fn=extract_structure,
        inputs=[template_input, text_input],
        outputs=[status, json_output, html_output]
    )
    
    # Reset inputs and results. The JSON output is cleared with None (an
    # empty string is not valid JSON for that component) and the status is
    # restored so a stale "complete"/error message does not linger.
    clear_btn.click(
        fn=lambda: ["", "", "🟢 System Ready", None, "<div></div>"],
        inputs=[],
        outputs=[template_input, text_input, status, json_output, html_output]
    )

# 5. Launch Configuration
if __name__ == "__main__":
    # Initialize model (None if loading failed; the handlers check for that)
    extractor = load_model()

    def _keep_alive_loop(interval_seconds=300):
        """Ping the model at a fixed interval for the life of the process."""
        # Sleep between pings: an un-throttled loop would pin the CPU/GPU
        # with back-to-back inference and starve real requests.
        while True:
            time.sleep(interval_seconds)
            keep_model_warm()

    # Start keep-alive thread (daemon, so it never blocks interpreter exit)
    threading.Thread(
        target=_keep_alive_loop,
        name="model-keep-alive",
        daemon=True
    ).start()

    # Launch app
    # NOTE(review): favicon_path is given a URL here; confirm that Gradio
    # accepts a remote URL rather than only a local file path.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        share=False,
        favicon_path="https://huggingface.co/favicon.ico"
    )