oberbics committed
Commit 39ee1aa · verified · 1 Parent(s): be096d1

Update app.py

Files changed (1)
  1. app.py +67 -122
app.py CHANGED
@@ -1,161 +1,106 @@
 import gradio as gr
 import torch
 import json
-import re
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from itertools import cycle
-from urllib.parse import unquote

-# Load model
-model_name = "numind/NuExtract-1.5"
-try:
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        device_map="auto",
-        torch_dtype=torch.float16,
-        trust_remote_code=True
-    )
-    MODEL_LOADED = True
-except Exception as e:
-    MODEL_LOADED = False
-    print(f"Model loading failed: {e}")
-
-# Extract leaf values from JSON (simplified)
-def extract_leaves(json_data):
-    leaves = []
-
-    def _extract(data, path=None):
-        if path is None:
-            path = []
-
-        if isinstance(data, dict):
-            for key, value in data.items():
-                new_path = path + [key]
-                if isinstance(value, (dict, list)):
-                    _extract(value, new_path)
-                elif value and isinstance(value, str) and len(value.strip()) > 0:
-                    leaves.append((new_path, value))
-        elif isinstance(data, list):
-            for i, item in enumerate(data):
-                new_path = path + [i]
-                if isinstance(item, (dict, list)):
-                    _extract(item, new_path)
-                elif item and isinstance(item, str) and len(item.strip()) > 0:
-                    leaves.append((new_path, item))
-
-    _extract(json_data)
-    return leaves
-
-# Highlight words in text
-def highlight_words(input_text, json_output):
-    colors = cycle(["#90ee90", "#add8e6", "#ffb6c1", "#ffff99", "#ffa07a"])
-    color_map = {}
-    highlighted_text = input_text
-
-    leaves = extract_leaves(json_output)
-    for path, value in leaves:
-        path_key = tuple(path)
-        if path_key not in color_map:
-            color_map[path_key] = next(colors)
-        color = color_map[path_key]
-
-        try:
-            escaped_value = re.escape(value).replace(r'\ ', r'\s+')
-            pattern = rf"(?<=[ \n\t]){escaped_value}(?=[ \n\t\.\,\?\:\;])"
-            replacement = f"<span style='background-color: {color};'>{unquote(value)}</span>"
-            highlighted_text = re.sub(pattern, replacement, highlighted_text, flags=re.IGNORECASE)
-        except:
-            # Skip highlighting if regex fails
-            pass
-
-    return highlighted_text
+# Simple test function to debug button clicks
+def test_function(template, text):
+    print(f"Function called with template: {template[:30]} and text: {text[:30]}")
+    return "Button clicked successfully", "Function was called"

-# Process function
-def extract_structure(template, text, size="4000"):
-    if not MODEL_LOADED:
-        return "❌ Model not loaded", "{}", "<p style='color:red'>Model failed to initialize</p>"
-
+# Real extraction function
+def extract_info(template, text):
     try:
-        # Get window size
-        window_size = 4000
-        if isinstance(size, str) and size.isdigit():
-            window_size = min(int(size), 10000) # Cap at 10k
-
-        # Format the input (simplified version without sliding window)
+        # Format prompt according to NuExtract-1.5 requirements
         prompt = f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"

-        # Generate prediction
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
+        # Tokenize
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+        # Generate
+        print("Generating output...")
         outputs = model.generate(
-            **inputs,
-            max_new_tokens=2000, # Reduced for testing
+            **inputs,
+            max_new_tokens=1000,
             do_sample=False
         )
+
+        # Decode and extract result
+        print("Decoding output...")
         result = tokenizer.decode(outputs[0], skip_special_tokens=True)

-        # Extract JSON result
+        # Split at output marker
         if "<|output|>" in result:
             json_text = result.split("<|output|>")[1].strip()
         else:
-            json_text = result.strip()
-
-        # Try to parse and format JSON
-        json_data = json.loads(json_text)
-        formatted_json = json.dumps(json_data, indent=2)
+            json_text = result

-        # Create highlighted version
-        html_content = highlight_words(text, json_data)
+        # Try to parse as JSON
+        print("Parsing JSON...")
+        extracted = json.loads(json_text)
+        formatted = json.dumps(extracted, indent=2)

-        return "✅ Success", formatted_json, html_content
+        return "✅ Success", formatted
     except Exception as e:
-        return f"Error: {str(e)}", "{}", f"<p style='color:red'>{str(e)}</p>"
+        print(f"Error: {str(e)}")
+        return f"❌ Error: {str(e)}", "{}"
+
+# Load model
+try:
+    print("Loading model...")
+    model_name = "numind/NuExtract-1.5"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype=torch.float16,
+        device_map="auto",
+        trust_remote_code=True
+    )
+    print("Model loaded successfully")
+except Exception as e:
+    print(f"Model loading error: {e}")
+    # Create dummy function for testing UI
+    def extract_info(template, text):
+        return "Model failed to load", "Cannot process request"

-# Create interface
+# Create a very simple interface
 with gr.Blocks() as demo:
-    gr.Markdown("# NuExtract-1.5 Structured Data Extractor")
+    gr.Markdown("# NuExtract-1.5 Extraction Tool")

     with gr.Row():
         with gr.Column():
             template = gr.Textbox(
-                label="Template (JSON)",
+                label="JSON Template",
                 value='{"name": "", "email": ""}',
                 lines=5
             )
-            text = gr.TextArea(
-                label="Input Text",
+            text = gr.Textbox(
+                label="Text to Extract From",
                 value="Contact: John Smith ([email protected])",
-                lines=10
+                lines=8
             )
-            size = gr.Textbox(
-                label="Window Size",
-                value="4000",
-                visible=True
-            )
-            btn = gr.Button("Extract", variant="primary")
+
+            # Two buttons for testing
+            test_btn = gr.Button("Test Click")
+            extract_btn = gr.Button("Extract Information", variant="primary")

         with gr.Column():
             status = gr.Textbox(label="Status")
-            json_out = gr.Textbox(label="Extracted JSON", lines=10)
-            html_out = gr.HTML(label="Highlighted Text")
+            output = gr.Textbox(label="Output", lines=10)

-    # Connect the button
-    btn.click(
-        fn=extract_structure,
-        inputs=[template, text, size],
-        outputs=[status, json_out, html_out]
+    # Connect both buttons to verify functionality
+    test_btn.click(
+        fn=test_function,
+        inputs=[template, text],
+        outputs=[status, output]
     )

-    # Add examples that match format
-    gr.Examples(
-        [
-            [
-                '{"name": "", "email": ""}',
-                'Contact: John Smith ([email protected])',
-                "4000"
-            ]
-        ],
-        [template, text, size]
+    extract_btn.click(
+        fn=extract_info,
+        inputs=[template, text],
+        outputs=[status, output]
     )

-demo.launch()
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()
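
For reference, the new extract_info path boils down to two steps: build the NuExtract-1.5 prompt (<|input|>, ### Template, ### Text, <|output|>) and parse whatever follows the <|output|> marker as JSON. Below is a minimal sketch of that flow with the model call stubbed out; build_prompt, parse_output, and the decoded string are illustrative and not part of app.py.

import json

# Same prompt layout that extract_info sends to the model.
def build_prompt(template, text):
    return f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"

# Same post-processing: keep what follows <|output|>, then pretty-print the JSON.
def parse_output(decoded):
    json_text = decoded.split("<|output|>")[1].strip() if "<|output|>" in decoded else decoded
    return json.dumps(json.loads(json_text), indent=2)

template = '{"name": "", "email": ""}'
text = "Contact: John Smith ([email protected])"
prompt = build_prompt(template, text)

# Hypothetical decoded model output: the app's split on <|output|> assumes the decoded text echoes the prompt.
decoded = prompt + '\n{"name": "John Smith", "email": "[email protected]"}'
print(parse_output(decoded))

With the model actually loaded, python app.py wires this same logic behind the "Extract Information" button, while "Test Click" only confirms that the Gradio callback fires.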