zhangchenxu committed on
Commit
fbc8f04
·
1 Parent(s): 8459fca
Files changed (1) hide show
  1. app.py +133 -38
app.py CHANGED
@@ -1,9 +1,11 @@
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
 
3
 
4
- # Initialize the client
5
  client = InferenceClient("zhangchenxu/TinyV-1.5B")
6
 
 
7
  LV_PROMPT = """
8
  You are an AI tasked with identifying false negatives in answer verification. A false negative occurs when a model's answer is essentially correct but is marked as incorrect due to minor discrepancies or formatting issues. Your job is to analyze the given question, ground truth answer, and model answer to determine if the model's answer is actually correct despite appearing different from the ground truth.
9
 
@@ -25,7 +27,7 @@ EXAMPLES = [
25
  "model_answer": "-10, -8, -6",
26
  "temp": 0.3,
27
  "top_p": 0.95,
28
- "tokens": 2
29
  },
30
  {
31
  "name": "Latex Expression",
@@ -34,7 +36,7 @@ EXAMPLES = [
34
  "model_answer": "4/7",
35
  "temp": 0.3,
36
  "top_p": 0.95,
37
- "tokens": 2
38
  },
39
  {
40
  "name": "Variable Labeling",
@@ -43,7 +45,7 @@ EXAMPLES = [
43
  "model_answer": "b=-3, c=0",
44
  "temp": 0.3,
45
  "top_p": 0.95,
46
- "tokens": 2
47
  },
48
  {
49
  "name": "Paraphrase",
@@ -52,7 +54,7 @@ EXAMPLES = [
52
  "model_answer": "Yes, Peter can guarantee finding 5 genuine coins while ensuring that none of these genuine coins are paid to Vasya.",
53
  "temp": 0.3,
54
  "top_p": 0.95,
55
- "tokens": 2
56
  },
57
  {
58
  "name": "False Example",
@@ -61,19 +63,27 @@ EXAMPLES = [
61
  "model_answer": "K2 is the tallest mountain on Earth.",
62
  "temp": 0.3,
63
  "top_p": 0.95,
64
- "tokens": 2
65
  }
66
  ]
67
 
 
68
  def verify_answer(question, ground_truth, model_answer, temperature, top_p, max_tokens):
 
69
  prompt = LV_PROMPT.format(
70
  question=question,
71
  ground_truth=ground_truth,
72
  model_answer=model_answer
73
  )
 
 
74
  messages = [{"role": "user", "content": prompt}]
 
 
75
  response_text = ""
 
76
  try:
 
77
  for message in client.chat_completion(
78
  messages,
79
  max_tokens=max_tokens,
@@ -88,6 +98,7 @@ def verify_answer(question, ground_truth, model_answer, temperature, top_p, max_
88
  except Exception as e:
89
  yield f"Error: {str(e)}"
90
 
 
91
  def load_example(example_index):
92
  example = EXAMPLES[example_index]
93
  return (
@@ -99,44 +110,128 @@ def load_example(example_index):
99
  example["tokens"]
100
  )
101
 
102
- with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
103
- gr.Markdown("## 🧠 TinyV - Answer Verification Tool\nThis tool verifies model-generated answers for correctness.")
104
-
105
- # ✅ Define sliders first so they can be referenced later
106
- temperature = gr.Slider(0, 1, value=0.3, step=0.1, label="Temperature", visible=False)
107
- top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p", visible=False)
108
- max_tokens = gr.Slider(1, 128, value=2, step=1, label="Max Tokens", visible=False)
109
-
110
- with gr.Row():
111
- with gr.Column(scale=1):
112
- question = gr.Textbox(lines=3, label="📘 Question")
113
- ground_truth = gr.Textbox(lines=3, label="✅ Ground Truth Answer")
114
- model_answer = gr.Textbox(lines=3, label="🤖 Model Answer")
115
-
116
- gr.Markdown("### 🔍 Try Examples:")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  with gr.Row():
118
- for i, ex in enumerate(EXAMPLES):
119
- btn = gr.Button(ex["name"], size="sm")
120
- btn.click(
121
- fn=lambda idx=i: load_example(idx),
122
- outputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens]
123
- )
124
-
125
- with gr.Column(scale=1):
126
- with gr.Accordion("⚙️ Advanced Settings", open=False):
127
- temperature.visible = True
128
- top_p.visible = True
129
- max_tokens.visible = True
130
-
131
- verify_btn = gr.Button("✅ Verify Answer", variant="primary")
132
- result = gr.Textbox(label="🧾 Verification Result", lines=5, placeholder="Result will appear here...")
133
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  verify_btn.click(
135
  fn=verify_answer,
136
  inputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens],
137
  outputs=result
138
  )
139
 
 
140
  demo.queue()
 
 
141
  if __name__ == "__main__":
142
- demo.launch()
 
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
3
+ import time
4
 
5
+ # Initialize the client with your model
6
  client = InferenceClient("zhangchenxu/TinyV-1.5B")
7
 
8
+ # The prompt template for the LLM verifier
9
  LV_PROMPT = """
10
  You are an AI tasked with identifying false negatives in answer verification. A false negative occurs when a model's answer is essentially correct but is marked as incorrect due to minor discrepancies or formatting issues. Your job is to analyze the given question, ground truth answer, and model answer to determine if the model's answer is actually correct despite appearing different from the ground truth.
11
 
 
27
  "model_answer": "-10, -8, -6",
28
  "temp": 0.3,
29
  "top_p": 0.95,
30
+ "tokens": 1
31
  },
32
  {
33
  "name": "Latex Expression",
 
36
  "model_answer": "4/7",
37
  "temp": 0.3,
38
  "top_p": 0.95,
39
+ "tokens": 1
40
  },
41
  {
42
  "name": "Variable Labeling",
 
45
  "model_answer": "b=-3, c=0",
46
  "temp": 0.3,
47
  "top_p": 0.95,
48
+ "tokens": 1
49
  },
50
  {
51
  "name": "Paraphrase",
 
54
  "model_answer": "Yes, Peter can guarantee finding 5 genuine coins while ensuring that none of these genuine coins are paid to Vasya.",
55
  "temp": 0.3,
56
  "top_p": 0.95,
57
+ "tokens": 1
58
  },
59
  {
60
  "name": "False Example",
 
63
  "model_answer": "K2 is the tallest mountain on Earth.",
64
  "temp": 0.3,
65
  "top_p": 0.95,
66
+ "tokens": 1
67
  }
68
  ]
69
 
70
+ # Verification function
71
  def verify_answer(question, ground_truth, model_answer, temperature, top_p, max_tokens):
72
+ # Format the prompt with user inputs
73
  prompt = LV_PROMPT.format(
74
  question=question,
75
  ground_truth=ground_truth,
76
  model_answer=model_answer
77
  )
78
+
79
+ # Prepare messages for the API
80
  messages = [{"role": "user", "content": prompt}]
81
+
82
+ # Initialize response
83
  response_text = ""
84
+
85
  try:
86
+ # Stream the response for better UX
87
  for message in client.chat_completion(
88
  messages,
89
  max_tokens=max_tokens,
 
98
  except Exception as e:
99
  yield f"Error: {str(e)}"
100
 
101
+ # Function to load an example
102
  def load_example(example_index):
103
  example = EXAMPLES[example_index]
104
  return (
 
110
  example["tokens"]
111
  )
112
 
113
+ # Create the Gradio interface
114
+ with gr.Blocks(
115
+ theme=gr.themes.Soft(
116
+ primary_hue="blue",
117
+ font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"]
118
+ ),
119
+ css="""
120
+ .container { max-width: 1000px; margin: auto; }
121
+ .example-btn { min-width: 140px; }
122
+ .title { text-align: center; margin-bottom: 1rem; }
123
+ .result-box { min-height: 100px; }
124
+ """
125
+ ) as demo:
126
+ # Header
127
+ with gr.Group(elem_classes="container"):
128
+ gr.Markdown(
129
+ """
130
+ # 🧠 TinyV - Answer Verification Tool
131
+
132
+ This tool verifies if a model-generated answer is semantically correct compared to a ground truth answer using a fine-tuned LLM.
133
+ """,
134
+ elem_classes="title"
135
+ )
136
+
137
+ # Main input area
138
+ with gr.Row(equal_height=True):
139
+ # Left column - Inputs
140
+ with gr.Column():
141
+ question = gr.Textbox(
142
+ lines=3,
143
+ label="📘 Question",
144
+ placeholder="Enter the question here..."
145
+ )
146
+ ground_truth = gr.Textbox(
147
+ lines=3,
148
+ label="✅ Ground Truth Answer",
149
+ placeholder="Enter the correct answer here..."
150
+ )
151
+ model_answer = gr.Textbox(
152
+ lines=3,
153
+ label="🤖 Model Answer",
154
+ placeholder="Enter the answer to verify here..."
155
+ )
156
+
157
+ # Right column - Result and verification button
158
+ with gr.Column():
159
+ verify_btn = gr.Button("✅ Verify Answer", variant="primary", size="lg")
160
+ result = gr.Textbox(
161
+ label="🧾 Verification Result",
162
+ placeholder="Result will appear here...",
163
+ lines=9,
164
+ elem_classes="result-box"
165
+ )
166
+
167
+ # Examples section
168
+ gr.Markdown("### 🔍 Try Examples:")
169
+ with gr.Row() as example_row:
170
+ for i, ex in enumerate(EXAMPLES):
171
+ btn = gr.Button(ex["name"], size="sm", elem_classes="example-btn")
172
+ btn.click(
173
+ fn=lambda idx=i: load_example(idx),
174
+ outputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens]
175
+ )
176
+ # Also run verification when example is loaded
177
+ btn.click(
178
+ fn=verify_answer,
179
+ inputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens],
180
+ outputs=result,
181
+ queue=False
182
+ )
183
+
184
+ # Advanced Settings (hidden at the bottom)
185
+ with gr.Accordion("⚙️ Advanced Settings", open=False):
186
  with gr.Row():
187
+ temperature = gr.Slider(0, 1, value=0.3, step=0.1, label="Temperature")
188
+ top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
189
+ max_tokens = gr.Slider(1, 128, value=1, step=1, label="Max Tokens")
190
+
191
+ # About section
192
+ with gr.Accordion("ℹ️ About This Tool", open=False):
193
+ gr.Markdown(
194
+ """
195
+ ### What This Tool Does
196
+
197
+ This verification tool uses the TinyV-1.5B model to determine if answers are semantically equivalent,
198
+ even when they have different:
199
+
200
+ - **Formatting** (LaTeX vs. plain text, spacing, etc.)
201
+ - **Ordering** (e.g., listing items in different orders)
202
+ - **Phrasing** (paraphrases with the same meaning)
203
+ - **Variable labeling** (with or without variable names)
204
+
205
+ ### API Usage
206
+
207
+ ```python
208
+ from gradio_client import Client
209
+
210
+ client = Client("zhangchenxu/TinyV")
211
+ result = client.predict(
212
+ question="What is the capital of France?",
213
+ ground_truth="The capital of France is Paris.",
214
+ model_answer="Paris is the capital of France.",
215
+ temperature=0.3,
216
+ top_p=0.95,
217
+ max_tokens=2,
218
+ api_name="/verify_answer"
219
+ )
220
+ print(result)
221
+ ```
222
+ """
223
+ )
224
+
225
+ # Connect the interface to the verification function
226
  verify_btn.click(
227
  fn=verify_answer,
228
  inputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens],
229
  outputs=result
230
  )
231
 
232
+ # Define the public API
233
  demo.queue()
234
+
235
+ # Launch the app
236
  if __name__ == "__main__":
237
+ demo.launch()