zhangchenxu committed on
Commit
3dddf25
·
1 Parent(s): 46a7ad8
Files changed (1) hide show
  1. app.py +160 -222
app.py CHANGED
@@ -67,6 +67,74 @@ EXAMPLES = [
67
  }
68
  ]
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  # Verification function
71
  def verify_answer(question, ground_truth, model_answer, temperature, top_p, max_tokens):
72
  if not question or not ground_truth or not model_answer:
@@ -113,246 +181,116 @@ def load_example(example_index):
113
  example["tokens"]
114
  )
115
 
116
- # Custom CSS for better styling
117
- custom_css = """
118
- .gradio-container {
119
- max-width: 1080px !important;
120
- margin: auto !important;
121
- }
122
-
123
- .main-container {
124
- background-color: #f9f9f9;
125
- border-radius: 10px;
126
- padding: 15px;
127
- box-shadow: 0 0 5px rgba(0,0,0,0.1);
128
- }
129
-
130
- .header {
131
- background: linear-gradient(90deg, #4776E6 0%, #8E54E9 100%);
132
- color: white;
133
- border-radius: 8px;
134
- padding: 20px 30px;
135
- margin-bottom: 20px;
136
- text-align: center;
137
- }
138
-
139
- .header h1 {
140
- font-size: 32px !important;
141
- font-weight: 700 !important;
142
- margin-bottom: 5px !important;
143
- }
144
-
145
- .header p {
146
- font-size: 16px !important;
147
- opacity: 0.9;
148
- }
149
-
150
- .example-btn {
151
- min-width: 140px !important;
152
- margin: 5px !important;
153
- background-color: #f0f0f0 !important;
154
- border: 1px solid #ddd !important;
155
- transition: all 0.3s ease !important;
156
- }
157
-
158
- .example-btn:hover {
159
- background-color: #e0e0e0 !important;
160
- transform: translateY(-2px) !important;
161
- box-shadow: 0 4px 6px rgba(0,0,0,0.1) !important;
162
- }
163
-
164
- .verify-btn {
165
- background: linear-gradient(90deg, #4776E6 0%, #8E54E9 100%) !important;
166
- color: white !important;
167
- padding: 12px 20px !important;
168
- font-size: 16px !important;
169
- font-weight: 600 !important;
170
- border-radius: 8px !important;
171
- transition: all 0.3s ease !important;
172
- border: none !important;
173
- width: 100% !important;
174
- margin-top: 10px !important;
175
- }
176
-
177
- .verify-btn:hover {
178
- transform: translateY(-2px) !important;
179
- box-shadow: 0 4px 10px rgba(0,0,0,0.2) !important;
180
- }
181
-
182
- .panel {
183
- background-color: white;
184
- border-radius: 8px;
185
- padding: 15px;
186
- box-shadow: 0 2px 4px rgba(0,0,0,0.05);
187
- margin-bottom: 15px;
188
- }
189
-
190
- .result-panel {
191
- background-color: #f8f9fa;
192
- border-left: 4px solid #4776E6;
193
- padding: 15px;
194
- border-radius: 8px;
195
- min-height: 120px;
196
- }
197
-
198
- .label {
199
- font-weight: 600;
200
- margin-bottom: 5px;
201
- color: #444;
202
- }
203
-
204
- .examples-container {
205
- display: flex;
206
- flex-wrap: wrap;
207
- justify-content: center;
208
- margin-top: 10px;
209
- margin-bottom: 20px;
210
- }
211
-
212
- .footer {
213
- text-align: center;
214
- font-size: 12px;
215
- color: #888;
216
- margin-top: 20px;
217
- }
218
-
219
- /* Make the accordion look better */
220
- .accordions-container .label {
221
- font-size: 14px !important;
222
- font-weight: 600 !important;
223
- }
224
- """
225
-
226
  # Create the Gradio interface with proper component initialization
227
- with gr.Blocks(
228
- theme=gr.themes.Monochrome(
229
- primary_hue="indigo",
230
- secondary_hue="blue",
231
- font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"]
232
- ),
233
- css=custom_css
234
- ) as demo:
235
 
236
  # Define states (invisible components to store values)
237
  temperature = gr.State(value=0.3)
238
  top_p = gr.State(value=0.95)
239
  max_tokens = gr.State(value=2)
240
 
241
- # Main container (using Column with elem_classes instead of Box)
242
- with gr.Column(elem_classes="main-container"):
243
- # Header
244
- with gr.Column(elem_classes="header"):
245
- gr.Markdown(
246
- """
247
- # TinyV - Answer Verification Tool
248
-
249
- Verify if model-generated answers are semantically correct compared to ground truth, even with formatting differences
250
- """
251
- )
252
 
253
- # Main content area
254
- with gr.Row(equal_height=True):
255
- # Left column - Inputs
256
- with gr.Column(scale=3):
257
- # Using Column as a panel
258
- with gr.Column(elem_classes="panel"):
259
- gr.Markdown("### Input", elem_classes="label")
260
-
261
- question = gr.Textbox(
262
- lines=3,
263
- label="Question",
264
- placeholder="Enter the mathematical problem or question here...",
265
- elem_classes="input-field"
266
- )
267
-
268
- with gr.Row():
269
- with gr.Column(scale=1):
270
- ground_truth = gr.Textbox(
271
- lines=3,
272
- label="Ground Truth Answer",
273
- placeholder="Enter the correct answer here...",
274
- elem_classes="input-field"
275
- )
276
-
277
- with gr.Column(scale=1):
278
- model_answer = gr.Textbox(
279
- lines=3,
280
- label="Model Answer",
281
- placeholder="Enter the answer to verify here...",
282
- elem_classes="input-field"
283
- )
284
-
285
- verify_btn = gr.Button("Verify Answer", elem_classes="verify-btn")
286
 
287
- # Right column - Result
288
- with gr.Column(scale=2):
289
- with gr.Column(elem_classes="panel"):
290
- gr.Markdown("### Verification Result", elem_classes="label")
291
- result = gr.Textbox(
292
- placeholder="The verification result will appear here...",
293
- lines=10,
294
- elem_classes="result-panel"
295
- )
296
-
297
- # Examples section
298
- gr.Markdown("### Examples", elem_classes="label")
299
- with gr.Column(elem_classes="examples-container"):
300
  with gr.Row():
301
- for i, ex in enumerate(EXAMPLES):
302
- btn = gr.Button(ex["name"], elem_classes="example-btn")
303
- btn.click(
304
- fn=lambda idx=i: load_example(idx),
305
- outputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens]
306
  )
307
- # Also run verification when example is loaded
308
- btn.click(
309
- fn=verify_answer,
310
- inputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens],
311
- outputs=result
 
312
  )
 
 
313
 
314
- # Advanced Settings in accordion
315
- with gr.Accordion("Advanced Settings", open=False, elem_classes="accordions-container"):
316
- with gr.Row():
317
- temp_slider = gr.Slider(0, 1, value=0.3, step=0.1, label="Temperature")
318
- top_p_slider = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
319
- max_tokens_slider = gr.Slider(1, 128, value=2, step=1, label="Max Tokens")
320
-
321
- # Connect sliders to state values
322
- temp_slider.change(lambda x: x, inputs=[temp_slider], outputs=[temperature])
323
- top_p_slider.change(lambda x: x, inputs=[top_p_slider], outputs=[top_p])
324
- max_tokens_slider.change(lambda x: x, inputs=[max_tokens_slider], outputs=[max_tokens])
325
-
326
- # API usage in accordion
327
- with gr.Accordion("API Usage", open=False, elem_classes="accordions-container"):
328
- gr.Markdown(
329
- """
330
- ```python
331
- from gradio_client import Client
332
-
333
- client = Client("zhangchenxu/TinyV")
334
- result = client.predict(
335
- question="What is the capital of France?",
336
- ground_truth="The capital of France is Paris.",
337
- model_answer="Paris is the capital of France.",
338
- temperature=0.3,
339
- top_p=0.95,
340
- max_tokens=2,
341
- api_name="/verify_answer"
342
- )
343
- print(result)
344
- ```
345
- """
346
  )
347
-
348
- # Footer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  gr.Markdown(
350
  """
351
- Powered by TinyV-1.5B model. This tool verifies semantic equivalence between answers, allowing for different formatting, ordering, notation, and phrasing.
352
- """,
353
- elem_classes="footer"
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  )
355
 
 
 
 
 
 
 
 
356
  # Connect the interface to the verification function
357
  verify_btn.click(
358
  fn=verify_answer,
 
67
  }
68
  ]
69
 
70
+ import gradio as gr
71
+ from huggingface_hub import InferenceClient
72
+
73
+ # Initialize the client with the model
74
+ client = InferenceClient("zhangchenxu/TinyV-1.5B")
75
+
76
+ # The prompt template for the LLM verifier
77
+ LV_PROMPT = """
78
+ You are an AI tasked with identifying false negatives in answer verification. A false negative occurs when a model's answer is essentially correct but is marked as incorrect due to minor discrepancies or formatting issues. Your job is to analyze the given question, ground truth answer, and model answer to determine if the model's answer is actually correct despite appearing different from the ground truth.
79
+
80
+ <question>{question}</question>
81
+
82
+ <ground_truth_answer>{ground_truth}</ground_truth_answer>
83
+
84
+ <model_answer>{model_answer}</model_answer>
85
+
86
+ Return "True" if the model's answer is correct, otherwise return "False".
87
+ """
88
+
89
+ # Example sets for quick testing
90
+ EXAMPLES = [
91
+ {
92
+ "name": "Order-Insensitive",
93
+ "question": "Determine all real values of $x$ for which $(x+8)^{4}=(2 x+16)^{2}$.",
94
+ "ground_truth": "-6,-8,-10",
95
+ "model_answer": "-10, -8, -6",
96
+ "temp": 0.3,
97
+ "top_p": 0.95,
98
+ "tokens": 2
99
+ },
100
+ {
101
+ "name": "Latex Expression",
102
+ "question": "A bag contains 3 green balls, 4 red balls, and no other balls. Victor removes balls randomly from the bag, one at a time, and places them on a table. Each ball in the bag is equally likely to be chosen each time that he removes a ball. He stops removing balls when there are two balls of the same colour on the table. What is the probability that, when he stops, there is at least 1 red ball and at least 1 green ball on the table?",
103
+ "ground_truth": "$\\frac{4}{7}$",
104
+ "model_answer": "4/7",
105
+ "temp": 0.3,
106
+ "top_p": 0.95,
107
+ "tokens": 2
108
+ },
109
+ {
110
+ "name": "Variable Labeling",
111
+ "question": "If $T=x^{2}+\\frac{1}{x^{2}}$, determine the values of $b$ and $c$ so that $x^{6}+\\frac{1}{x^{6}}=T^{3}+b T+c$ for all non-zero real numbers $x$.",
112
+ "ground_truth": "-3,0",
113
+ "model_answer": "b=-3, c=0",
114
+ "temp": 0.3,
115
+ "top_p": 0.95,
116
+ "tokens": 2
117
+ },
118
+ {
119
+ "name": "Paraphrase",
120
+ "question": "Peter has 8 coins, of which he knows that 7 are genuine and weigh the same, while one is fake and differs in weight, though he does not know whether it is heavier or lighter. Peter has access to a balance scale, which shows which side is heavier but not by how much. For each weighing, Peter must pay Vasya one of his coins before the weighing. If Peter pays with a genuine coin, Vasya will provide an accurate result; if a fake coin is used, Vasya will provide a random result. Peter wants to determine 5 genuine coins and ensure that none of these genuine coins are given to Vasya. Can Peter guaranteedly achieve this?",
121
+ "ground_truth": "Petya can guarantee finding 5 genuine coins.",
122
+ "model_answer": "Yes, Peter can guarantee finding 5 genuine coins while ensuring that none of these genuine coins are paid to Vasya.",
123
+ "temp": 0.3,
124
+ "top_p": 0.95,
125
+ "tokens": 2
126
+ },
127
+ {
128
+ "name": "False Example",
129
+ "question": "What is the tallest mountain in the world?",
130
+ "ground_truth": "Mount Everest is the tallest mountain in the world.",
131
+ "model_answer": "K2 is the tallest mountain on Earth.",
132
+ "temp": 0.3,
133
+ "top_p": 0.95,
134
+ "tokens": 2
135
+ }
136
+ ]
137
+
138
  # Verification function
139
  def verify_answer(question, ground_truth, model_answer, temperature, top_p, max_tokens):
140
  if not question or not ground_truth or not model_answer:
 
181
  example["tokens"]
182
  )
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  # Create the Gradio interface with proper component initialization
185
+ with gr.Blocks(title="TinyV - Answer Verification Tool") as demo:
 
 
 
 
 
 
 
186
 
187
  # Define states (invisible components to store values)
188
  temperature = gr.State(value=0.3)
189
  top_p = gr.State(value=0.95)
190
  max_tokens = gr.State(value=2)
191
 
192
+ # Header
193
+ gr.Markdown(
194
+ """
195
+ # TinyV - Answer Verification Tool
 
 
 
 
 
 
 
196
 
197
+ Verify if model-generated answers are semantically correct compared to ground truth, even with formatting differences
198
+ """
199
+ )
200
+
201
+ # Main content area
202
+ with gr.Row():
203
+ # Left column - Inputs
204
+ with gr.Column(scale=3):
205
+ question = gr.Textbox(
206
+ lines=3,
207
+ label="Question",
208
+ placeholder="Enter the mathematical problem or question here..."
209
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  with gr.Row():
212
+ with gr.Column():
213
+ ground_truth = gr.Textbox(
214
+ lines=3,
215
+ label="Ground Truth Answer",
216
+ placeholder="Enter the correct answer here..."
217
  )
218
+
219
+ with gr.Column():
220
+ model_answer = gr.Textbox(
221
+ lines=3,
222
+ label="Model Answer",
223
+ placeholder="Enter the answer to verify here..."
224
  )
225
+
226
+ verify_btn = gr.Button("Verify Answer", variant="primary")
227
 
228
+ # Right column - Result
229
+ with gr.Column(scale=2):
230
+ result = gr.Textbox(
231
+ label="Verification Result",
232
+ placeholder="The verification result will appear here...",
233
+ lines=9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  )
235
+
236
+ # Examples section
237
+ gr.Markdown("### Examples")
238
+ with gr.Row():
239
+ for i, ex in enumerate(EXAMPLES):
240
+ btn = gr.Button(ex["name"])
241
+ btn.click(
242
+ fn=lambda idx=i: load_example(idx),
243
+ outputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens]
244
+ )
245
+ # Also run verification when example is loaded
246
+ btn.click(
247
+ fn=verify_answer,
248
+ inputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens],
249
+ outputs=result,
250
+ queue=False
251
+ )
252
+
253
+ # Advanced Settings in accordion
254
+ with gr.Accordion("Advanced Settings", open=False):
255
+ with gr.Row():
256
+ temp_slider = gr.Slider(0, 1, value=0.3, step=0.1, label="Temperature")
257
+ top_p_slider = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
258
+ max_tokens_slider = gr.Slider(1, 128, value=2, step=1, label="Max Tokens")
259
+
260
+ # Connect sliders to state values
261
+ temp_slider.change(lambda x: x, inputs=[temp_slider], outputs=[temperature])
262
+ top_p_slider.change(lambda x: x, inputs=[top_p_slider], outputs=[top_p])
263
+ max_tokens_slider.change(lambda x: x, inputs=[max_tokens_slider], outputs=[max_tokens])
264
+
265
+ # API usage in accordion
266
+ with gr.Accordion("API Usage", open=False):
267
  gr.Markdown(
268
  """
269
+ ```python
270
+ from gradio_client import Client
271
+
272
+ client = Client("zhangchenxu/TinyV")
273
+ result = client.predict(
274
+ question="What is the capital of France?",
275
+ ground_truth="The capital of France is Paris.",
276
+ model_answer="Paris is the capital of France.",
277
+ temperature=0.3,
278
+ top_p=0.95,
279
+ max_tokens=1,
280
+ api_name="/verify_answer"
281
+ )
282
+ print(result)
283
+ ```
284
+ """
285
  )
286
 
287
+ # Footer
288
+ gr.Markdown(
289
+ """
290
+ Powered by TinyV-1.5B model. This tool verifies semantic equivalence between answers, allowing for different formatting, ordering, notation, and phrasing.
291
+ """
292
+ )
293
+
294
  # Connect the interface to the verification function
295
  verify_btn.click(
296
  fn=verify_answer,