zhangchenxu committed on
Commit
b17581e
·
1 Parent(s): cf3d2b0
Files changed (1) hide show
  1. app.py +90 -70
app.py CHANGED
@@ -18,6 +18,55 @@ You are an AI tasked with identifying false negatives in answer verification. A
18
  Return "True" if the model's answer is correct, otherwise return "False".
19
  """
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # Main verification function
22
  def verify_answer(question, ground_truth, model_answer, temperature, top_p, max_tokens):
23
  # Format the prompt with user inputs
@@ -51,16 +100,28 @@ def verify_answer(question, ground_truth, model_answer, temperature, top_p, max_
51
  except Exception as e:
52
  yield f"Error: {str(e)}"
53
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  # Create the Gradio interface
55
- with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"]), title="Answer Verification Tool") as demo:
56
  # Header with title and description
57
  with gr.Row():
58
  with gr.Column():
59
  gr.Markdown(
60
  """
61
- # Answer Verification Tool
62
 
63
- This tool verifies if an answer is correct compared to a ground truth answer, even if there are minor differences in formatting or wording.
64
  """
65
  )
66
 
@@ -79,7 +140,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", font=[gr.themes.GoogleFo
79
 
80
  ### What this tool does
81
 
82
- This tool determines if a model's answer is semantically correct compared to a ground truth answer, even if there are minor discrepancies in formatting or wording.
83
 
84
  The model analyzes both answers and returns:
85
  - **True** if the model answer is correct
@@ -91,9 +152,9 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", font=[gr.themes.GoogleFo
91
 
92
  client = Client("zhangchenxu/TinyV")
93
  result = client.predict(
94
- question="What is the capital of France?",
95
- ground_truth="The capital of France is Paris.",
96
- model_answer="Paris is the capital of France.",
97
  temperature=0.3,
98
  top_p=0.95,
99
  max_tokens=1,
@@ -108,7 +169,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", font=[gr.themes.GoogleFo
108
  with gr.Accordion("Advanced Settings", open=False):
109
  temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.1, label="Temperature")
110
  top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
111
- max_tokens = gr.Slider(minimum=32, maximum=512, value=1, step=32, label="Max Tokens")
112
 
113
  with gr.Column(scale=1):
114
  gr.Markdown("## Input")
@@ -116,6 +177,19 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", font=[gr.themes.GoogleFo
116
  ground_truth = gr.Textbox(lines=5, label="Ground Truth Answer", placeholder="Enter the correct answer here...")
117
  model_answer = gr.Textbox(lines=5, label="Model Answer", placeholder="Enter the answer to verify here...")
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  verify_btn = gr.Button("Verify Answer", variant="primary")
120
 
121
  gr.Markdown("## Result")
@@ -128,68 +202,14 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", font=[gr.themes.GoogleFo
128
  outputs=result
129
  )
130
 
131
- # Examples section - improved styling
132
- with gr.Accordion("Examples", open=True):
133
- with gr.Row():
134
- gr.Examples(
135
- examples=[
136
- [
137
- "What is the capital of France?",
138
- "The capital of France is Paris.",
139
- "Paris is the capital of France.",
140
- 0.3,
141
- 0.95,
142
- 2,
143
- ],
144
- [
145
- "What is 2+2?",
146
- "4",
147
- "Four.",
148
- 0.3,
149
- 0.95,
150
- 2,
151
- ],
152
- ],
153
- inputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens],
154
- outputs=result,
155
- fn=verify_answer,
156
- cache_examples=True,
157
- )
158
-
159
- with gr.Row():
160
- gr.Examples(
161
- examples=[
162
- [
163
- "When was the Declaration of Independence signed?",
164
- "July 4, 1776",
165
- "The Declaration of Independence was signed on July 4th, 1776.",
166
- 0.3,
167
- 0.95,
168
- 2,
169
- ],
170
- [
171
- "List the first three planets from the sun.",
172
- "Mercury, Venus, Earth",
173
- "The first three planets from the sun are Mercury, Venus, and Earth.",
174
- 0.3,
175
- 0.95,
176
- 2,
177
- ],
178
- ],
179
- inputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens],
180
- outputs=result,
181
- fn=verify_answer,
182
- cache_examples=True,
183
- )
184
-
185
- # Add footer with extra information
186
- with gr.Row(equal_height=True):
187
- gr.Markdown(
188
- """
189
- ### About
190
- This tool uses the zhangchenxu/TinyV-1.5B model to verify the correctness of the answers,
191
- allowing for different phrasings of correct answers.
192
- """
193
  )
194
 
195
  # Define the public API
 
18
  Return "True" if the model's answer is correct, otherwise return "False".
19
  """
20
 
21
+ # Define our example sets
22
+ EXAMPLES = [
23
+ {
24
+ "name": "Order-Insensitive",
25
+ "question": "Determine all real values of $x$ for which $(x+8)^{4}=(2 x+16)^{2}$.",
26
+ "ground_truth": "-6,-8,-10",
27
+ "model_answer": "-10, -8, -6",
28
+ "temp": 0.3,
29
+ "top_p": 0.95,
30
+ "tokens": 2
31
+ },
32
+ {
33
+ "name": "Latex Expression",
34
+ "question": "A bag contains 3 green balls, 4 red balls, and no other balls. Victor removes balls randomly from the bag, one at a time, and places them on a table. Each ball in the bag is equally likely to be chosen each time that he removes a ball. He stops removing balls when there are two balls of the same colour on the table. What is the probability that, when he stops, there is at least 1 red ball and at least 1 green ball on the table?",
35
+ "ground_truth": "$\\frac{4}{7}$",
36
+ "model_answer": "4/7",
37
+ "temp": 0.3,
38
+ "top_p": 0.95,
39
+ "tokens": 2
40
+ },
41
+ {
42
+ "name": "Variable Labeling",
43
+ "question": "If $T=x^{2}+\\frac{1}{x^{2}}$, determine the values of $b$ and $c$ so that $x^{6}+\\frac{1}{x^{6}}=T^{3}+b T+c$ for all non-zero real numbers $x$.",
44
+ "ground_truth": "-3,0",
45
+ "model_answer": "b=-3, c=0",
46
+ "temp": 0.3,
47
+ "top_p": 0.95,
48
+ "tokens": 2
49
+ },
50
+ {
51
+ "name": "Paraphrase",
52
+ "question": "Peter has 8 coins, of which he knows that 7 are genuine and weigh the same, while one is fake and differs in weight, though he does not know whether it is heavier or lighter. Peter has access to a balance scale, which shows which side is heavier but not by how much. For each weighing, Peter must pay Vasya one of his coins before the weighing. If Peter pays with a genuine coin, Vasya will provide an accurate result; if a fake coin is used, Vasya will provide a random result. Peter wants to determine 5 genuine coins and ensure that none of these genuine coins are given to Vasya. Can Peter guaranteedly achieve this?",
53
+ "ground_truth": "Petya can guarantee finding 5 genuine coins.",
54
+ "model_answer": "Yes, Peter can guarantee finding 5 genuine coins while ensuring that none of these genuine coins are paid to Vasya.",
55
+ "temp": 0.3,
56
+ "top_p": 0.95,
57
+ "tokens": 2
58
+ },
59
+ {
60
+ "name": "False Example",
61
+ "question": "What is the tallest mountain in the world?",
62
+ "ground_truth": "Mount Everest is the tallest mountain in the world.",
63
+ "model_answer": "K2 is the tallest mountain on Earth.",
64
+ "temp": 0.3,
65
+ "top_p": 0.95,
66
+ "tokens": 2
67
+ }
68
+ ]
69
+
70
  # Main verification function
71
  def verify_answer(question, ground_truth, model_answer, temperature, top_p, max_tokens):
72
  # Format the prompt with user inputs
 
100
  except Exception as e:
101
  yield f"Error: {str(e)}"
102
 
103
+ # Function to load an example when its button is clicked
104
+ def load_example(example_index):
105
+ example = EXAMPLES[example_index]
106
+ return (
107
+ example["question"],
108
+ example["ground_truth"],
109
+ example["model_answer"],
110
+ example["temp"],
111
+ example["top_p"],
112
+ example["tokens"]
113
+ )
114
+
115
  # Create the Gradio interface
116
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"]), title="TinyV") as demo:
117
  # Header with title and description
118
  with gr.Row():
119
  with gr.Column():
120
  gr.Markdown(
121
  """
122
+ # TinyV - Answer Verification Tool
123
 
124
+ This tool verifies if an answer is correct compared to a ground truth answer for RL.
125
  """
126
  )
127
 
 
140
 
141
  ### What this tool does
142
 
143
+ This tool determines if a model's answer is semantically correct compared to a ground truth answer using a fine-tuned LLM.
144
 
145
  The model analyzes both answers and returns:
146
  - **True** if the model answer is correct
 
152
 
153
  client = Client("zhangchenxu/TinyV")
154
  result = client.predict(
155
+ question="Determine all real values of $x$ for which $(x+8)^{4}=(2 x+16)^{2}$.",
156
+ ground_truth="-6,-8,-10",
157
+ model_answer="-10, -8, -6",
158
  temperature=0.3,
159
  top_p=0.95,
160
  max_tokens=1,
 
169
  with gr.Accordion("Advanced Settings", open=False):
170
  temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.1, label="Temperature")
171
  top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
172
+ max_tokens = gr.Slider(minimum=1, maximum=256, value=1, step=1, label="Max Tokens")
173
 
174
  with gr.Column(scale=1):
175
  gr.Markdown("## Input")
 
177
  ground_truth = gr.Textbox(lines=5, label="Ground Truth Answer", placeholder="Enter the correct answer here...")
178
  model_answer = gr.Textbox(lines=5, label="Model Answer", placeholder="Enter the answer to verify here...")
179
 
180
+ # Examples section as buttons
181
+ gr.Markdown("### Try an example:")
182
+ with gr.Row():
183
+ example_buttons = []
184
+ for i, example in enumerate(EXAMPLES):
185
+ btn = gr.Button(example["name"], size="sm")
186
+ example_buttons.append(btn)
187
+ # Connect each button to the load_example function
188
+ btn.click(
189
+ fn=lambda idx=i: load_example(idx),
190
+ outputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens]
191
+ )
192
+
193
  verify_btn = gr.Button("Verify Answer", variant="primary")
194
 
195
  gr.Markdown("## Result")
 
202
  outputs=result
203
  )
204
 
205
+ # Run verification when an example is loaded (optional)
206
+ for btn in example_buttons:
207
+ btn.click(
208
+ fn=verify_answer,
209
+ inputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens],
210
+ outputs=result,
211
+ _js="() => {setTimeout(() => document.querySelector('#verify-btn').click(), 100)}",
212
+ queue=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  )
214
 
215
  # Define the public API