Commit 406b1bf · update
Parent(s): b5ca948
README.md CHANGED
@@ -10,4 +10,75 @@ pinned: false
 license: mit
 ---
 
-
+# TinyV
+
+This Hugging Face Space hosts an Answer Verification Tool (TinyV) powered by the `zhangchenxu/TinyV-1.5B` model. The tool is designed for RL training, where it verifies whether a model's answer is semantically equivalent to a ground truth answer.
+
+## What This Tool Does
+
+The Answer Verification Tool analyzes:
+- A question
+- A ground truth answer
+- A model-generated answer
+
+It then determines whether the model's answer is correct, even when there are minor discrepancies in formatting or wording.
+
+The verification is LLM-based rather than exact matching, which helps reduce false negatives in evaluation pipelines.
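+
+For example, here is a hypothetical comparison (plain Python, independent of this Space) of exact matching against semantic verification:
+
+```python
+ground_truth = "The capital of France is Paris."
+model_answer = "Paris is the capital of France."
+
+# Exact matching flags a correct answer as wrong -- a false negative.
+print(model_answer.strip() == ground_truth.strip())  # False
+
+# An LLM-based verifier such as TinyV judges semantic equivalence instead
+# and would return "True" (see the API example below).
+```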
+
+## How to Use
+
+### Web Interface
+
+1. Enter the question in the first box
+2. Enter the ground truth answer
+3. Enter the model's answer to verify
+4. Adjust model parameters if needed (optional)
+5. Click "Verify Answer" to see the result
+
+The tool returns:
+- **True** if the model answer is correct
+- **False** if the model answer is incorrect
+
+### API Usage
+
+You can also use this tool via the API:
+
+```python
+from gradio_client import Client
+
+client = Client("zhangchenxu/TinyV")
+result = client.predict(
+    question="What is the capital of France?",
+    ground_truth="The capital of France is Paris.",
+    model_answer="Paris is the capital of France.",
+    temperature=0.3,
+    top_p=0.95,
+    max_tokens=128,
+    api_name="/verify"
+)
+print(result)
+```
+
+## Advanced Settings
+
+- **Temperature**: Controls randomness; lower values make the output more deterministic (default: 0.3)
+- **Top-p**: Controls diversity via nucleus sampling (default: 0.95)
+- **Max Tokens**: Maximum number of tokens to generate in the response (default: 128)
+
+## Model Information
+
+This tool uses the `zhangchenxu/TinyV-1.5B` model, which has been optimized for answer verification tasks.
+
+The model uses the following prompt template:
+
+```
+You are an AI tasked with identifying false negatives in answer verification. A false negative occurs when a model's answer is essentially correct but is marked as incorrect due to minor discrepancies or formatting issues. Your job is to analyze the given question, ground truth answer, and model answer to determine if the model's answer is actually correct despite appearing different from the ground truth.
+
+<question>{question}</question>
+
+<ground_truth_answer>{ground_truth}</ground_truth_answer>
+
+<model_answer>{model_answer}</model_answer>
+
+Return "True" if the model's answer is correct, otherwise return "False".
+```
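+
+If you would rather query the model directly than go through this Space, the sketch below mirrors what app.py (further down) does. It assumes `zhangchenxu/TinyV-1.5B` is reachable through the Hugging Face Inference API; it is an illustration, not part of this Space:
+
+```python
+from huggingface_hub import InferenceClient
+
+# The prompt template shown above, with its three placeholders.
+TEMPLATE = """You are an AI tasked with identifying false negatives in answer verification. A false negative occurs when a model's answer is essentially correct but is marked as incorrect due to minor discrepancies or formatting issues. Your job is to analyze the given question, ground truth answer, and model answer to determine if the model's answer is actually correct despite appearing different from the ground truth.
+
+<question>{question}</question>
+
+<ground_truth_answer>{ground_truth}</ground_truth_answer>
+
+<model_answer>{model_answer}</model_answer>
+
+Return "True" if the model's answer is correct, otherwise return "False"."""
+
+client = InferenceClient("zhangchenxu/TinyV-1.5B")
+
+prompt = TEMPLATE.format(
+    question="What is 2+2?",
+    ground_truth="4",
+    model_answer="The answer is 4.",
+)
+response = client.chat_completion(
+    [{"role": "user", "content": prompt}],
+    max_tokens=128,
+    temperature=0.3,
+    top_p=0.95,
+)
+print(response.choices[0].message.content)  # expected: "True"
+```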

app.py CHANGED
@@ -1,64 +1,192 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-
+
+# Initialize the client with the TinyV verifier model
+client = InferenceClient("zhangchenxu/TinyV-1.5B")
+
+# The prompt template for the LLM verifier
+LV_PROMPT = """
+You are an AI tasked with identifying false negatives in answer verification. A false negative occurs when a model's answer is essentially correct but is marked as incorrect due to minor discrepancies or formatting issues. Your job is to analyze the given question, ground truth answer, and model answer to determine if the model's answer is actually correct despite appearing different from the ground truth.
+
+<question>{question}</question>
+
+<ground_truth_answer>{ground_truth}</ground_truth_answer>
+
+<model_answer>{model_answer}</model_answer>
+
+Return "True" if the model's answer is correct, otherwise return "False".
+"""
+
+# Main verification function
+def verify_answer(question, ground_truth, model_answer, temperature, top_p, max_tokens):
+    # Format the prompt with the user inputs
+    prompt = LV_PROMPT.format(
+        question=question,
+        ground_truth=ground_truth,
+        model_answer=model_answer,
+    )
+
+    # Prepare the message format required by the chat-completion API
+    messages = [
+        {"role": "system", "content": "You are a helpful AI assistant that verifies answers."},
+        {"role": "user", "content": prompt},
+    ]
+
+    # Accumulate the streamed response
+    response_text = ""
+
+    try:
+        # Stream the response for better UX
+        for message in client.chat_completion(
+            messages,
+            max_tokens=max_tokens,
+            stream=True,
+            temperature=temperature,
+            top_p=top_p,
+        ):
+            token = message.choices[0].delta.content
+            if token:
+                response_text += token
+                yield response_text
+    except Exception as e:
+        yield f"Error: {str(e)}"
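+
+# Hypothetical local-testing note: verify_answer is a generator (it streams),
+# so consume it fully to obtain the final verdict text, e.g.:
+#
+#     final = ""
+#     for partial in verify_answer("What is 2+2?", "4", "The answer is 4.", 0.3, 0.95, 128):
+#         final = partial
+#     print(final)  # "True" or "False"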
+
+# Create the Gradio interface
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="Answer Verification Tool") as demo:
+    # Header with logo, title, and description
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Image("https://huggingface.co/front/assets/huggingface_logo.svg", scale=1, show_label=False, height=64)
+        with gr.Column(scale=5):
+            gr.Markdown(
+                """
+                # Answer Verification Tool
+
+                This tool verifies whether an answer is correct compared to a ground truth answer, even if there are minor differences in formatting or wording.
+                """
+            )
+
+    # Main interface
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown(
+                """
+                ## How to Use
+
+                1. Enter the question in the first box
+                2. Enter the ground truth answer
+                3. Enter the model's answer to verify
+                4. Adjust model parameters if needed
+                5. Click "Verify Answer" to see the result
+
+                ### What this tool does
+
+                This tool determines whether a model's answer is semantically correct compared to a ground truth answer, even if there are minor discrepancies in formatting or wording.
+
+                The model analyzes both answers and returns:
+                - **True** if the model answer is correct
+                - **False** if the model answer is incorrect
+
+                ### API Usage Example
+                ```python
+                from gradio_client import Client
+
+                client = Client("zhangchenxu/TinyV")
+                result = client.predict(
+                    question="What is the capital of France?",
+                    ground_truth="The capital of France is Paris.",
+                    model_answer="Paris is the capital of France.",
+                    temperature=0.3,
+                    top_p=0.95,
+                    max_tokens=128,
+                    api_name="/verify"
+                )
+                print(result)
+                ```
+                """
+            )
+
+            # Model parameters (hidden in a collapsible section)
+            with gr.Accordion("Advanced Settings", open=False):
+                temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.1, label="Temperature")
+                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
+                max_tokens = gr.Slider(minimum=32, maximum=512, value=128, step=32, label="Max Tokens")
+
+        with gr.Column(scale=1):
+            # Grouped input/output panel (gr.Group; gr.Box was removed in newer Gradio)
+            with gr.Group():
+                gr.Markdown("## Input")
+                question = gr.Textbox(lines=3, label="Question", placeholder="Enter the question here...")
+                ground_truth = gr.Textbox(lines=5, label="Ground Truth Answer", placeholder="Enter the correct answer here...")
+                model_answer = gr.Textbox(lines=5, label="Model Answer", placeholder="Enter the answer to verify here...")
+
+                verify_btn = gr.Button("Verify Answer", variant="primary")
+
+                gr.Markdown("## Result")
+                result = gr.Textbox(label="Verification Result", placeholder="Result will appear here...", lines=5)
+
+    # Connect the interface to the verification function; api_name exposes
+    # this event as the /verify endpoint used in the README example
+    verify_btn.click(
+        verify_answer,
+        inputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens],
+        outputs=result,
+        api_name="verify",
+    )
+
+    # Examples
+    with gr.Accordion("Examples", open=True):
+        gr.Examples(
+            examples=[
+                [
+                    "What is the capital of France?",
+                    "The capital of France is Paris.",
+                    "Paris is the capital of France.",
+                    0.3,
+                    0.95,
+                    128,
+                ],
+                [
+                    "What is 2+2?",
+                    "4",
+                    "The answer is 4.",
+                    0.3,
+                    0.95,
+                    128,
+                ],
+                [
+                    "When was the Declaration of Independence signed?",
+                    "July 4, 1776",
+                    "The Declaration of Independence was signed on July 4th, 1776.",
+                    0.3,
+                    0.95,
+                    128,
+                ],
+                [
+                    "List the first three planets from the sun.",
+                    "Mercury, Venus, Earth",
+                    "The first three planets from the sun are Mercury, Venus, and Earth.",
+                    0.3,
+                    0.95,
+                    128,
+                ],
+            ],
+            inputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens],
+            outputs=result,
+        )
+
+    # Footer with extra information
+    with gr.Row():
+        gr.Markdown(
+            """
+            ### About
+            This tool uses the zhangchenxu/TinyV-1.5B model to verify answers.
+
+            The verification is based on semantic similarity rather than exact matching,
+            allowing for different phrasings and formats of the same correct answer.
+            """
+        )
+
+# Enable queuing so streamed responses work for concurrent users
+demo.queue()
+
+# Launch the app
 if __name__ == "__main__":
     demo.launch()
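
Since TinyV is aimed at RL training, it may also help to see the verifier consumed from a training loop. The following is a minimal consumer-side sketch; the `tinyv_reward` helper is hypothetical (not part of this Space) and assumes the `/verify` endpoint wired up above:

```python
from gradio_client import Client

client = Client("zhangchenxu/TinyV")

def tinyv_reward(question: str, ground_truth: str, answer: str) -> float:
    """Map TinyV's "True"/"False" text verdict onto a binary reward."""
    verdict = client.predict(
        question=question,
        ground_truth=ground_truth,
        model_answer=answer,
        temperature=0.3,
        top_p=0.95,
        max_tokens=128,
        api_name="/verify",
    )
    # predict() returns the final streamed string, e.g. "True" or "False".
    return 1.0 if str(verdict).strip().lower().startswith("true") else 0.0

# A formatting mismatch that exact matching would score as 0:
print(tinyv_reward("What is 2+2?", "4", "The answer is 4."))
```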