Commit fbc8f04 · 1 parent 8459fca · update

app.py CHANGED
````diff
@@ -1,9 +1,11 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
+import time
 
-# Initialize the client
+# Initialize the client with your model
 client = InferenceClient("zhangchenxu/TinyV-1.5B")
 
+# The prompt template for the LLM verifier
 LV_PROMPT = """
 You are an AI tasked with identifying false negatives in answer verification. A false negative occurs when a model's answer is essentially correct but is marked as incorrect due to minor discrepancies or formatting issues. Your job is to analyze the given question, ground truth answer, and model answer to determine if the model's answer is actually correct despite appearing different from the ground truth.
 
@@ -25,7 +27,7 @@ EXAMPLES = [
         "model_answer": "-10, -8, -6",
         "temp": 0.3,
         "top_p": 0.95,
-        "tokens":
+        "tokens": 1
     },
     {
         "name": "Latex Expression",
@@ -34,7 +36,7 @@ EXAMPLES = [
         "model_answer": "4/7",
         "temp": 0.3,
         "top_p": 0.95,
-        "tokens":
+        "tokens": 1
     },
     {
         "name": "Variable Labeling",
@@ -43,7 +45,7 @@ EXAMPLES = [
         "model_answer": "b=-3, c=0",
         "temp": 0.3,
         "top_p": 0.95,
-        "tokens":
+        "tokens": 1
     },
     {
         "name": "Paraphrase",
@@ -52,7 +54,7 @@ EXAMPLES = [
         "model_answer": "Yes, Peter can guarantee finding 5 genuine coins while ensuring that none of these genuine coins are paid to Vasya.",
         "temp": 0.3,
         "top_p": 0.95,
-        "tokens":
+        "tokens": 1
     },
     {
         "name": "False Example",
@@ -61,19 +63,27 @@ EXAMPLES = [
         "model_answer": "K2 is the tallest mountain on Earth.",
         "temp": 0.3,
         "top_p": 0.95,
-        "tokens":
+        "tokens": 1
     }
 ]
 
+# Verification function
 def verify_answer(question, ground_truth, model_answer, temperature, top_p, max_tokens):
+    # Format the prompt with user inputs
     prompt = LV_PROMPT.format(
         question=question,
         ground_truth=ground_truth,
         model_answer=model_answer
     )
+
+    # Prepare messages for the API
     messages = [{"role": "user", "content": prompt}]
+
+    # Initialize response
     response_text = ""
+
     try:
+        # Stream the response for better UX
         for message in client.chat_completion(
             messages,
             max_tokens=max_tokens,
@@ -88,6 +98,7 @@ def verify_answer(question, ground_truth, model_answer, temperature, top_p, max_
     except Exception as e:
         yield f"Error: {str(e)}"
 
+# Function to load an example
 def load_example(example_index):
     example = EXAMPLES[example_index]
     return (
@@ -99,44 +110,128 @@ def load_example(example_index):
         example["tokens"]
     )
 
-# (15 deleted lines of the previous interface code; their content is not captured in this view)
+# Create the Gradio interface
+with gr.Blocks(
+    theme=gr.themes.Soft(
+        primary_hue="blue",
+        font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"]
+    ),
+    css="""
+    .container { max-width: 1000px; margin: auto; }
+    .example-btn { min-width: 140px; }
+    .title { text-align: center; margin-bottom: 1rem; }
+    .result-box { min-height: 100px; }
+    """
+) as demo:
+    # Header
+    with gr.Group(elem_classes="container"):
+        gr.Markdown(
+            """
+            # 🧠 TinyV - Answer Verification Tool
+
+            This tool verifies if a model-generated answer is semantically correct compared to a ground truth answer using a fine-tuned LLM.
+            """,
+            elem_classes="title"
+        )
+
+    # Main input area
+    with gr.Row(equal_height=True):
+        # Left column - Inputs
+        with gr.Column():
+            question = gr.Textbox(
+                lines=3,
+                label="📘 Question",
+                placeholder="Enter the question here..."
+            )
+            ground_truth = gr.Textbox(
+                lines=3,
+                label="✅ Ground Truth Answer",
+                placeholder="Enter the correct answer here..."
+            )
+            model_answer = gr.Textbox(
+                lines=3,
+                label="🤖 Model Answer",
+                placeholder="Enter the answer to verify here..."
+            )
+
+        # Right column - Result and verification button
+        with gr.Column():
+            verify_btn = gr.Button("✅ Verify Answer", variant="primary", size="lg")
+            result = gr.Textbox(
+                label="🧾 Verification Result",
+                placeholder="Result will appear here...",
+                lines=9,
+                elem_classes="result-box"
+            )
+
+    # Examples section
+    gr.Markdown("### 🔍 Try Examples:")
+    with gr.Row() as example_row:
+        for i, ex in enumerate(EXAMPLES):
+            btn = gr.Button(ex["name"], size="sm", elem_classes="example-btn")
+            btn.click(
+                fn=lambda idx=i: load_example(idx),
+                outputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens]
+            )
+            # Also run verification when example is loaded
+            btn.click(
+                fn=verify_answer,
+                inputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens],
+                outputs=result,
+                queue=False
+            )
+
+    # Advanced Settings (hidden at the bottom)
+    with gr.Accordion("⚙️ Advanced Settings", open=False):
         with gr.Row():
-# (16 deleted lines of the previous interface code; their content is not captured in this view)
+            temperature = gr.Slider(0, 1, value=0.3, step=0.1, label="Temperature")
+            top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
+            max_tokens = gr.Slider(1, 128, value=1, step=1, label="Max Tokens")
+
+    # About section
+    with gr.Accordion("ℹ️ About This Tool", open=False):
+        gr.Markdown(
+            """
+            ### What This Tool Does
+
+            This verification tool uses the TinyV-1.5B model to determine if answers are semantically equivalent,
+            even when they have different:
+
+            - **Formatting** (LaTeX vs. plain text, spacing, etc.)
+            - **Ordering** (e.g., listing items in different orders)
+            - **Phrasing** (paraphrases with the same meaning)
+            - **Variable labeling** (with or without variable names)
+
+            ### API Usage
+
+            ```python
+            from gradio_client import Client
+
+            client = Client("zhangchenxu/TinyV")
+            result = client.predict(
+                question="What is the capital of France?",
+                ground_truth="The capital of France is Paris.",
+                model_answer="Paris is the capital of France.",
+                temperature=0.3,
+                top_p=0.95,
+                max_tokens=2,
+                api_name="/verify_answer"
+            )
+            print(result)
+            ```
+            """
+        )
+
+    # Connect the interface to the verification function
     verify_btn.click(
         fn=verify_answer,
         inputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens],
         outputs=result
     )
 
+# Define the public API
 demo.queue()
+
+# Launch the app
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
````
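The hunks above elide the body of the streaming loop inside `verify_answer` (old lines 80-87, new lines 90-97), so only the loop header and the `except` clause are visible. Below is a minimal sketch of how such a loop is typically written against `huggingface_hub`'s streaming `chat_completion` API; it is a guess based on the visible `response_text = ""` initialization and the generator-style `yield` in the error path, not the committed implementation.

```python
# Hypothetical completion of the elided loop body (new lines 90-97);
# the actual committed code may differ.
for message in client.chat_completion(
    messages,
    max_tokens=max_tokens,
    stream=True,
    temperature=temperature,
    top_p=top_p,
):
    # Each streamed chunk carries an incremental piece of the reply.
    token = message.choices[0].delta.content
    if token is not None:
        response_text += token
        # Yield the running text so the Gradio result box updates live.
        yield response_text
```

A few details in the diff are worth flagging. Every example now sets `"tokens": 1` and the Max Tokens slider defaults to 1, which suggests the verifier is expected to answer with a single-token verdict. The example buttons list `temperature`, `top_p`, and `max_tokens` as outputs before those sliders are created in the Advanced Settings accordion, so as written that forward reference would raise a `NameError` when the interface is built, unless the sliders are defined somewhere not shown in this diff. The added `import time` is also unused in the visible code.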