zhangchenxu commited on
Commit
406b1bf
·
1 Parent(s): b5ca948
Files changed (2) hide show
  1. README.md +72 -1
  2. app.py +180 -52
README.md CHANGED
@@ -10,4 +10,75 @@ pinned: false
10
  license: mit
11
  ---
12
 
13
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  license: mit
11
  ---
12
 
13
+ # TinyV
14
+
15
+ This Hugging Face Space hosts an Answer Verification Tool (TinyV) powered by the `zhangchenxu/TinyV-1.5B` model. The tool is designed specifically for RL training to verify if a model's answer is semantically equivalent to a ground truth answer.
16
+
17
+ ## What This Tool Does
18
+
19
+ The Answer Verification Tool analyzes:
20
+ - A question
21
+ - A ground truth answer
22
+ - A model-generated answer
23
+
24
+ It then determines if the model's answer is correct, even if there are minor discrepancies in formatting or wording.
25
+
26
+ The verification is LLM-based rather than exact matching, which helps reduce false negatives in evaluation pipelines.
27
+
28
+ ## How to Use
29
+
30
+ ### Web Interface
31
+
32
+ 1. Enter the question in the first box
33
+ 2. Enter the ground truth answer
34
+ 3. Enter the model's answer to verify
35
+ 4. Adjust model parameters if needed (optional)
36
+ 5. Click "Verify Answer" to see the result
37
+
38
+ The tool will return:
39
+ - **True** if the model answer is correct
40
+ - **False** if the model answer is incorrect
41
+
42
+ ### API Usage
43
+
44
+ You can also use this tool via API:
45
+
46
+ ```python
47
+ from gradio_client import Client
48
+
49
+ client = Client("zhangchenxu/TinyV")
50
+ result = client.predict(
51
+ question="What is the capital of France?",
52
+ ground_truth="The capital of France is Paris.",
53
+ model_answer="Paris is the capital of France.",
54
+ temperature=0.3,
55
+ top_p=0.95,
56
+ max_tokens=128,
57
+ api_name="/verify"
58
+ )
59
+ print(result)
60
+ ```
61
+
62
+ ## Advanced Settings
63
+
64
+ - **Temperature**: Controls randomness. Lower values make output more deterministic (default: 0.3)
65
+ - **Top-p**: Controls diversity via nucleus sampling (default: 0.95)
66
+ - **Max Tokens**: Maximum tokens to generate in response (default: 128)
67
+
68
+ ## Model Information
69
+
70
+ This tool uses the `zhangchenxu/TinyV-1.5B` model, which has been optimized for answer verification tasks.
71
+
72
+ The model uses the following prompt template:
73
+
74
+ ```
75
+ You are an AI tasked with identifying false negatives in answer verification. A false negative occurs when a model's answer is essentially correct but is marked as incorrect due to minor discrepancies or formatting issues. Your job is to analyze the given question, ground truth answer, and model answer to determine if the model's answer is actually correct despite appearing different from the ground truth.
76
+
77
+ <question>{question}</question>
78
+
79
+ <ground_truth_answer>{ground_truth}</ground_truth_answer>
80
+
81
+ <model_answer>{model_answer}</model_answer>
82
+
83
+ Return "True" if the model's answer is correct, otherwise return "False".
84
+ ```
app.py CHANGED
@@ -1,64 +1,192 @@
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
 
3
 
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
-
9
-
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
 
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
 
26
- messages.append({"role": "user", "content": message})
27
 
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
 
 
42
 
 
43
  """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- demo = gr.ChatInterface(
47
- respond,
48
- additional_inputs=[
49
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
- gr.Slider(
53
- minimum=0.1,
54
- maximum=1.0,
55
- value=0.95,
56
- step=0.05,
57
- label="Top-p (nucleus sampling)",
58
- ),
59
- ],
60
- )
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
 
 
 
63
  if __name__ == "__main__":
64
- demo.launch()
 
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
3
+ import time
4
 
5
# Hosted verifier model: TinyV-1.5B, accessed through the HF Inference API.
MODEL_ID = "zhangchenxu/TinyV-1.5B"
client = InferenceClient(MODEL_ID)

# Verification prompt template.  Placeholders {question}, {ground_truth} and
# {model_answer} are filled in by verify_answer() before each API call.
LV_PROMPT = """
You are an AI tasked with identifying false negatives in answer verification. A false negative occurs when a model's answer is essentially correct but is marked as incorrect due to minor discrepancies or formatting issues. Your job is to analyze the given question, ground truth answer, and model answer to determine if the model's answer is actually correct despite appearing different from the ground truth.

<question>{question}</question>

<ground_truth_answer>{ground_truth}</ground_truth_answer>

<model_answer>{model_answer}</model_answer>

Return "True" if the model's answer is correct, otherwise return "False".
"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
# Main verification function
def verify_answer(question, ground_truth, model_answer, temperature, top_p, max_tokens):
    """Stream a True/False verdict on whether *model_answer* matches *ground_truth*.

    Fills LV_PROMPT with the three inputs, sends it to the hosted TinyV model
    via the Inference API, and yields the accumulated response text as tokens
    arrive (each yield becomes a streaming UI update in Gradio).

    Args:
        question: The question being answered.
        ground_truth: The reference (correct) answer.
        model_answer: The candidate answer to verify.
        temperature: Sampling temperature forwarded to the model.
        top_p: Nucleus-sampling threshold forwarded to the model.
        max_tokens: Maximum number of tokens the model may generate.

    Yields:
        str: The partial, then final, response text — expected to be "True"
        or "False" — or an "Error: ..." message if the API call fails.
    """
    # Format the prompt with user inputs
    prompt = LV_PROMPT.format(
        question=question,
        ground_truth=ground_truth,
        model_answer=model_answer,
    )

    # Message list in the chat-completion format required by the API.
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant that verifies answers."},
        {"role": "user", "content": prompt},
    ]

    response_text = ""

    try:
        # Stream the response for better UX.
        for message in client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            # Fix: some stream chunks arrive with an empty `choices` list
            # (e.g. keep-alive or final usage frames); indexing [0] blindly
            # raised IndexError mid-stream. Skip such chunks instead.
            if not message.choices:
                continue
            token = message.choices[0].delta.content
            if token:
                response_text += token
                yield response_text
    except Exception as e:
        # UI boundary: surface API/network failures to the user instead of
        # crashing the Space.
        yield f"Error: {str(e)}"
54
+
55
# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="Answer Verification Tool") as demo:
    # Header with title and description
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image("https://huggingface.co/front/assets/huggingface_logo.svg", scale=1, show_label=False, height=64)
        with gr.Column(scale=5):
            gr.Markdown(
                """
                # Answer Verification Tool

                This tool verifies if an answer is correct compared to a ground truth answer, even if there are minor differences in formatting or wording.
                """
            )

    # Main interface: left column = instructions + settings, right column = I/O.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                """
                ## How to Use

                1. Enter the question in the first box
                2. Enter the ground truth answer
                3. Enter the model's answer to verify
                4. Adjust model parameters if needed
                5. Click "Verify Answer" to see the result

                ### What this tool does

                This tool determines if a model's answer is semantically correct compared to a ground truth answer, even if there are minor discrepancies in formatting or wording.

                The model analyzes both answers and returns:
                - **True** if the model answer is correct
                - **False** if the model answer is incorrect

                ### API Usage Example
                ```python
                from gradio_client import Client

                client = Client("zhangchenxu/TinyV")
                result = client.predict(
                    question="What is the capital of France?",
                    ground_truth="The capital of France is Paris.",
                    model_answer="Paris is the capital of France.",
                    temperature=0.3,
                    top_p=0.95,
                    max_tokens=128,
                    api_name="/verify"
                )
                print(result)
                ```
                """
            )

            # Model parameters (hidden in a collapsible section)
            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.1, label="Temperature")
                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
                max_tokens = gr.Slider(minimum=32, maximum=512, value=128, step=32, label="Max Tokens")

        with gr.Column(scale=1):
            # Fix: the original used gr.Box(), which was deprecated in Gradio 3.x
            # and removed in 4.x; gr.Group() is the supported container in both.
            # NOTE(review): confirm against the Space's pinned gradio version.
            with gr.Group():
                gr.Markdown("## Input")
                question = gr.Textbox(lines=3, label="Question", placeholder="Enter the question here...")
                ground_truth = gr.Textbox(lines=5, label="Ground Truth Answer", placeholder="Enter the correct answer here...")
                model_answer = gr.Textbox(lines=5, label="Model Answer", placeholder="Enter the answer to verify here...")

                verify_btn = gr.Button("Verify Answer", variant="primary")

                gr.Markdown("## Result")
                result = gr.Textbox(label="Verification Result", placeholder="Result will appear here...", lines=5)

    # Connect the interface to the verification function.
    # Fix: the documented API example calls api_name="/verify", but without an
    # explicit api_name Gradio auto-names this endpoint after the function
    # ("/verify_answer"), breaking the documented gradio_client usage.
    verify_btn.click(
        verify_answer,
        inputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens],
        outputs=result,
        api_name="/verify",
    )

    # Clickable example inputs
    with gr.Accordion("Examples", open=True):
        gr.Examples(
            examples=[
                [
                    "What is the capital of France?",
                    "The capital of France is Paris.",
                    "Paris is the capital of France.",
                    0.3,
                    0.95,
                    128,
                ],
                [
                    "What is 2+2?",
                    "4",
                    "The answer is 4.",
                    0.3,
                    0.95,
                    128,
                ],
                [
                    "When was the Declaration of Independence signed?",
                    "July 4, 1776",
                    "The Declaration of Independence was signed on July 4th, 1776.",
                    0.3,
                    0.95,
                    128,
                ],
                [
                    "List the first three planets from the sun.",
                    "Mercury, Venus, Earth",
                    "The first three planets from the sun are Mercury, Venus, and Earth.",
                    0.3,
                    0.95,
                    128,
                ],
            ],
            inputs=[question, ground_truth, model_answer, temperature, top_p, max_tokens],
            outputs=result,
        )

    # Add footer with extra information
    with gr.Row():
        gr.Markdown(
            """
            ### About
            This tool uses the zhangchenxu/TinyV-1.5B model to verify answers.

            The verification is based on semantic similarity rather than exact matching,
            allowing for different phrasings and formats of the same correct answer.
            """
        )

# Enable request queuing so streamed responses work for concurrent users.
demo.queue()
# Launch the app
if __name__ == "__main__":
    demo.launch()