Create app.py
app.py
ADDED
@@ -0,0 +1,303 @@
import os
import re
import time
import csv
import tempfile
import requests
import pandas as pd
import gradio as gr

######################################
# Environment / Secrets
######################################

#OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
#if not OPENAI_API_KEY:
#    raise Exception("OPENAI_API_KEY not found in environment variables. Please add it as a secret in your Space.")

# Prefer environment variables (Space secrets); fall back to Colab's userdata
# helper only when the script is run in a notebook, where google.colab exists.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
HF_API_TOKEN = os.environ.get("HF_TOKEN")
try:
    from google.colab import userdata  # only importable inside Google Colab
    COHERE_API_KEY = COHERE_API_KEY or userdata.get('cohere_key')
    HF_API_TOKEN = HF_API_TOKEN or userdata.get('hf_token')
except ImportError:
    pass

if not COHERE_API_KEY:
    raise Exception("COHERE_API_KEY not found in environment variables. Please add it as a secret in your Space.")

hf_headers = {}
if HF_API_TOKEN:
    hf_headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}

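# Note (assumption about deployment): on a Space the two keys are expected as
# repository secrets named COHERE_API_KEY and HF_TOKEN; in Colab they are read
# from userdata entries named 'cohere_key' and 'hf_token'.
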
######################################
# Load System Instructions
######################################

with open("system_instructions.txt", "r", encoding="utf-8") as f:
    system_instructions = f.read()

######################################
# Helper Functions
######################################

def call_judge(prompt: str, max_tokens=200, temperature=0.7) -> str:
    """
    Calls the judge model through Cohere's Generate API
    and returns the model's text output.
    """
    url = "https://api.cohere.ai/v1/generate"
    headers = {
        "Authorization": f"Bearer {COHERE_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "command-r-plus",  # Adjust based on the desired Cohere model
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature
    }

    response = requests.post(url, json=payload, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Cohere API error: {response.text}")
    result = response.json()
    return result["generations"][0]["text"]

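# Illustrative sketch, not executed by the app: call_judge() only relies on the
# "generations"[0]["text"] field of Cohere's Generate response, so a successful
# reply is assumed to look roughly like:
#
#   {"generations": [{"text": " 4"}], ...}
#
# e.g. call_judge("Reply with the word OK.", max_tokens=5) should come back as a
# short text string that the caller can parse.
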
def call_hf(model: str, prompt: str, max_new_tokens=200, max_retries=10, delay=5) -> str:
    """
    Calls a Hugging Face Inference endpoint for text generation.
    Retries if the model is still loading.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model}"
    payload = {
        "inputs": prompt,
        "parameters": {
            "do_sample": False,
            "max_new_tokens": max_new_tokens
        }
    }

    for attempt in range(max_retries):
        resp = requests.post(api_url, json=payload, headers=hf_headers)
        data = resp.json()
        if isinstance(data, dict) and data.get("error"):
            if "loading" in data["error"].lower():
                print(f"Attempt {attempt+1}/{max_retries}: Model is loading. Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                raise Exception(f"Error from model {model}: {data['error']}")
        else:
            # Data should be a list like [{ "generated_text": "..." }]
            return data[0]["generated_text"]
    raise Exception(f"Model {model} is still loading after {max_retries} attempts.")

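# Illustrative sketch of the request/response shapes call_hf() assumes for the
# hosted Inference API (model names below are placeholders, not recommendations):
#
#   request:  {"inputs": "<prompt>", "parameters": {"do_sample": False, "max_new_tokens": 200}}
#   success:  [{"generated_text": "<prompt continued...>"}]
#   loading:  {"error": "Model some-org/some-model is currently loading", "estimated_time": 20.0}
#
# The loading case is what triggers the retry loop above.
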
def generate_answer(question: str, evaluated_model: str) -> str:
    """
    Generates an answer for the question with the evaluated model,
    or returns a placeholder answer when no usable model name is given.
    """
    name = (evaluated_model or "").strip()
    # The "Evaluated Model" field is still a WIP placeholder in the UI, so empty
    # or placeholder-looking values fall back to a stub answer instead of a model call.
    if not name or name.lower().startswith(("----", "please enter")):
        return f"Placeholder answer for: {question}"
    return call_hf(name, question)

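# Example of the fallback behavior (illustrative): with the UI's current placeholder
# value, generate_answer("What is RLHF?", "---- Feature not yet available ---------")
# returns "Placeholder answer for: What is RLHF?" rather than calling the Inference API.
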
def judge_answer(question: str, answer: str) -> int:
    """
    Sends question+answer to the judge with system instructions to produce a numeric score (0 to 5).
    """
    prompt = (
        f"{system_instructions}\n\n"
        f"Question: {question}\n"
        f"Answer: {answer}\n\n"
        "Please provide a score from 0 to 5, where 5 is perfect and 0 is entirely incorrect. "
        "Provide only the numeric score in your response."
    )
    output = call_judge(prompt, max_tokens=200, temperature=0.7)
    match = re.search(r"\b([0-5])\b", output)
    if match:
        return int(match.group(1))
    return 0

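# Worked example of the score extraction above (the regex r"\b([0-5])\b" takes the
# first standalone digit in range): a judge reply of "Score: 4" or "4/5, mostly
# correct" parses to 4, while a reply containing no digit in 0-5 falls back to 0.
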
######################################
# Main Evaluation
######################################

def evaluate_csv(csv_file, evaluated_model_name):
    """
    Reads a CSV with a 'question' column and, optionally, an 'answer' column.
    Missing answers are generated with the evaluated model; each Q&A is then
    scored by the judge model (0..5).
    Returns (avg_score_percent, csv_temp_path).
    """
    df = pd.read_csv(csv_file)
    if "question" not in df.columns:
        raise ValueError("CSV must contain a 'question' column.")

    has_answer_col = ("answer" in df.columns)
    results = []
    for _, row in df.iterrows():
        q = str(row["question"])
        if has_answer_col:
            a = str(row["answer"])
        else:
            a = generate_answer(q, evaluated_model_name)
        score = judge_answer(q, a)
        results.append({"question": q, "answer": a, "score": score})

    if len(results) == 0:
        return 0.0, None

    total_score = sum(item["score"] for item in results)
    max_possible = len(results) * 5
    avg_score_percent = (total_score / max_possible) * 100

    # Build output CSV (comma-separated); the BOM-friendly encoding is applied
    # when the temp file is written below.
    out_df = pd.DataFrame(results)
    csv_str = out_df.to_csv(
        index=False,
        sep=',',  # Comma separated
        quotechar='"',
        quoting=csv.QUOTE_ALL
    )
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv", encoding="utf-8-sig") as tmp_file:
        tmp_file.write(csv_str)
        tmp_file_path = tmp_file.name

    return avg_score_percent, tmp_file_path

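# Illustrative input CSV for evaluate_csv() (the header names are what the code
# reads; the rows themselves are made up):
#
#   "question","answer"
#   "What does HTTP stand for?","HyperText Transfer Protocol"
#   "Name a greenhouse gas.","Carbon dioxide"
#
# Scoring example: 2 rows scored 4 and 5 give (4 + 5) / (2 * 5) * 100 = 90.0%.
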
def run_evaluation(csv_file, evaluated_model_name):
    """
    Gradio callback:
    1) Evaluates Q&A from the CSV.
    2) Returns a big box with % and a downloadable CSV.
    """
    avg_percentage, csv_path = evaluate_csv(csv_file, evaluated_model_name)
    # Build the same style box as the single Q&A will use
    score_box = f"""
    <div style="width:200px; height:200px; border:2px solid #333;
                display:flex; align-items:center; justify-content:center; font-size:30px;">
        {avg_percentage:.2f}%
    </div>
    """
    return score_box, csv_path

######################################
# Gradio Interface
######################################

with gr.Blocks() as demo:
    ####################################
    # Top row: Logo (left), Title + instructions (right)
    ####################################
    with gr.Row():
        with gr.Column(scale=1, min_width=220):
            gr.Image("logo.png", show_label=False, interactive=False, width=220, height=220)
        with gr.Column(scale=5):
            gr.Markdown("## H4rmony Eval")
            gr.Markdown(
                "- The evaluation can be requested by CSV or by a single prompt/completion.\n"
                "- The CSV, if present, should have **both a 'question' and an 'answer'** column.\n\n"
                "The judge model scores each Q&A on a **0–5** scale, and you'll see the final percentage score."
            )

    ####################################
    # Middle row:
    #   1) Upload CSV
    #   2) Download Results
    #   3) Score (big box)
    ####################################
    with gr.Row(equal_height=True):
        # Square #1: Upload CSV
        with gr.Column(scale=1):
            gr.Markdown("#### Upload CSV")
            csv_in = gr.File(label="CSV File", type="filepath")

        # Square #2: Download Results
        with gr.Column(scale=1):
            gr.Markdown("#### Download Results")
            csv_out = gr.File(label="Scored CSV", interactive=False)

        # Square #3: Score
        with gr.Column(scale=1):
            gr.Markdown("#### Score")
            score_html = gr.HTML(
                value="""
                <div style="width:200px; height:200px; border:2px solid #333;
                            display:flex; align-items:center; justify-content:center; font-size:30px;">
                    --
                </div>
                """,
                label="Final Score"
            )

    ####################################
    # Single Q&A
    ####################################
    gr.Markdown(
        """
        ---
        ### Single Q&A Evaluation
        Enter one question and one answer below, then click **Evaluate Single Q&A** to get a 0–5 score
        in the same box on the right.
        """
    )

    with gr.Row():
        single_q = gr.Textbox(
            lines=3,
            label="Single Question / Prompt"
        )
        single_a = gr.Textbox(
            lines=3,
            label="Single Answer"
        )

    def on_single_evaluate(q, a):
        score = judge_answer(q, a)
        # Show the numeric score in the same style as the CSV
        box = f"""
        <div style="width:200px; height:200px; border:2px solid #333;
                    display:flex; align-items:center; justify-content:center; font-size:30px;">
            {score}
        </div>
        """
        return box

    ####################################
    # Bottom row: Model + 2 Buttons (CSV & Single)
    ####################################
    with gr.Row():
        with gr.Column():
            model_in = gr.Textbox(
                label="Evaluated Model (WIP)",
                value="---- Feature not yet available ---------"
            )

    # Two buttons side by side:
    with gr.Row():
        submit_btn = gr.Button("Submit CSV")
        single_btn = gr.Button("Evaluate Single Q&A")

    ####################################
    # Define both callbacks
    ####################################
    def on_submit(csv_path, model_name):
        box, out_path = run_evaluation(csv_path, model_name)
        return box, out_path

    # Linking the two callbacks:
    # 1) CSV evaluation
    submit_btn.click(
        fn=on_submit,
        inputs=[csv_in, model_in],
        outputs=[score_html, csv_out]
    )
    # 2) Single Q&A evaluation
    single_btn.click(
        fn=on_single_evaluate,
        inputs=[single_q, single_a],
        outputs=score_html
    )

demo.launch()
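
# To try the app outside of the Space (illustrative; the package list is an assumption):
#   pip install gradio pandas requests
#   COHERE_API_KEY=... HF_TOKEN=... python app.py
# Gradio then prints a local URL (typically http://127.0.0.1:7860) to open in a browser.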