neovalle committed on
Commit 88c675c · verified · 1 Parent(s): a1bfe90

Create app.py

Files changed (1)
  1. app.py +303 -0
app.py ADDED
@@ -0,0 +1,303 @@
+ import os
+ import re
+ import time
+ import csv
+ import tempfile
+ import requests
+ import pandas as pd
+ import gradio as gr
+
+ ######################################
+ # Environment / Secrets
+ ######################################
+
+ #OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+ #if not OPENAI_API_KEY:
+ #    raise Exception("OPENAI_API_KEY not found in environment variables. Please add it as a secret in your Space.")
+
+ # Secrets are read from Colab userdata here; the commented-out lines show the
+ # environment-variable approach used when running as a Space.
+ from google.colab import userdata
+
+ #COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
+ COHERE_API_KEY = userdata.get('cohere_key')
+
+ if not COHERE_API_KEY:
+     raise Exception("COHERE_API_KEY not found in environment variables. Please add it as a secret in your Space.")
+
+ #HF_API_TOKEN = os.environ.get("HF_TOKEN")
+ HF_API_TOKEN = userdata.get('hf_token')
+ hf_headers = {}
+ if HF_API_TOKEN:
+     hf_headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
+
+ ######################################
+ # Load System Instructions
+ ######################################
+
+ with open("system_instructions.txt", "r", encoding="utf-8") as f:
+     system_instructions = f.read()
+
+ ######################################
+ # Helper Functions
+ ######################################
+
+ def call_judge(prompt: str, max_tokens=200, temperature=0.7) -> str:
+     """
+     Calls the judge model via the Cohere Generate API
+     and returns the model's text output.
+     """
+     url = "https://api.cohere.ai/v1/generate"
+     headers = {
+         "Authorization": f"Bearer {COHERE_API_KEY}",
+         "Content-Type": "application/json"
+     }
+     payload = {
+         "model": "command-r-plus",  # Adjust based on the desired Cohere model
+         "prompt": prompt,
+         "max_tokens": max_tokens,
+         "temperature": temperature
+     }
+
+     response = requests.post(url, json=payload, headers=headers)
+     if response.status_code != 200:
+         raise Exception(f"Cohere API error: {response.text}")
+     result = response.json()
+     return result["generations"][0]["text"]
+
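+ # Illustrative usage of call_judge, kept as a comment so nothing extra runs at
+ # import time; the prompt text below is a made-up example:
+ #     rating_text = call_judge("Score this answer from 0 to 5: ...", max_tokens=10)
+ #     print(rating_text)
+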
+ def call_hf(model: str, prompt: str, max_new_tokens=200, max_retries=10, delay=5) -> str:
+     """
+     Calls a Hugging Face Inference endpoint for text generation.
+     Retries if the model is still loading.
+     """
+     api_url = f"https://api-inference.huggingface.co/models/{model}"
+     payload = {
+         "inputs": prompt,
+         "parameters": {
+             "do_sample": False,
+             "max_new_tokens": max_new_tokens
+         }
+     }
+
+     for attempt in range(max_retries):
+         resp = requests.post(api_url, json=payload, headers=hf_headers)
+         data = resp.json()
+         if isinstance(data, dict) and data.get("error"):
+             if "loading" in data["error"].lower():
+                 print(f"Attempt {attempt+1}/{max_retries}: Model is loading. Retrying in {delay} seconds...")
+                 time.sleep(delay)
+             else:
+                 raise Exception(f"Error from model {model}: {data['error']}")
+         else:
+             # Data should be a list like [{ "generated_text": "..." }]
+             return data[0]["generated_text"]
+     raise Exception(f"Model {model} is still loading after {max_retries} attempts.")
+
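+ # Sketch of the two response shapes handled above (values are illustrative):
+ #     error case:   {"error": "Model <name> is currently loading", ...}
+ #     success case: [{"generated_text": "..."}]
+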
+ def generate_answer(question: str, evaluated_model: str) -> str:
+     """
+     Generates an answer for the question using the specified evaluated model,
+     or returns a placeholder answer if no real model name has been provided.
+     """
+     if evaluated_model.strip().lower() == "please enter model to evaluate":
+         return f"Placeholder answer for: {question}"
+     else:
+         return call_hf(evaluated_model, question)
+
+ def judge_answer(question: str, answer: str) -> int:
+     """
+     Sends question+answer to the judge with system instructions to produce a numeric score (0 to 5).
+     """
+     prompt = (
+         f"{system_instructions}\n\n"
+         f"Question: {question}\n"
+         f"Answer: {answer}\n\n"
+         "Please provide a score from 0 to 5, where 5 is perfect and 0 is entirely incorrect. "
+         "Provide only the numeric score in your response."
+     )
+     output = call_judge(prompt, max_tokens=200, temperature=0.7)
+     # Extract the first standalone digit 0-5 from the judge's reply.
+     match = re.search(r"\b([0-5])\b", output)
+     if match:
+         return int(match.group(1))
+     return 0
+
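+ # Worked example of the parsing above: a judge reply such as
+ # "I would rate this answer 4 out of 5." yields 4, while a reply containing no
+ # standalone digit in the 0-5 range falls back to a score of 0.
+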
+ ######################################
+ # Main Evaluation
+ ######################################
+
+ def evaluate_csv(csv_file, evaluated_model_name):
+     """
+     Reads a CSV with a 'question' column and an optional 'answer' column;
+     if 'answer' is missing, answers are generated with the evaluated model.
+     Scores each Q&A with the judge model (0..5).
+     Returns (avg_score_percent, csv_temp_path).
+     """
+     df = pd.read_csv(csv_file)
+     if "question" not in df.columns:
+         raise ValueError("CSV must contain a 'question' column.")
+
+     has_answer_col = ("answer" in df.columns)
+     results = []
+     for _, row in df.iterrows():
+         q = str(row["question"])
+         if has_answer_col:
+             a = str(row["answer"])
+         else:
+             a = generate_answer(q, evaluated_model_name)
+         score = judge_answer(q, a)
+         results.append({"question": q, "answer": a, "score": score})
+
+     if len(results) == 0:
+         return 0.0, None
+
+     total_score = sum(item["score"] for item in results)
+     max_possible = len(results) * 5
+     avg_score_percent = (total_score / max_possible) * 100
+
+     # Build output CSV (comma-separated)
+     out_df = pd.DataFrame(results)
+     csv_str = out_df.to_csv(
+         index=False,
+         sep=',',  # Comma separated
+         quotechar='"',
+         quoting=csv.QUOTE_ALL,
+         encoding='utf-8-sig'
+     )
+     with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv", encoding="utf-8-sig") as tmp_file:
+         tmp_file.write(csv_str)
+         tmp_file_path = tmp_file.name
+
+     return avg_score_percent, tmp_file_path
+
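+ # Illustrative input layout and scoring (file contents and scores are made up):
+ #     question,answer
+ #     "What is the capital of France?","Paris"
+ # With three rows scored 5, 3 and 4 by the judge, the result is
+ # (5 + 3 + 4) / (3 * 5) * 100 = 80.0%.
+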
+ def run_evaluation(csv_file, evaluated_model_name):
+     """
+     Gradio callback:
+     1) Evaluates Q&A from the CSV.
+     2) Returns a big box with the % score and a downloadable CSV.
+     """
+     avg_percentage, csv_path = evaluate_csv(csv_file, evaluated_model_name)
+     # Build the same style box as the single Q&A will use
+     score_box = f"""
+     <div style="width:200px; height:200px; border:2px solid #333;
+     display:flex; align-items:center; justify-content:center; font-size:30px;">
+     {avg_percentage:.2f}%
+     </div>
+     """
+     return score_box, csv_path
+
+ ######################################
+ # Gradio Interface
+ ######################################
+
+ with gr.Blocks() as demo:
+     ####################################
+     # Top row: Logo (left), Title + instructions (right)
+     ####################################
+     with gr.Row():
+         with gr.Column(scale=1, min_width=220):
+             gr.Image("logo.png", show_label=False, interactive=False, width=220, height=220)
+         with gr.Column(scale=5):
+             gr.Markdown("## H4rmony Eval")
+             gr.Markdown(
+                 "- The evaluation can be requested by CSV or by a single prompt/completion.\n"
+                 "- The CSV, if present, should have **both a 'question' and an 'answer'** column.\n\n"
+                 "The judge model scores each Q&A on a **0–5** scale, and you'll see the final percentage score."
+             )
+
+     ####################################
+     # Middle row:
+     #   1) Upload CSV
+     #   2) Download Results
+     #   3) Score (big box)
+     ####################################
+     with gr.Row(equal_height=True):
+         # Square #1: Upload CSV
+         with gr.Column(scale=1):
+             gr.Markdown("#### Upload CSV")
+             csv_in = gr.File(label="CSV File", type="filepath")
+
+         # Square #2: Download Results
+         with gr.Column(scale=1):
+             gr.Markdown("#### Download Results")
+             csv_out = gr.File(label="Scored CSV", interactive=False)
+
+         # Square #3: Score
+         with gr.Column(scale=1):
+             gr.Markdown("#### Score")
+             score_html = gr.HTML(
+                 value="""
+                 <div style="width:200px; height:200px; border:2px solid #333;
+                 display:flex; align-items:center; justify-content:center; font-size:30px;">
+                 --
+                 </div>
+                 """,
+                 label="Final Score"
+             )
+
+     ####################################
+     # Single Q&A
+     ####################################
+     gr.Markdown(
+         """
+         ---
+         ### Single Q&A Evaluation
+         Enter one question and one answer below, then click **Evaluate Single Q&A** to get a 0–5 score
+         in the same box on the right.
+         """
+     )
+
+     with gr.Row():
+         single_q = gr.Textbox(
+             lines=3,
+             label="Single Question / Prompt"
+         )
+         single_a = gr.Textbox(
+             lines=3,
+             label="Single Answer"
+         )
+
+     def on_single_evaluate(q, a):
+         score = judge_answer(q, a)
+         # Show the numeric score in the same style as the CSV
+         box = f"""
+         <div style="width:200px; height:200px; border:2px solid #333;
+         display:flex; align-items:center; justify-content:center; font-size:30px;">
+         {score}
+         </div>
+         """
+         return box
+
+     ####################################
+     # Bottom row: Model + 2 Buttons (CSV & Single)
+     ####################################
+     with gr.Row():
+         with gr.Column():
+             model_in = gr.Textbox(
+                 label="Evaluated Model (WIP)",
+                 value="---- Feature not yet available ---------"
+             )
+
+     # Two buttons side by side:
+     with gr.Row():
+         submit_btn = gr.Button("Submit CSV")
+         single_btn = gr.Button("Evaluate Single Q&A")
+
+     ####################################
+     # Define both callbacks
+     ####################################
+     def on_submit(csv_path, model_name):
+         box, out_path = run_evaluation(csv_path, model_name)
+         return box, out_path
+
+     # Linking the two callbacks:
+     # 1) CSV evaluation
+     submit_btn.click(
+         fn=on_submit,
+         inputs=[csv_in, model_in],
+         outputs=[score_html, csv_out]
+     )
+     # 2) Single Q&A evaluation
+     single_btn.click(
+         fn=on_single_evaluate,
+         inputs=[single_q, single_a],
+         outputs=score_html
+     )
+
+ demo.launch()