import os
import re
import time
import csv
import tempfile
import requests
import pandas as pd
import gradio as gr

######################################
# Environment / Secrets
######################################
#OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
#if not OPENAI_API_KEY:
#    raise Exception("OPENAI_API_KEY not found in environment variables. Please add it as a secret in your Space.")

COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
if not COHERE_API_KEY:
    raise Exception("COHERE_API_KEY not found in environment variables. Please add it as a secret in your Space.")

HF_API_TOKEN = os.environ.get("HF_TOKEN")
hf_headers = {}
if HF_API_TOKEN:
    hf_headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}

######################################
# Load System Instructions
######################################
with open("system_instructions.txt", "r", encoding="utf-8") as f:
    system_instructions = f.read()

######################################
# Helper Functions
######################################
def call_judge(prompt: str, max_tokens=200, temperature=0.7) -> str:
    """
    Calls the judge model via the Cohere Generate API
    and returns the model's text output.
    """
    url = "https://api.cohere.ai/v1/generate"
    headers = {
        "Authorization": f"Bearer {COHERE_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "command-r-plus",  # Adjust based on the desired Cohere model
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature
    }
    response = requests.post(url, json=payload, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Cohere API error: {response.text}")
    result = response.json()
    return result["generations"][0]["text"]

def call_hf(model: str, prompt: str, max_new_tokens=200, max_retries=10, delay=5) -> str:
    """
    Calls a Hugging Face Inference endpoint for text generation.
    Retries if the model is still loading.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model}"
    payload = {
        "inputs": prompt,
        "parameters": {
            "do_sample": False,
            "max_new_tokens": max_new_tokens
        }
    }
    for attempt in range(max_retries):
        resp = requests.post(api_url, json=payload, headers=hf_headers)
        data = resp.json()
        if isinstance(data, dict) and data.get("error"):
            if "loading" in data["error"].lower():
                print(f"Attempt {attempt+1}/{max_retries}: Model is loading. Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                raise Exception(f"Error from model {model}: {data['error']}")
        else:
            # Data should be a list like [{ "generated_text": "..." }]
            return data[0]["generated_text"]
    raise Exception(f"Model {model} is still loading after {max_retries} attempts.")

def generate_answer(question: str, evaluated_model: str) -> str:
    """
    Generates an answer for the question using the specified evaluated model,
    or returns a placeholder answer when no real model name has been provided.
    """
    model = (evaluated_model or "").strip()
    # Treat an empty box or the UI's placeholder text as "no model selected".
    if not model or model.startswith("----") or model.lower() == "please enter model to evaluate":
        return f"Placeholder answer for: {question}"
    return call_hf(model, question)
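
# Illustrative behaviour: with the UI's default placeholder still in the model box,
#     generate_answer("What is 2+2?", "---- Feature not yet available ---------")
# returns "Placeholder answer for: What is 2+2?" instead of calling the Inference API.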

def judge_answer(question: str, answer: str) -> int:
    """
    Sends question+answer to the judge with system instructions to produce a numeric score (0 to 5).
    """
    prompt = (
        f"{system_instructions}\n\n"
        f"Question: {question}\n"
        f"Answer: {answer}\n\n"
        "Please provide a score from 0 to 5, where 5 is perfect and 0 is entirely incorrect. "
        "Provide only the numeric score in your response."
    )
    output = call_judge(prompt, max_tokens=200, temperature=0.7)
    match = re.search(r"\b([0-5])\b", output)
    if match:
        return int(match.group(1))
    return 0
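
# Illustrative sketch of the extraction above: for a judge reply such as
# "Score: 4 - mostly correct", the regex matches "4"; when no digit from 0 to 5
# is found in the reply, the answer falls back to a score of 0.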

######################################
# Main Evaluation
######################################
def evaluate_csv(csv_file, evaluated_model_name):
    """
    Reads a CSV with a 'question' column and an optional 'answer' column
    (answers are generated with the evaluated model when the column is missing).
    Scores each Q&A with the judge model (0..5).
    Returns (avg_score_percent, csv_temp_path).
    """
    df = pd.read_csv(csv_file)
    if "question" not in df.columns:
        raise ValueError("CSV must contain a 'question' column.")
    has_answer_col = ("answer" in df.columns)

    results = []
    for _, row in df.iterrows():
        q = str(row["question"])
        if has_answer_col:
            a = str(row["answer"])
        else:
            a = generate_answer(q, evaluated_model_name)
        score = judge_answer(q, a)
        results.append({"question": q, "answer": a, "score": score})

    if len(results) == 0:
        return 0.0, None

    total_score = sum(item["score"] for item in results)
    max_possible = len(results) * 5
    avg_score_percent = (total_score / max_possible) * 100

    # Build output CSV (comma-separated)
    out_df = pd.DataFrame(results)
    csv_str = out_df.to_csv(
        index=False,
        sep=',',  # Comma separated
        quotechar='"',
        quoting=csv.QUOTE_ALL,
        encoding='utf-8-sig'
    )
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv", encoding="utf-8-sig") as tmp_file:
        tmp_file.write(csv_str)
        tmp_file_path = tmp_file.name
    return avg_score_percent, tmp_file_path
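
# Illustrative input layout (assumption, based on the column checks above):
#     question,answer
#     "What is 2+2?","4"
#     "Name a primary colour.","Blue"
# When the 'answer' column is absent, answers come from generate_answer().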

def run_evaluation(csv_file, evaluated_model_name):
    """
    Gradio callback:
    1) Evaluates Q&A from the CSV.
    2) Returns a big box with % and a downloadable CSV.
    """
    avg_percentage, csv_path = evaluate_csv(csv_file, evaluated_model_name)
    # Build the same style box as the single Q&A will use
    score_box = f"""
    <div style="width:200px; height:200px; border:2px solid #333;
                display:flex; align-items:center; justify-content:center; font-size:30px;">
        {avg_percentage:.2f}%
    </div>
    """
    return score_box, csv_path

######################################
# Gradio Interface
######################################
with gr.Blocks() as demo:
    ####################################
    # Top row: Logo (left), Title + instructions (right)
    ####################################
    with gr.Row():
        with gr.Column(scale=1, min_width=220):
            gr.Image("logo.png", show_label=False, interactive=False, width=220, height=220)
        with gr.Column(scale=5):
            gr.Markdown("## H4rmony Eval")
            gr.Markdown(
                "- Evaluation can be run from a CSV file or from a single prompt/completion.\n"
                "- The CSV, if provided, should have **both a 'question' and an 'answer'** column.\n\n"
                "The judge model scores each Q&A on a **0–5** scale, and you'll see the final percentage score."
            )

    ####################################
    # Middle row:
    # 1) Upload CSV
    # 2) Download Results
    # 3) Score (big box)
    ####################################
    with gr.Row(equal_height=True):
        # Square #1: Upload CSV
        with gr.Column(scale=1):
            gr.Markdown("#### Upload CSV")
            csv_in = gr.File(label="CSV File", type="filepath")

        # Square #2: Download Results
        with gr.Column(scale=1):
            gr.Markdown("#### Download Results")
            csv_out = gr.File(label="Scored CSV", interactive=False)

        # Square #3: Score
        with gr.Column(scale=1):
            gr.Markdown("#### Score")
            score_html = gr.HTML(
                value="""
                <div style="width:200px; height:200px; border:2px solid #333;
                            display:flex; align-items:center; justify-content:center; font-size:30px;">
                    --
                </div>
                """,
                label="Final Score"
            )

    ####################################
    # Single Q&A
    ####################################
    gr.Markdown(
        """
        ---
        ### Single Q&A Evaluation
        Enter one question and one answer below, then click **Evaluate Single Q&A** to get a 0–5 score
        in the same box on the right.
        """
    )
    with gr.Row():
        single_q = gr.Textbox(
            lines=3,
            label="Single Question / Prompt"
        )
        single_a = gr.Textbox(
            lines=3,
            label="Single Answer"
        )

    def on_single_evaluate(q, a):
        score = judge_answer(q, a)
        # Show the numeric score in the same style as the CSV
        box = f"""
        <div style="width:200px; height:200px; border:2px solid #333;
                    display:flex; align-items:center; justify-content:center; font-size:30px;">
            {score}
        </div>
        """
        return box

    ####################################
    # Bottom row: Model + 2 Buttons (CSV & Single)
    ####################################
    with gr.Row():
        with gr.Column():
            model_in = gr.Textbox(
                label="Evaluated Model (WIP)",
                value="---- Feature not yet available ---------"
            )
            # Two buttons side by side:
            with gr.Row():
                submit_btn = gr.Button("Submit CSV")
                single_btn = gr.Button("Evaluate Single Q&A")

    ####################################
    # Define both callbacks
    ####################################
    def on_submit(csv_path, model_name):
        box, out_path = run_evaluation(csv_path, model_name)
        return box, out_path

    # Linking the two callbacks:
    # 1) CSV evaluation
    submit_btn.click(
        fn=on_submit,
        inputs=[csv_in, model_in],
        outputs=[score_html, csv_out]
    )

    # 2) Single Q&A evaluation
    single_btn.click(
        fn=on_single_evaluate,
        inputs=[single_q, single_a],
        outputs=score_html
    )

demo.launch()