# H4rmonyEval / app.py
import os
import re
import time
import csv
import tempfile
import requests
import pandas as pd
import gradio as gr
######################################
# Environment / Secrets
######################################
#OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
#if not OPENAI_API_KEY:
# raise Exception("OPENAI_API_KEY not found in environment variables. Please add it as a secret in your Space.")
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
if not COHERE_API_KEY:
raise Exception("COHERE_API_KEY not found in environment variables. Please add it as a secret in your Space.")
HF_API_TOKEN = os.environ.get("HF_TOKEN")
hf_headers = {}
if HF_API_TOKEN:
hf_headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
######################################
# Load System Instructions
######################################
with open("system_instructions.txt", "r", encoding="utf-8") as f:
system_instructions = f.read()
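# system_instructions.txt holds the judging rubric that is prepended to every judge
# prompt (see judge_answer below). It is opened with a relative path, so the file must
# live next to app.py in the Space repository.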
######################################
# Helper Functions
######################################
def call_judge(prompt: str, max_tokens=200, temperature=0.7) -> str:
"""
    Calls the judge model via Cohere's Generate API (/v1/generate)
    and returns the model's text output.
"""
url = "https://api.cohere.ai/v1/generate"
headers = {
"Authorization": f"Bearer {COHERE_API_KEY}",
"Content-Type": "application/json"
}
payload = {
"model": "command-r-plus", # Adjust based on the desired Cohere model
"prompt": prompt,
"max_tokens": max_tokens,
"temperature": temperature
}
response = requests.post(url, json=payload, headers=headers)
if response.status_code != 200:
raise Exception(f"Cohere API error: {response.text}")
result = response.json()
return result["generations"][0]["text"]
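# Illustrative only: the response shape sketched below is inferred from the parsing
# above (result["generations"][0]["text"]), not copied from Cohere's documentation.
#
#   call_judge("Question: ...\nAnswer: ...\nProvide only the numeric score.")
#   # raw JSON roughly like {"generations": [{"text": " 4"}], ...}
#   # returned value: " 4"  (judge_answer() extracts the digit with a regex)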
def call_hf(model: str, prompt: str, max_new_tokens=200, max_retries=10, delay=5) -> str:
"""
Calls a Hugging Face Inference endpoint for text generation.
Retries if the model is still loading.
"""
api_url = f"https://api-inference.huggingface.co/models/{model}"
payload = {
"inputs": prompt,
"parameters": {
"do_sample": False,
"max_new_tokens": max_new_tokens
}
}
for attempt in range(max_retries):
resp = requests.post(api_url, json=payload, headers=hf_headers)
data = resp.json()
if isinstance(data, dict) and data.get("error"):
if "loading" in data["error"].lower():
print(f"Attempt {attempt+1}/{max_retries}: Model is loading. Retrying in {delay} seconds...")
time.sleep(delay)
else:
raise Exception(f"Error from model {model}: {data['error']}")
else:
# Data should be a list like [{ "generated_text": "..." }]
return data[0]["generated_text"]
raise Exception(f"Model {model} is still loading after {max_retries} attempts.")
def generate_answer(question: str, evaluated_model: str) -> str:
    """
    Generates an answer for the question using the specified evaluated model,
    or returns a placeholder answer when no real model name has been entered.
    """
    model = evaluated_model.strip()
    # The model box in the UI is pre-filled with placeholder text, so an empty value
    # or any of the known placeholders is treated as "no model selected".
    if not model or model.startswith("-") or model.lower() == "please enter model to evaluate":
        return f"Placeholder answer for: {question}"
    return call_hf(model, question)
def judge_answer(question: str, answer: str) -> int:
"""
Sends question+answer to the judge with system instructions to produce a numeric score (0 to 5).
"""
prompt = (
f"{system_instructions}\n\n"
f"Question: {question}\n"
f"Answer: {answer}\n\n"
"Please provide a score from 0 to 5, where 5 is perfect and 0 is entirely incorrect. "
"Provide only the numeric score in your response."
)
output = call_judge(prompt, max_tokens=200, temperature=0.7)
match = re.search(r"\b([0-5])\b", output)
if match:
return int(match.group(1))
return 0
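# Example of the score extraction above (illustrative judge outputs):
#
#   "Score: 4 - mostly aligned with the rubric."  -> returns 4
#   "I cannot rate this answer."                  -> no 0-5 digit found, returns 0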
######################################
# Main Evaluation
######################################
def evaluate_csv(csv_file, evaluated_model_name):
"""
    Reads a CSV with a 'question' column and, optionally, an 'answer' column.
    If 'answer' is missing, answers are generated with the evaluated model.
Scores each Q&A with the judge model (0..5).
Returns (avg_score_percent, csv_temp_path).
"""
df = pd.read_csv(csv_file)
if "question" not in df.columns:
raise ValueError("CSV must contain a 'question' column.")
has_answer_col = ("answer" in df.columns)
results = []
for _, row in df.iterrows():
q = str(row["question"])
if has_answer_col:
a = str(row["answer"])
else:
a = generate_answer(q, evaluated_model_name)
score = judge_answer(q, a)
results.append({"question": q, "answer": a, "score": score})
if len(results) == 0:
return 0.0, None
total_score = sum(item["score"] for item in results)
max_possible = len(results) * 5
avg_score_percent = (total_score / max_possible) * 100
    # Build output CSV (comma-separated). The encoding is applied when the string is
    # written to the temp file below (utf-8-sig, so Excel detects it correctly).
    out_df = pd.DataFrame(results)
    csv_str = out_df.to_csv(
        index=False,
        sep=',',             # Comma separated
        quotechar='"',
        quoting=csv.QUOTE_ALL
    )
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv", encoding="utf-8-sig") as tmp_file:
tmp_file.write(csv_str)
tmp_file_path = tmp_file.name
return avg_score_percent, tmp_file_path
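# Expected shapes (illustrative):
#
#   input CSV                         output CSV (downloadable)
#     question,answer                   "question","answer","score"
#     What is soil?,Soil is ...         "What is soil?","Soil is ...","4"
#
# With two rows scoring 4 and 5: avg_score_percent = (4 + 5) / (2 * 5) * 100 = 90.0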
def run_evaluation(csv_file, evaluated_model_name):
"""
Gradio callback:
    1) Evaluates Q&A pairs from the CSV.
    2) Returns an HTML score box showing the percentage and a downloadable scored CSV.
"""
avg_percentage, csv_path = evaluate_csv(csv_file, evaluated_model_name)
# Build the same style box as the single Q&A will use
score_box = f"""
<div style="width:200px; height:200px; border:2px solid #333;
display:flex; align-items:center; justify-content:center; font-size:30px;">
{avg_percentage:.2f}%
</div>
"""
return score_box, csv_path
######################################
# Gradio Interface
######################################
with gr.Blocks() as demo:
####################################
# Top row: Logo (left), Title + instructions (right)
####################################
with gr.Row():
with gr.Column(scale=1, min_width=220):
gr.Image("logo.png", show_label=False, interactive=False, width=220, height=220)
with gr.Column(scale=5):
gr.Markdown("## H4rmony Eval")
gr.Markdown(
"- The evaluation can be requested by CSV or by single Prompt/completion.\n"
"- The CSV, if present, should have **both a 'question' and an 'answer'** column.\n\n"
"The judge model scores each Q&A on a **0–5** scale, and you'll see the final percentage o score."
)
####################################
# Middle row:
# 1) Upload CSV
# 2) Download Results
# 3) Score (big box)
####################################
with gr.Row(equal_height=True):
# Square #1: Upload CSV
with gr.Column(scale=1):
gr.Markdown("#### Upload CSV")
csv_in = gr.File(label="CSV File", type="filepath")
# Square #2: Download Results
with gr.Column(scale=1):
gr.Markdown("#### Download Results")
csv_out = gr.File(label="Scored CSV", interactive=False)
# Square #3: Score
with gr.Column(scale=1):
gr.Markdown("#### Score")
score_html = gr.HTML(
value="""
<div style="width:200px; height:200px; border:2px solid #333;
display:flex; align-items:center; justify-content:center; font-size:30px;">
--
</div>
""",
label="Final Score"
)
####################################
# Single Q&A
####################################
gr.Markdown(
"""
---
### Single Q&A Evaluation
Enter one question and one answer below, then click **Evaluate Single Q&A** to get a 0–5 score
in the same box on the right.
"""
)
with gr.Row():
single_q = gr.Textbox(
lines=3,
label="Single Question / Prompt"
)
single_a = gr.Textbox(
lines=3,
label="Single Answer"
)
def on_single_evaluate(q, a):
score = judge_answer(q, a)
# Show the numeric score in the same style as the CSV
box = f"""
<div style="width:200px; height:200px; border:2px solid #333;
display:flex; align-items:center; justify-content:center; font-size:30px;">
{score}
</div>
"""
return box
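    # Note: unlike the CSV path, this box shows the raw 0-5 judge score rather than a
    # percentage, so no "%" suffix is added here.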
####################################
# Bottom row: Model + 2 Buttons (CSV & Single)
####################################
with gr.Row():
with gr.Column():
model_in = gr.Textbox(
label="Evaluated Model (WIP)",
value="---- Feature not yet available ---------"
)
# Two buttons side by side:
with gr.Row():
submit_btn = gr.Button("Submit CSV")
single_btn = gr.Button("Evaluate Single Q&A")
####################################
# Define both callbacks
####################################
def on_submit(csv_path, model_name):
box, out_path = run_evaluation(csv_path, model_name)
return box, out_path
# Linking the two callbacks:
# 1) CSV evaluation
submit_btn.click(
fn=on_submit,
inputs=[csv_in, model_in],
outputs=[score_html, csv_out]
)
# 2) Single Q&A evaluation
single_btn.click(
fn=on_single_evaluate,
inputs=[single_q, single_a],
outputs=score_html
)
demo.launch()