import os
import re
import time
import csv
import tempfile

import requests
import pandas as pd
import gradio as gr

######################################
# Environment / Secrets
######################################
#OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
#if not OPENAI_API_KEY:
#    raise Exception("OPENAI_API_KEY not found in environment variables. Please add it as a secret in your Space.")

COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
if not COHERE_API_KEY:
    raise Exception("COHERE_API_KEY not found in environment variables. Please add it as a secret in your Space.")

HF_API_TOKEN = os.environ.get("HF_TOKEN")
hf_headers = {}
if HF_API_TOKEN:
    hf_headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}

######################################
# Load System Instructions
######################################
with open("system_instructions.txt", "r", encoding="utf-8") as f:
    system_instructions = f.read()

######################################
# Helper Functions
######################################
def call_judge(prompt: str, max_tokens=200, temperature=0.7) -> str:
    """
    Calls the judge model via Cohere's Generate API and returns the generated text.
    """
    url = "https://api.cohere.ai/v1/generate"
    headers = {
        "Authorization": f"Bearer {COHERE_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "command-r-plus",  # Adjust based on the desired Cohere model
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature
    }
    response = requests.post(url, json=payload, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Cohere API error: {response.text}")
    result = response.json()
    return result["generations"][0]["text"]


def call_hf(model: str, prompt: str, max_new_tokens=200, max_retries=10, delay=5) -> str:
    """
    Calls a Hugging Face Inference endpoint for text generation.
    Retries if the model is still loading.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model}"
    payload = {
        "inputs": prompt,
        "parameters": {
            "do_sample": False,
            "max_new_tokens": max_new_tokens
        }
    }
    for attempt in range(max_retries):
        resp = requests.post(api_url, json=payload, headers=hf_headers)
        data = resp.json()
        if isinstance(data, dict) and data.get("error"):
            if "loading" in data["error"].lower():
                print(f"Attempt {attempt+1}/{max_retries}: Model is loading. Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                raise Exception(f"Error from model {model}: {data['error']}")
        else:
            # Data should be a list like [{ "generated_text": "..." }]
            return data[0]["generated_text"]
    raise Exception(f"Model {model} is still loading after {max_retries} attempts.")


def generate_answer(question: str, evaluated_model: str) -> str:
    """
    Generates an answer for the question using the specified evaluated model,
    or a placeholder answer if no usable model id is given.
    """
    model = evaluated_model.strip()
    # The model field in the UI still holds a "feature not yet available" placeholder,
    # so only call the HF Inference API for values that look like a real model id
    # (e.g. "org/model-name").
    if not model or "/" not in model:
        return f"Placeholder answer for: {question}"
    return call_hf(model, question)


def judge_answer(question: str, answer: str) -> int:
    """
    Sends question+answer to the judge with system instructions
    to produce a numeric score (0 to 5).
    """
    prompt = (
        f"{system_instructions}\n\n"
        f"Question: {question}\n"
        f"Answer: {answer}\n\n"
        "Please provide a score from 0 to 5, where 5 is perfect and 0 is entirely incorrect. "
        "Provide only the numeric score in your response."
    )
    output = call_judge(prompt, max_tokens=200, temperature=0.7)
    match = re.search(r"\b([0-5])\b", output)
    if match:
        return int(match.group(1))
    return 0
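
######################################
# Input format & standalone use (illustrative)
######################################
# The evaluation CSV (see evaluate_csv below) needs a 'question' column; an
# 'answer' column is optional -- when it is missing, answers come from
# generate_answer(). The rows below are made-up examples, not real data:
#
#   "question","answer"
#   "What is the capital of France?","Paris"
#   "Name one renewable energy source.","Solar power"
#
# judge_answer() keeps only the first standalone digit 0-5 found in the judge's
# reply, so an output such as "Score: 4" is parsed as 4.
#
# The pipeline can also be driven without the Gradio UI, e.g. (the file name is
# only an example):
#
#   avg_percent, scored_path = evaluate_csv("questions.csv", "")
#   print(f"Average score: {avg_percent:.2f}% -> {scored_path}")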
######################################
# Main Evaluation
######################################
def evaluate_csv(csv_file, evaluated_model_name):
    """
    Reads a CSV with a 'question' column and, optionally, an 'answer' column.
    Scores each Q&A with the judge model (0..5).
    Returns (avg_score_percent, csv_temp_path).
    """
    df = pd.read_csv(csv_file)
    if "question" not in df.columns:
        raise ValueError("CSV must contain a 'question' column.")

    has_answer_col = ("answer" in df.columns)

    results = []
    for _, row in df.iterrows():
        q = str(row["question"])
        if has_answer_col:
            a = str(row["answer"])
        else:
            a = generate_answer(q, evaluated_model_name)
        score = judge_answer(q, a)
        results.append({"question": q, "answer": a, "score": score})

    if len(results) == 0:
        return 0.0, None

    total_score = sum(item["score"] for item in results)
    max_possible = len(results) * 5
    avg_score_percent = (total_score / max_possible) * 100

    # Build output CSV (comma-separated, fully quoted)
    out_df = pd.DataFrame(results)
    csv_str = out_df.to_csv(
        index=False,
        sep=',',
        quotechar='"',
        quoting=csv.QUOTE_ALL
    )

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv", encoding="utf-8-sig") as tmp_file:
        tmp_file.write(csv_str)
        tmp_file_path = tmp_file.name

    return avg_score_percent, tmp_file_path


def run_evaluation(csv_file, evaluated_model_name):
    """
    Gradio callback:
    1) Evaluates Q&A from the CSV.
    2) Returns the score box with the overall percentage and a downloadable CSV.
    """
    avg_percentage, csv_path = evaluate_csv(csv_file, evaluated_model_name)

    # Build the same style box as the single Q&A will use
    score_box = f"""
{avg_percentage:.2f}%
"""
    return score_box, csv_path


######################################
# Gradio Interface
######################################
with gr.Blocks() as demo:
    ####################################
    # Top row: Logo (left), Title + instructions (right)
    ####################################
    with gr.Row():
        with gr.Column(scale=1, min_width=220):
            gr.Image("logo.png", show_label=False, interactive=False, width=220, height=220)
        with gr.Column(scale=5):
            gr.Markdown("## H4rmony Eval")
            gr.Markdown(
                "- The evaluation can be run from a CSV file or from a single prompt/completion pair.\n"
                "- The CSV, if used, should have **both a 'question' and an 'answer'** column.\n\n"
                "The judge model scores each Q&A on a **0–5** scale, and you'll see the final percentage score."
            )

    ####################################
    # Middle row:
    #   1) Upload CSV
    #   2) Download Results
    #   3) Score (big box)
    ####################################
    with gr.Row(equal_height=True):
        # Square #1: Upload CSV
        with gr.Column(scale=1):
            gr.Markdown("#### Upload CSV")
            csv_in = gr.File(label="CSV File", type="filepath")

        # Square #2: Download Results
        with gr.Column(scale=1):
            gr.Markdown("#### Download Results")
            csv_out = gr.File(label="Scored CSV", interactive=False)

        # Square #3: Score
        with gr.Column(scale=1):
            gr.Markdown("#### Score")
            score_html = gr.HTML(
                value="""
--
""",
                label="Final Score"
            )

    ####################################
    # Single Q&A
    ####################################
    gr.Markdown(
        """
---
### Single Q&A Evaluation
Enter one question and one answer below, then click **Evaluate Single Q&A**
to get a 0–5 score in the same box on the right.
"""
    )
    with gr.Row():
        single_q = gr.Textbox(
            lines=3,
            label="Single Question / Prompt"
        )
        single_a = gr.Textbox(
            lines=3,
            label="Single Answer"
        )

    def on_single_evaluate(q, a):
        score = judge_answer(q, a)
        # Show the numeric score in the same style as the CSV score box
        box = f"""
{score}
"""
        return box

    ####################################
    # Bottom row: Model + 2 Buttons (CSV & Single)
    ####################################
    with gr.Row():
        with gr.Column():
            model_in = gr.Textbox(
                label="Evaluated Model (WIP)",
                value="---- Feature not yet available ---------"
            )
            # Two buttons side by side:
            with gr.Row():
                submit_btn = gr.Button("Submit CSV")
                single_btn = gr.Button("Evaluate Single Q&A")

    ####################################
    # Define both callbacks
    ####################################
    def on_submit(csv_path, model_name):
        box, out_path = run_evaluation(csv_path, model_name)
        return box, out_path

    # Wire up the two callbacks:
    # 1) CSV evaluation
    submit_btn.click(
        fn=on_submit,
        inputs=[csv_in, model_in],
        outputs=[score_html, csv_out]
    )

    # 2) Single Q&A evaluation
    single_btn.click(
        fn=on_single_evaluate,
        inputs=[single_q, single_a],
        outputs=score_html
    )

demo.launch()