import os
import re
import time
import csv
import tempfile

import requests
import pandas as pd
import gradio as gr

######################################
# Environment / Secrets
######################################
# OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# if not OPENAI_API_KEY:
#     raise Exception("OPENAI_API_KEY not found in environment variables. Please add it as a secret in your Space.")

COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
if not COHERE_API_KEY:
    raise Exception("COHERE_API_KEY not found in environment variables. Please add it as a secret in your Space.")

HF_API_TOKEN = os.environ.get("HF_TOKEN")
hf_headers = {}
if HF_API_TOKEN:
    hf_headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}

######################################
# Load System Instructions
######################################
with open("system_instructions.txt", "r", encoding="utf-8") as f:
    system_instructions = f.read()

######################################
# Helper Functions
######################################
def call_judge(prompt: str, max_tokens=200, temperature=0.7) -> str:
    """
    Calls the judge via the Cohere Generate API and returns the model's text output.
    """
    url = "https://api.cohere.ai/v1/generate"
    headers = {
        "Authorization": f"Bearer {COHERE_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "command-r-plus",  # Adjust based on the desired Cohere model
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature
    }
    response = requests.post(url, json=payload, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Cohere API error: {response.text}")
    result = response.json()
    return result["generations"][0]["text"]


def call_hf(model: str, prompt: str, max_new_tokens=200, max_retries=10, delay=5) -> str:
    """
    Calls a Hugging Face Inference endpoint for text generation.
    Retries if the model is still loading.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model}"
    payload = {
        "inputs": prompt,
        "parameters": {
            "do_sample": False,
            "max_new_tokens": max_new_tokens
        }
    }
    for attempt in range(max_retries):
        resp = requests.post(api_url, json=payload, headers=hf_headers)
        data = resp.json()
        if isinstance(data, dict) and data.get("error"):
            if "loading" in data["error"].lower():
                print(f"Attempt {attempt+1}/{max_retries}: Model is loading. Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                raise Exception(f"Error from model {model}: {data['error']}")
        else:
            # Data should be a list like [{"generated_text": "..."}]
            return data[0]["generated_text"]
    raise Exception(f"Model {model} is still loading after {max_retries} attempts.")


def generate_answer(question: str, evaluated_model: str) -> str:
    """
    Generates an answer for the question using the specified evaluated model,
    or returns a placeholder answer if the model field still contains the
    default "Please enter model to evaluate" text.
    """
    if evaluated_model.strip().lower() == "please enter model to evaluate":
        return f"Placeholder answer for: {question}"
    else:
        return call_hf(evaluated_model, question)


def judge_answer(question: str, answer: str) -> int:
    """
    Sends the question and answer to the judge, together with the system
    instructions, and parses a numeric score (0 to 5) from its reply.
    """
    prompt = (
        f"{system_instructions}\n\n"
        f"Question: {question}\n"
        f"Answer: {answer}\n\n"
        "Please provide a score from 0 to 5, where 5 is perfect and 0 is entirely incorrect. "
        "Provide only the numeric score in your response."
    )
    output = call_judge(prompt, max_tokens=200, temperature=0.7)
    match = re.search(r"\b([0-5])\b", output)
    if match:
        return int(match.group(1))
    return 0

######################################
# Main Evaluation
######################################
def evaluate_csv(csv_file, evaluated_model_name):
    """
    Reads a CSV with a 'question' column and an optional 'answer' column.
    Scores each Q&A pair with the judge model (0..5).
    Returns (avg_score_percent, csv_temp_path).
    """
    df = pd.read_csv(csv_file)
    if "question" not in df.columns:
        raise ValueError("CSV must contain a 'question' column.")

    has_answer_col = ("answer" in df.columns)

    results = []
    for _, row in df.iterrows():
        q = str(row["question"])
        if has_answer_col:
            a = str(row["answer"])
        else:
            a = generate_answer(q, evaluated_model_name)
        score = judge_answer(q, a)
        results.append({"question": q, "answer": a, "score": score})

    if len(results) == 0:
        return 0.0, None

    total_score = sum(item["score"] for item in results)
    max_possible = len(results) * 5
    avg_score_percent = (total_score / max_possible) * 100

    # Build output CSV (comma-separated)
    out_df = pd.DataFrame(results)
    csv_str = out_df.to_csv(
        index=False,
        sep=',',  # Comma separated
        quotechar='"',
        quoting=csv.QUOTE_ALL,
        encoding='utf-8-sig'
    )

    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv", encoding="utf-8-sig") as tmp_file:
        tmp_file.write(csv_str)
        tmp_file_path = tmp_file.name

    return avg_score_percent, tmp_file_path


def run_evaluation(csv_file, evaluated_model_name):
    """
    Gradio callback:
      1) Evaluates Q&A from the CSV.
      2) Returns a big box with the % score and a downloadable CSV.
    """
    avg_percentage, csv_path = evaluate_csv(csv_file, evaluated_model_name)

    # Build the same style box as the single Q&A will use
    score_box = f"""