Spaces:

neovalle
/

H4rmonyEval

Running

File size: 10,012 Bytes

import os
import re
import time
import csv
import tempfile
import requests
import pandas as pd
import gradio as gr

######################################
# Environment / Secrets
######################################

#OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
#if not OPENAI_API_KEY:
#   raise Exception("OPENAI_API_KEY not found in environment variables. Please add it as a secret in your Space.")


# The Cohere key is mandatory: call_judge() sends it as a Bearer token, so the
# module refuses to load without it and tells the operator how to fix that.
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
if not COHERE_API_KEY:
    raise Exception(
        "COHERE_API_KEY not found in environment variables. "
        "Please add it as a secret in your Space."
    )

# The Hugging Face token is optional; when it is absent the inference
# requests are sent with an empty header dict (no Authorization header).
HF_API_TOKEN = os.getenv("HF_TOKEN")
hf_headers = {"Authorization": f"Bearer {HF_API_TOKEN}"} if HF_API_TOKEN else {}

######################################
# Load System Instructions 
######################################

# Read the judge's system instructions once, while this module is being loaded.
# The path is relative, so it is resolved against the current working
# directory; open() raises FileNotFoundError if the file is not there.
# The resulting text is placed at the top of every prompt built by
# judge_answer().
with open("system_instructions.txt", "r", encoding="utf-8") as f:
    system_instructions = f.read()

######################################
# Helper Functions
######################################

def call_judge(prompt: str, max_tokens=200, temperature=0.7, timeout=60) -> str:
    """
    Send `prompt` to the Cohere text-generation endpoint (`/v1/generate`)
    and return the text of the first generation.

    Args:
        prompt: Full prompt text sent to the judge model.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to the API.
        timeout: Seconds to wait for the HTTP call before giving up, so a
            stalled connection cannot block the UI callback forever.

    Returns:
        The generated text of the first returned generation.

    Raises:
        Exception: If the API answers with a non-200 status or with a body
            that does not contain a generation.
        requests.exceptions.RequestException: On network failures or timeout.
    """
    url = "https://api.cohere.ai/v1/generate"
    headers = {
        "Authorization": f"Bearer {COHERE_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "command-r-plus",  # Adjust based on the desired Cohere model
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }

    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Cohere API error: {response.text}")

    result = response.json()
    try:
        return result["generations"][0]["text"]
    except (KeyError, IndexError, TypeError) as err:
        # A 200 response without a usable generation would otherwise surface
        # as a bare KeyError/IndexError with no hint about what came back.
        raise Exception(f"Cohere API error: unexpected response {result!r}") from err

def call_hf(model: str, prompt: str, max_new_tokens=200, max_retries=10, delay=5, timeout=60) -> str:
    """
    Call the Hugging Face Inference endpoint of `model` for text generation.

    The request is repeated while the endpoint reports that the model is
    still loading, waiting `delay` seconds between attempts.

    Args:
        model: Model id appended to the inference API URL.
        prompt: Text sent as the `inputs` field.
        max_new_tokens: Generation length limit forwarded to the API.
        max_retries: Maximum number of requests to send.
        delay: Seconds to sleep between two attempts.
        timeout: Seconds to wait for each HTTP call before giving up.

    Returns:
        The `generated_text` field of the first item in the response.

    Raises:
        Exception: If the endpoint reports a non-loading error, returns a
            body that is not JSON or not in the expected shape, or is still
            loading after `max_retries` attempts.
        requests.exceptions.RequestException: On network failures or timeout.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model}"
    payload = {
        "inputs": prompt,
        "parameters": {
            "do_sample": False,
            "max_new_tokens": max_new_tokens,
        },
    }

    for attempt in range(1, max_retries + 1):
        resp = requests.post(api_url, json=payload, headers=hf_headers, timeout=timeout)
        try:
            data = resp.json()
        except ValueError as err:
            # Gateways may answer with HTML/plain text; report that clearly
            # instead of leaking a JSON decode error.
            raise Exception(
                f"Error from model {model}: non-JSON response "
                f"(HTTP {resp.status_code}): {resp.text[:200]}"
            ) from err

        if isinstance(data, dict) and data.get("error"):
            # The error field is not guaranteed to be a string.
            error_text = str(data["error"])
            if "loading" not in error_text.lower():
                raise Exception(f"Error from model {model}: {error_text}")
            if attempt == max_retries:
                # No further request will be made, so sleeping is pointless.
                break
            print(f"Attempt {attempt}/{max_retries}: Model is loading. Retrying in {delay} seconds...")
            time.sleep(delay)
            continue

        # Data should be a list like [{ "generated_text": "..." }]
        try:
            return data[0]["generated_text"]
        except (KeyError, IndexError, TypeError) as err:
            raise Exception(f"Error from model {model}: unexpected response {data!r}") from err

    raise Exception(f"Model {model} is still loading after {max_retries} attempts.")

def generate_answer(question: str, evaluated_model: str) -> str:
    """
    Produce an answer for `question` with the model under evaluation.

    When no real model name is supplied, a placeholder answer is returned
    instead of calling the Hugging Face API. "No real model" means:
    `None`, an empty/whitespace-only string, or one of the hint texts the
    UI shows in the model textbox.

    Args:
        question: The prompt to answer.
        evaluated_model: Hugging Face model id, or a placeholder value.

    Returns:
        The generated answer, or "Placeholder answer for: <question>".
    """
    # Compared against the stripped, lower-cased input.
    placeholder_values = {
        "",
        "please enter model to evaluate",
        # Default value of the "Evaluated Model (WIP)" textbox in the UI.
        "---- feature not yet available ---------",
    }

    model_name = (evaluated_model or "").strip()
    if model_name.lower() in placeholder_values:
        return f"Placeholder answer for: {question}"
    return call_hf(model_name, question)

def judge_answer(question: str, answer: str) -> int:
    """
    Ask the judge model to rate one question/answer pair.

    The prompt is the system instructions followed by the pair and a request
    for a bare 0-5 score. The first standalone digit in the range 0-5 found
    in the judge's reply is returned; if there is none, the score is 0.
    """
    prompt_parts = [
        f"{system_instructions}\n\n",
        f"Question: {question}\n",
        f"Answer: {answer}\n\n",
        "Please provide a score from 0 to 5, where 5 is perfect and 0 is entirely incorrect. ",
        "Provide only the numeric score in your response.",
    ]
    judge_reply = call_judge("".join(prompt_parts), max_tokens=200, temperature=0.7)

    found = re.search(r"\b([0-5])\b", judge_reply)
    return int(found.group(1)) if found else 0

######################################
# Main Evaluation
######################################

def evaluate_csv(csv_file, evaluated_model_name):
    """
    Score every row of a CSV with the judge model (0..5 per row).

    The CSV must have a 'question' column. If it also has an 'answer'
    column those answers are judged; otherwise an answer is generated for
    each question with `evaluated_model_name`.

    Args:
        csv_file: Path (or buffer) of the CSV to evaluate.
        evaluated_model_name: Model used to generate answers when the CSV
            has no 'answer' column.

    Returns:
        (avg_score_percent, csv_temp_path): the mean score as a percentage
        of the maximum, and the path of a temporary CSV holding question,
        answer and score per row. Returns (0.0, None) when the CSV has no
        data rows.

    Raises:
        ValueError: If no file is given or the 'question' column is missing.
    """
    if csv_file is None or csv_file == "":
        raise ValueError("No CSV file provided. Please upload a CSV first.")

    # Read every cell as text and disable NA parsing: otherwise empty cells
    # and literal answers such as "N/A", "None" or "null" become NaN (and
    # were judged as the string "nan"), and numeric-looking text is
    # reformatted (e.g. "3" -> "3.0" in a float column).
    df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)
    if "question" not in df.columns:
        raise ValueError("CSV must contain a 'question' column.")

    has_answer_col = "answer" in df.columns

    def _cell_text(value) -> str:
        # Defensive: any value that is still missing is treated as empty text.
        return "" if pd.isna(value) else str(value)

    results = []
    for row in df.to_dict("records"):
        q = _cell_text(row["question"])
        if has_answer_col:
            a = _cell_text(row["answer"])
        else:
            a = generate_answer(q, evaluated_model_name)
        results.append({"question": q, "answer": a, "score": judge_answer(q, a)})

    if not results:
        return 0.0, None

    total_score = sum(item["score"] for item in results)
    max_possible = len(results) * 5
    avg_score_percent = (total_score / max_possible) * 100

    # Reserve a temp file name, then let pandas write to that path itself.
    # Going through a CSV string and a text-mode file translated newlines a
    # second time, doubling line endings where os.linesep is "\r\n".
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
        tmp_file_path = tmp_file.name

    pd.DataFrame(results).to_csv(
        tmp_file_path,
        index=False,
        sep=",",                 # Comma separated
        quotechar='"',
        quoting=csv.QUOTE_ALL,
        encoding="utf-8-sig",    # BOM so spreadsheet tools detect UTF-8
    )

    return avg_score_percent, tmp_file_path

def run_evaluation(csv_file, evaluated_model_name):
    """
    Gradio callback for the CSV flow.

    Runs evaluate_csv() on the uploaded file and returns a pair:
    the HTML for the square score box showing the average percentage with
    two decimals, and the path of the scored CSV offered for download.
    """
    average, scored_csv_path = evaluate_csv(csv_file, evaluated_model_name)
    percent_text = f"{average:.2f}%"
    # Same box markup as the one used for the single Q&A score.
    return f"""
    <div style="width:200px; height:200px; border:2px solid #333;
                display:flex; align-items:center; justify-content:center; font-size:30px;">
      {percent_text}
    </div>
    """, scored_csv_path

######################################
# Gradio Interface
######################################

# Build the whole UI inside one Blocks context. Components are laid out in
# the order they are created, so the statement order below is the page order.
with gr.Blocks() as demo:
    ####################################
    # Top row: Logo (left), Title + instructions (right)
    ####################################
    with gr.Row():
        with gr.Column(scale=1, min_width=220):
            gr.Image("logo.png", show_label=False, interactive=False, width=220, height=220)
        with gr.Column(scale=5):
            gr.Markdown("## H4rmony Eval")
            gr.Markdown(
                "- The evaluation can be requested by CSV or by single Prompt/completion.\n"
                "- The CSV, if present, should have **both a 'question' and an 'answer'** column.\n\n"
                "The judge model scores each Q&A on a **0–5** scale, and you'll see the final percentage or score."
            )

    ####################################
    # Middle row:
    #   1) Upload CSV
    #   2) Download Results
    #   3) Score (big box)
    ####################################
    with gr.Row(equal_height=True):
        # Square #1: Upload CSV
        with gr.Column(scale=1):
            gr.Markdown("#### Upload CSV")
            csv_in = gr.File(label="CSV File", type="filepath")

        # Square #2: Download Results
        with gr.Column(scale=1):
            gr.Markdown("#### Download Results")
            csv_out = gr.File(label="Scored CSV", interactive=False)

        # Square #3: Score (shows "--" until an evaluation has run)
        with gr.Column(scale=1):
            gr.Markdown("#### Score")
            score_html = gr.HTML(
                value="""
                <div style="width:200px; height:200px; border:2px solid #333;
                            display:flex; align-items:center; justify-content:center; font-size:30px;">
                  --
                </div>
                """,
                label="Final Score"
            )

    ####################################
    # Single Q&A
    ####################################
    gr.Markdown(
        """
        ---
        ### Single Q&A Evaluation
        Enter one question and one answer below, then click **Evaluate Single Q&A** to get a 0–5 score 
        in the same box on the right.
        """
    )

    with gr.Row():
        single_q = gr.Textbox(
            lines=3,
            label="Single Question / Prompt"
        )
        single_a = gr.Textbox(
            lines=3,
            label="Single Answer"
        )

    def on_single_evaluate(q, a):
        """Judge one question/answer pair and return the score box HTML."""
        score = judge_answer(q, a)
        # Show the numeric score in the same style as the CSV
        box = f"""
        <div style="width:200px; height:200px; border:2px solid #333;
                    display:flex; align-items:center; justify-content:center; font-size:30px;">
          {score}
        </div>
        """
        return box

    ####################################
    # Bottom row: Model + 2 Buttons (CSV & Single)
    ####################################
    with gr.Row():
        with gr.Column():
            # The textbox value is passed to the CSV evaluation as the model name.
            model_in = gr.Textbox(
                label="Evaluated Model (WIP)",
                value="---- Feature not yet available ---------"
            )

            # Two buttons side by side:
            with gr.Row():
                submit_btn = gr.Button("Submit CSV")
                single_btn = gr.Button("Evaluate Single Q&A")

    ####################################
    # Define both callbacks
    ####################################
    def on_submit(csv_path, model_name):
        """Evaluate the uploaded CSV; return (score box HTML, scored CSV path)."""
        box, out_path = run_evaluation(csv_path, model_name)
        return box, out_path

    # Linking the two callbacks:
    #  1) CSV evaluation -> updates the score box and the download slot
    submit_btn.click(
        fn=on_submit,
        inputs=[csv_in, model_in],
        outputs=[score_html, csv_out]
    )
    #  2) Single Q&A evaluation -> updates the same score box
    single_btn.click(
        fn=on_single_evaluate,
        inputs=[single_q, single_a],
        outputs=score_html
    )

# Start the app as soon as the module is executed.
demo.launch()