Create app.py
app.py
ADDED
@@ -0,0 +1,303 @@
import os
import re
import time
import csv
import tempfile
import requests
import pandas as pd
import gradio as gr

######################################
# Environment / Secrets
######################################

#OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
#if not OPENAI_API_KEY:
#    raise Exception("OPENAI_API_KEY not found in environment variables. Please add it as a secret in your Space.")

# Prefer environment variables (Space secrets); fall back to Colab's userdata
# helper only when the script is run in a notebook, where google.colab exists.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
HF_API_TOKEN = os.environ.get("HF_TOKEN")
try:
    from google.colab import userdata  # only importable inside Google Colab
    COHERE_API_KEY = COHERE_API_KEY or userdata.get('cohere_key')
    HF_API_TOKEN = HF_API_TOKEN or userdata.get('hf_token')
except ImportError:
    pass

if not COHERE_API_KEY:
    raise Exception("COHERE_API_KEY not found in environment variables. Please add it as a secret in your Space.")

hf_headers = {}
if HF_API_TOKEN:
    hf_headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}

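# Note (assumption about deployment): on a Space the two keys are expected as
# repository secrets named COHERE_API_KEY and HF_TOKEN; in Colab they are read
# from userdata entries named 'cohere_key' and 'hf_token'.
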
######################################
# Load System Instructions
######################################

with open("system_instructions.txt", "r", encoding="utf-8") as f:
    system_instructions = f.read()

######################################
# Helper Functions
######################################

def call_judge(prompt: str, max_tokens=200, temperature=0.7) -> str:
    """
    Calls the judge model through Cohere's Generate API
    and returns the model's text output.
    """
    url = "https://api.cohere.ai/v1/generate"
    headers = {
        "Authorization": f"Bearer {COHERE_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "command-r-plus",  # Adjust based on the desired Cohere model
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature
    }

    response = requests.post(url, json=payload, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Cohere API error: {response.text}")
    result = response.json()
    return result["generations"][0]["text"]

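# Illustrative sketch, not executed by the app: call_judge() only relies on the
# "generations"[0]["text"] field of Cohere's Generate response, so a successful
# reply is assumed to look roughly like:
#
#   {"generations": [{"text": " 4"}], ...}
#
# e.g. call_judge("Reply with the word OK.", max_tokens=5) should come back as a
# short text string that the caller can parse.
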
def call_hf(model: str, prompt: str, max_new_tokens=200, max_retries=10, delay=5) -> str:
    """
    Calls a Hugging Face Inference endpoint for text generation.
    Retries if the model is still loading.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model}"
    payload = {
        "inputs": prompt,
        "parameters": {
            "do_sample": False,
            "max_new_tokens": max_new_tokens
        }
    }

    for attempt in range(max_retries):
        resp = requests.post(api_url, json=payload, headers=hf_headers)
        data = resp.json()
        if isinstance(data, dict) and data.get("error"):
            if "loading" in data["error"].lower():
                print(f"Attempt {attempt+1}/{max_retries}: Model is loading. Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                raise Exception(f"Error from model {model}: {data['error']}")
        else:
            # Data should be a list like [{ "generated_text": "..." }]
            return data[0]["generated_text"]
    raise Exception(f"Model {model} is still loading after {max_retries} attempts.")

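# Illustrative sketch of the request/response shapes call_hf() assumes for the
# hosted Inference API (model names below are placeholders, not recommendations):
#
#   request:  {"inputs": "<prompt>", "parameters": {"do_sample": False, "max_new_tokens": 200}}
#   success:  [{"generated_text": "<prompt continued...>"}]
#   loading:  {"error": "Model some-org/some-model is currently loading", "estimated_time": 20.0}
#
# The loading case is what triggers the retry loop above.
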
def generate_answer(question: str, evaluated_model: str) -> str:
    """
    Generates an answer for the question with the evaluated model,
    or returns a placeholder answer when no usable model name is given.
    """
    name = (evaluated_model or "").strip()
    # The "Evaluated Model" field is still a WIP placeholder in the UI, so empty
    # or placeholder-looking values fall back to a stub answer instead of a model call.
    if not name or name.lower().startswith(("----", "please enter")):
        return f"Placeholder answer for: {question}"
    return call_hf(name, question)

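# Example of the fallback behavior (illustrative): with the UI's current placeholder
# value, generate_answer("What is RLHF?", "---- Feature not yet available ---------")
# returns "Placeholder answer for: What is RLHF?" rather than calling the Inference API.
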
def judge_answer(question: str, answer: str) -> int:
    """
    Sends question+answer to the judge with system instructions to produce a numeric score (0 to 5).
    """
    prompt = (
        f"{system_instructions}\n\n"
        f"Question: {question}\n"
        f"Answer: {answer}\n\n"
        "Please provide a score from 0 to 5, where 5 is perfect and 0 is entirely incorrect. "
        "Provide only the numeric score in your response."
    )
    output = call_judge(prompt, max_tokens=200, temperature=0.7)
    match = re.search(r"\b([0-5])\b", output)
    if match:
        return int(match.group(1))
    return 0

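# Worked example of the score extraction above (the regex r"\b([0-5])\b" takes the
# first standalone digit in range): a judge reply of "Score: 4" or "4/5, mostly
# correct" parses to 4, while a reply containing no digit in 0-5 falls back to 0.
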
######################################
# Main Evaluation
######################################

def evaluate_csv(csv_file, evaluated_model_name):
    """
    Reads a CSV with a 'question' column and, optionally, an 'answer' column.
    Missing answers are generated with the evaluated model; each Q&A is then
    scored by the judge model (0..5).
    Returns (avg_score_percent, csv_temp_path).
    """
    df = pd.read_csv(csv_file)
    if "question" not in df.columns:
        raise ValueError("CSV must contain a 'question' column.")

    has_answer_col = ("answer" in df.columns)
    results = []
    for _, row in df.iterrows():
        q = str(row["question"])
        if has_answer_col:
            a = str(row["answer"])
        else:
            a = generate_answer(q, evaluated_model_name)
        score = judge_answer(q, a)
        results.append({"question": q, "answer": a, "score": score})

    if len(results) == 0:
        return 0.0, None

    total_score = sum(item["score"] for item in results)
    max_possible = len(results) * 5
    avg_score_percent = (total_score / max_possible) * 100

    # Build output CSV (comma-separated); the BOM-friendly encoding is applied
    # when the temp file is written below.
    out_df = pd.DataFrame(results)
    csv_str = out_df.to_csv(
        index=False,
        sep=',',  # Comma separated
        quotechar='"',
        quoting=csv.QUOTE_ALL
    )
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv", encoding="utf-8-sig") as tmp_file:
        tmp_file.write(csv_str)
        tmp_file_path = tmp_file.name

    return avg_score_percent, tmp_file_path

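# Illustrative input CSV for evaluate_csv() (the header names are what the code
# reads; the rows themselves are made up):
#
#   "question","answer"
#   "What does HTTP stand for?","HyperText Transfer Protocol"
#   "Name a greenhouse gas.","Carbon dioxide"
#
# Scoring example: 2 rows scored 4 and 5 give (4 + 5) / (2 * 5) * 100 = 90.0%.
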
def run_evaluation(csv_file, evaluated_model_name):
    """
    Gradio callback:
    1) Evaluates Q&A from the CSV.
    2) Returns a big box with % and a downloadable CSV.
    """
    avg_percentage, csv_path = evaluate_csv(csv_file, evaluated_model_name)
    # Build the same style box as the single Q&A will use
    score_box = f"""
    <div style="width:200px; height:200px; border:2px solid #333;
                display:flex; align-items:center; justify-content:center; font-size:30px;">
        {avg_percentage:.2f}%
    </div>
    """
    return score_box, csv_path

######################################
# Gradio Interface
######################################

with gr.Blocks() as demo:
    ####################################
    # Top row: Logo (left), Title + instructions (right)
    ####################################
    with gr.Row():
        with gr.Column(scale=1, min_width=220):
            gr.Image("logo.png", show_label=False, interactive=False, width=220, height=220)
        with gr.Column(scale=5):
            gr.Markdown("## H4rmony Eval")
            gr.Markdown(
                "- The evaluation can be requested by CSV or by a single prompt/completion.\n"
                "- The CSV, if present, should have **both a 'question' and an 'answer'** column.\n\n"
                "The judge model scores each Q&A on a **0–5** scale, and you'll see the final percentage score."
            )

    ####################################
    # Middle row:
    #   1) Upload CSV
    #   2) Download Results
    #   3) Score (big box)
    ####################################
    with gr.Row(equal_height=True):
        # Square #1: Upload CSV
        with gr.Column(scale=1):
            gr.Markdown("#### Upload CSV")
            csv_in = gr.File(label="CSV File", type="filepath")

        # Square #2: Download Results
        with gr.Column(scale=1):
            gr.Markdown("#### Download Results")
            csv_out = gr.File(label="Scored CSV", interactive=False)

        # Square #3: Score
        with gr.Column(scale=1):
            gr.Markdown("#### Score")
            score_html = gr.HTML(
                value="""
                <div style="width:200px; height:200px; border:2px solid #333;
                            display:flex; align-items:center; justify-content:center; font-size:30px;">
                    --
                </div>
                """,
                label="Final Score"
            )

    ####################################
    # Single Q&A
    ####################################
    gr.Markdown(
        """
        ---
        ### Single Q&A Evaluation
        Enter one question and one answer below, then click **Evaluate Single Q&A** to get a 0–5 score
        in the same box on the right.
        """
    )

    with gr.Row():
        single_q = gr.Textbox(
            lines=3,
            label="Single Question / Prompt"
        )
        single_a = gr.Textbox(
            lines=3,
            label="Single Answer"
        )

    def on_single_evaluate(q, a):
        score = judge_answer(q, a)
        # Show the numeric score in the same style as the CSV
        box = f"""
        <div style="width:200px; height:200px; border:2px solid #333;
                    display:flex; align-items:center; justify-content:center; font-size:30px;">
            {score}
        </div>
        """
        return box

    ####################################
    # Bottom row: Model + 2 Buttons (CSV & Single)
    ####################################
    with gr.Row():
        with gr.Column():
            model_in = gr.Textbox(
                label="Evaluated Model (WIP)",
                value="---- Feature not yet available ---------"
            )

    # Two buttons side by side:
    with gr.Row():
        submit_btn = gr.Button("Submit CSV")
        single_btn = gr.Button("Evaluate Single Q&A")

    ####################################
    # Define both callbacks
    ####################################
    def on_submit(csv_path, model_name):
        box, out_path = run_evaluation(csv_path, model_name)
        return box, out_path

    # Linking the two callbacks:
    # 1) CSV evaluation
    submit_btn.click(
        fn=on_submit,
        inputs=[csv_in, model_in],
        outputs=[score_html, csv_out]
    )
    # 2) Single Q&A evaluation
    single_btn.click(
        fn=on_single_evaluate,
        inputs=[single_q, single_a],
        outputs=score_html
    )

demo.launch()
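
# To try the app outside of the Space (illustrative; the package list is an assumption):
#   pip install gradio pandas requests
#   COHERE_API_KEY=... HF_TOKEN=... python app.py
# Gradio then prints a local URL (typically http://127.0.0.1:7860) to open in a browser.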