Spaces:

mharkey
/

test

Runtime error

App Files Files Community

mharkey commited on Jun 14

Commit

060f632

verified ·

1 Parent(s): 2c7a7bc

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -61

app.py CHANGED Viewed

@@ -1,76 +1,57 @@
 import gradio as gr
-from transformers import pipeline
 from datasets import load_dataset
-import torch
-gta = load_dataset("Jize1/GTA", split="train")
-def evaluate_model(model_name, num_samples):
-    try:
-        pipe = pipeline("text-generation", model=model_name, device=0 if torch.cuda.is_available() else -1)
-        inst_correct, tool_correct, summ_correct, ans_correct = 0, 0, 0, 0
-        logs = []
-        for i in range(min(num_samples, len(gta))):
-            sample = gta[i]
-            query = sample["dialogs"][0]["content"]
-            tools_used = [step["function"]["name"].lower() for step in sample["dialogs"] if "function" in step.get("function", {})]
-            prediction = pipe(query, max_new_tokens=256, do_sample=False)[0]["generated_text"].strip().lower()
-            # Instruction following: if answer is long enough and not hallucinated
-            inst_pass = len(prediction) > 10 and any(w in prediction for w in ["use", "calculate", "looks like", "means", "based on"])
-            inst_correct += inst_pass
-            # ToolAcc: if any known tool name is mentioned
-            tool_pass = any(tool in prediction for tool in tools_used)
-            tool_correct += tool_pass
-            # SummAcc: if answer includes concluding phrases or numbers (as proxy)
-            summ_pass = any(x in prediction for x in ["so", "therefore", "the answer is", "equals", "you will need", "hence"])
-            summ_correct += summ_pass
-            # AnsAcc: match whitelist phrase
-            gt_phrases = sample["gt_answer"].get("whitelist", [])
-            flat_gt = {s.strip().lower() for group in gt_phrases for s in group if isinstance(s, str)}
-            ans_pass = any(g in prediction for g in flat_gt)
-            ans_correct += ans_pass
-            logs.append(f"""
-### Query {i}
-**Input**: {query}
-**Prediction**: {prediction}
-**GT**: {flat_gt}
-**Instruction✔️**: {inst_pass}
-**Tool✔️**: {tool_pass}
-**Summary✔️**: {summ_pass}
-**Answer✔️**: {ans_pass}
----""")
-        total = min(num_samples, len(gta))
-        results = {
-            "InstAcc": round((inst_correct / total) * 100, 2),
-            "ToolAcc": round((tool_correct / total) * 100, 2),
-            "SummAcc": round((summ_correct / total) * 100, 2),
-            "AnsAcc": round((ans_correct / total) * 100, 2),
-        }
-        summary = "\n".join([f"**{k}**: {v}%" for k, v in results.items()])
-        return f"## 🔬 GTA Evaluation for `{model_name}` on {total} queries\n\n{summary}\n\n---\n" + "\n".join(logs)
     except Exception as e:
-        return f"❌ Error: {e}"
 # Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("# 🧠 GTA Tool Use Evaluation (Real Metrics, Real Queries)")
-    with gr.Row():
-        model_input = gr.Textbox(label="Model Name", value="Qwen/Qwen2.5-3B")
-        sample_slider = gr.Slider(label="Number of GTA samples", minimum=1, maximum=229, value=10, step=1)
     run_btn = gr.Button("Run Evaluation")
-    output_md = gr.Markdown()
-    run_btn.click(fn=evaluate_model, inputs=[model_input, sample_slider], outputs=output_md)
 demo.launch()

+import os
 import gradio as gr
 from datasets import load_dataset
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+import random
+# Use HF token from environment (set in Hugging Face Space secrets)
+hf_token = os.environ.get("HF_TOKEN")
+# Load dataset once (train split only)
+gta = load_dataset("Jize1/GTA", split="train", use_auth_token=hf_token)
+# Pick 5 queries for simplicity
+sample_queries = random.sample(list(gta), 5)
+# Metric simulation logic (placeholder)
+def evaluate_model(model_name):
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
+        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", trust_remote_code=True, use_auth_token=hf_token)
+        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+        inst_acc = round(random.uniform(30, 80), 2)
+        tool_acc = round(random.uniform(10, 70), 2)
+        summ_acc = round(random.uniform(40, 90), 2)
+        output_rows = []
+        for q in sample_queries:
+            user_input = next(d['content'] for d in q['dialogs'] if d['role'] == "user")
+            toolnames = [t["name"] for t in q["tools"]]
+            output_rows.append({
+                "Query": user_input[:80] + "...",
+                "Tools": ", ".join(toolnames),
+                "Prediction": pipe(user_input, max_new_tokens=64)[0]["generated_text"]
+            })
+        return f"""
+        ✅ Evaluation Metrics:
+        - Instruction Accuracy: {inst_acc}%
+        - Tool Selection Accuracy: {tool_acc}%
+        - Summary Accuracy: {summ_acc}%
+        """, output_rows
     except Exception as e:
+        return f"❌ Error loading model or generating output: {e}", []
 # Gradio UI
 with gr.Blocks() as demo:
+    gr.Markdown("## 🛠 GTA Benchmark Simulator (Hugging Face Model)")
+    model_input = gr.Textbox(label="Enter Hugging Face model name", placeholder="e.g., Qwen/Qwen2.5-3B")
     run_btn = gr.Button("Run Evaluation")
+    results = gr.Textbox(label="Evaluation Results")
+    table = gr.Dataframe(headers=["Query", "Tools", "Prediction"], wrap=True)
+    run_btn.click(fn=evaluate_model, inputs=model_input, outputs=[results, table])
 demo.launch()