Spaces:

mharkey
/

test

Runtime error

App Files Files Community

mharkey committed on Jun 14

Commit

2c7a7bc

verified ·

1 Parent(s): 25b3bcb

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -23

app.py CHANGED Viewed

@@ -3,46 +3,74 @@ from transformers import pipeline
 from datasets import load_dataset
 import torch
-# Load GTA dataset
 gta = load_dataset("Jize1/GTA", split="train")
 def evaluate_model(model_name, num_samples):
     try:
         pipe = pipeline("text-generation", model=model_name, device=0 if torch.cuda.is_available() else -1)
-        correct = 0
-        total = 0
-        log = []
         for i in range(min(num_samples, len(gta))):
-            query = gta[i]["dialogs"][0]["content"]
-            gt_answers = gta[i]["gt_answer"].get("whitelist", [])
-            flat_gt = {ans.strip().lower() for group in gt_answers for ans in group if isinstance(ans, str)}
-            # Generate model output
-            out = pipe(query, max_new_tokens=128, do_sample=False)[0]["generated_text"].strip().lower()
-            # Match: exact substring match with any whitelist answer
-            matched = any(gt in out for gt in flat_gt)
-            log.append(f"### Query {i}\n**Input**: {query}\n**Prediction**: {out}\n**GT**: {flat_gt}\n**✔️ Correct**: {matched}\n")
-            correct += int(matched)
-            total += 1
-        acc = round((correct / total) * 100, 2)
-        summary = f"### 🔍 GTA Answer Accuracy (AnsAcc) for `{model_name}`: **{acc}%** on {total} queries\n\n---\n"
-        return summary + "\n".join(log)
     except Exception as e:
-        return f"❌ Evaluation failed: {e}"
 with gr.Blocks() as demo:
-    gr.Markdown("# 🧪 Real GTA Evaluation (Answer Accuracy Only)")
-    model_input = gr.Textbox(label="Enter Hugging Face Model Name", value="Qwen/Qwen2.5-3B")
-    sample_count = gr.Slider(label="Number of GTA samples to evaluate", minimum=1, maximum=229, value=10, step=1)
     output_md = gr.Markdown()
-    model_input.change(fn=evaluate_model, inputs=[model_input, sample_count], outputs=output_md)
-    sample_count.change(fn=evaluate_model, inputs=[model_input, sample_count], outputs=output_md)
 demo.launch()

 from datasets import load_dataset
 import torch
 # Load the "train" split of the GTA dataset once, at module import time;
 # evaluate_model reads its samples by index.
 gta = load_dataset("Jize1/GTA", split="train")
 # Cue phrases behind the heuristic InstAcc / SummAcc proxies.
 _INSTRUCTION_CUES = ("use", "calculate", "looks like", "means", "based on")
 _SUMMARY_CUES = ("so", "therefore", "the answer is", "equals", "you will need", "hence")


 def _mentions_any(text, phrases):
     """Return True if any of *phrases* occurs in *text* as whole words.

     A plain substring test lets short cues fire inside unrelated words
     ("so" in "also", "use" in "because"), so a phrase only counts when it
     is not flanked by word characters.
     """
     import re  # local import: the file's import section lies outside this block

     return any(
         re.search(rf"(?<!\w){re.escape(phrase)}(?!\w)", text) for phrase in phrases
     )


 def _reference_tools(dialogs):
     """Collect lower-cased tool names from dialog steps that carry a ``function`` entry."""
     names = []
     for step in dialogs:
         # Tolerate a missing or None "function" field instead of raising.
         call = step.get("function") or {}
         # Guard the key that is actually read ("name"); the previous filter
         # tested for a nested "function" key and then indexed "name".
         if isinstance(call, dict) and call.get("name"):
             names.append(str(call["name"]).lower())
     # NOTE(review): if the dataset nests tool calls under a different key
     # (e.g. a list of calls per step), this list stays empty and ToolAcc
     # reads 0% - confirm the schema against the dataset card.
     return names


 def _whitelist_answers(gt_answer):
     """Flatten the ``whitelist`` groups of *gt_answer* into a set of normalized phrases."""
     groups = (gt_answer or {}).get("whitelist") or []
     phrases = set()
     for group in groups:
         for item in group or []:
             if not isinstance(item, str):
                 continue
             cleaned = item.strip().lower()
             # "" is a substring of every string and would count as a match.
             if cleaned:
                 phrases.add(cleaned)
     return phrases


 def evaluate_model(model_name, num_samples):
     """Run *model_name* on the first *num_samples* GTA queries and report four accuracy proxies.

     Args:
         model_name: Model identifier passed to the ``text-generation`` pipeline.
         num_samples: Requested sample count; coerced to ``int`` and capped at
             ``len(gta)``.

     Returns:
         A Markdown string with the InstAcc, ToolAcc, SummAcc and AnsAcc
         percentages followed by a per-query log. Any failure is returned as
         a one-line ``❌ Error: ...`` message instead of being raised.
     """
     try:
         # Coerce defensively: range() rejects floats, and a caller such as a
         # slider widget may not hand over a plain int.
         total = min(int(num_samples), len(gta))
         if total <= 0:
             # Nothing to score; also avoids dividing by zero below.
             return "❌ Error: no samples to evaluate"

         pipe = pipeline("text-generation", model=model_name, device=0 if torch.cuda.is_available() else -1)
         inst_correct = tool_correct = summ_correct = ans_correct = 0
         logs = []
         for i in range(total):
             sample = gta[i]
             query = sample["dialogs"][0]["content"]
             tools_used = _reference_tools(sample["dialogs"])

             # return_full_text=False: by default the pipeline echoes the prompt
             # inside "generated_text", which would let the query itself satisfy
             # the keyword and whitelist checks below.
             generated = pipe(query, max_new_tokens=256, do_sample=False, return_full_text=False)
             prediction = generated[0]["generated_text"].strip().lower()

             # InstAcc proxy: more than 10 characters and at least one instruction-style cue.
             inst_pass = len(prediction) > 10 and _mentions_any(prediction, _INSTRUCTION_CUES)
             inst_correct += inst_pass

             # ToolAcc proxy: the output names any reference tool.
             tool_pass = any(tool in prediction for tool in tools_used)
             tool_correct += tool_pass

             # SummAcc proxy: the output contains a concluding phrase.
             summ_pass = _mentions_any(prediction, _SUMMARY_CUES)
             summ_correct += summ_pass

             # AnsAcc: any whitelist phrase occurs in the output.
             flat_gt = _whitelist_answers(sample["gt_answer"])
             ans_pass = any(g in prediction for g in flat_gt)
             ans_correct += ans_pass

             # The leading "" keeps the blank line that opens each log entry.
             logs.append(
                 "\n".join(
                     [
                         "",
                         f"### Query {i}",
                         f"**Input**: {query}",
                         f"**Prediction**: {prediction}",
                         f"**GT**: {flat_gt}",
                         f"**Instruction✔️**: {inst_pass}",
                         f"**Tool✔️**: {tool_pass}",
                         f"**Summary✔️**: {summ_pass}",
                         f"**Answer✔️**: {ans_pass}",
                         "---",
                     ]
                 )
             )

         results = {
             "InstAcc": round((inst_correct / total) * 100, 2),
             "ToolAcc": round((tool_correct / total) * 100, 2),
             "SummAcc": round((summ_correct / total) * 100, 2),
             "AnsAcc": round((ans_correct / total) * 100, 2),
         }
         summary = "\n".join(f"**{k}**: {v}%" for k, v in results.items())
         return f"## 🔬 GTA Evaluation for `{model_name}` on {total} queries\n\n{summary}\n\n---\n" + "\n".join(logs)
     except Exception as e:
         # UI boundary: show the failure in the output pane rather than raising
         # out of the event handler.
         return f"❌ Error: {e}"
+# Gradio UI: a title, a row holding the model-name box and the sample-count slider, a run button, and a Markdown pane for the report.
 with gr.Blocks() as demo:
+    gr.Markdown("# 🧠 GTA Tool Use Evaluation (Real Metrics, Real Queries)")
+    with gr.Row():
+        model_input = gr.Textbox(label="Model Name", value="Qwen/Qwen2.5-3B")
+        sample_slider = gr.Slider(label="Number of GTA samples", minimum=1, maximum=229, value=10, step=1)  # NOTE(review): 229 is presumably the size of the GTA split - confirm against len(gta)
+    run_btn = gr.Button("Run Evaluation")
     output_md = gr.Markdown()
+    run_btn.click(fn=evaluate_model, inputs=[model_input, sample_slider], outputs=output_md)  # a click passes both inputs to evaluate_model and renders the string it returns in output_md
 demo.launch()