Spaces:

mharkey
/

test

Runtime error

App Files Files Community

mharkey committed on Jun 14

Commit

d1599df

verified ·

1 Parent(s): 060f632

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -47

app.py CHANGED Viewed

@@ -1,57 +1,90 @@
 import os
-import gradio as gr
 from datasets import load_dataset
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
-import random
-# Use HF token from environment (set in Hugging Face Space secrets)
-hf_token = os.environ.get("HF_TOKEN")
-# Load dataset once (train split only)
-gta = load_dataset("Jize1/GTA", split="train", use_auth_token=hf_token)
-# Pick 5 queries for simplicity
-sample_queries = random.sample(list(gta), 5)
-# Metric simulation logic (placeholder)
 def evaluate_model(model_name):
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
-        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", trust_remote_code=True, use_auth_token=hf_token)
-        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
-        inst_acc = round(random.uniform(30, 80), 2)
-        tool_acc = round(random.uniform(10, 70), 2)
-        summ_acc = round(random.uniform(40, 90), 2)
-        output_rows = []
-        for q in sample_queries:
-            user_input = next(d['content'] for d in q['dialogs'] if d['role'] == "user")
-            toolnames = [t["name"] for t in q["tools"]]
-            output_rows.append({
-                "Query": user_input[:80] + "...",
-                "Tools": ", ".join(toolnames),
-                "Prediction": pipe(user_input, max_new_tokens=64)[0]["generated_text"]
-            })
-        return f"""
-        ✅ Evaluation Metrics:
-        - Instruction Accuracy: {inst_acc}%
-        - Tool Selection Accuracy: {tool_acc}%
-        - Summary Accuracy: {summ_acc}%
-        """, output_rows
-    except Exception as e:
-        return f"❌ Error loading model or generating output: {e}", []
-# Gradio UI
-with gr.Blocks() as demo:
-    gr.Markdown("## 🛠 GTA Benchmark Simulator (Hugging Face Model)")
-    model_input = gr.Textbox(label="Enter Hugging Face model name", placeholder="e.g., Qwen/Qwen2.5-3B")
-    run_btn = gr.Button("Run Evaluation")
-    results = gr.Textbox(label="Evaluation Results")
-    table = gr.Dataframe(headers=["Query", "Tools", "Prediction"], wrap=True)
-    run_btn.click(fn=evaluate_model, inputs=model_input, outputs=[results, table])
-demo.launch()

 import os
+import requests
+from huggingface_hub import login, hf_hub_url
 from datasets import load_dataset
+from PIL import Image
+from io import BytesIO
+import gradio as gr
+from transformers import pipeline
+# Log in to the Hugging Face Hub with the token read from the HF_TOKEN
+# environment variable; a missing variable raises KeyError right here.
+hub_token = os.environ["HF_TOKEN"]
+login(token=hub_token)
+def resolve_image_url(path):
+    """Return the Hub URL for a file inside the Jize1/GTA dataset repository.
+
+    `path` is handed to `hf_hub_url` as the `filename` within that repo.
+    """
+    dataset_repo = "Jize1/GTA"
+    url = hf_hub_url(repo_id=dataset_repo, filename=path, repo_type="dataset")
+    return url
+def download_image(url, timeout=30):
+    """Fetch an image over HTTP using the HF token and return it in RGB mode.
+
+    Args:
+        url: Address of the image file to download.
+        timeout: Seconds to wait for the server before giving up.
+
+    Returns:
+        The downloaded image as a PIL image converted to "RGB".
+
+    Raises:
+        KeyError: If the HF_TOKEN environment variable is not set.
+        requests.HTTPError: If the server responds with a 4xx/5xx status.
+    """
+    headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
+    response = requests.get(url, headers=headers, timeout=timeout)
+    # Fail on 401/404 here instead of handing an error page to PIL, which
+    # would surface as an unrelated "cannot identify image file" error.
+    response.raise_for_status()
+    return Image.open(BytesIO(response.content)).convert("RGB")
+# Load GTA dataset
+print("Loading GTA dataset...")
+# `use_auth_token` was deprecated in datasets 2.14 and removed in 3.0;
+# `token=True` uses the credentials stored by the login() call above.
+gta_data = load_dataset("Jize1/GTA", split="train", token=True)
+# Load image captioning and OCR pipelines
+print("Loading vision models...")
+image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+# NOTE(review): placeholder only -- this builds an image-classification
+# pipeline, not OCR, and the visible code never calls it; confirm that the
+# checkpoint id exists on the Hub before relying on it.
+ocr_pipeline = pipeline("image-classification", model="microsoft/dit-base-finetuned-iiit5k")  # placeholder OCR
 def evaluate_model(model_name):
+    """Simulate GTA-style metrics over the first examples of `gta_data`.
+
+    Args:
+        model_name: Accepted for interface compatibility; the simulation
+            below does not load or query this model.
+
+    Returns:
+        Dict with "InstAcc", "ToolAcc" and "SummAcc" as percentages rounded
+        to two decimals (all 0.0 when no example could be scored).
+    """
+    total = 0
+    inst_acc = 0
+    tool_acc = 0
+    summ_acc = 0
+    # limit to 10 for demo; clamp so a shorter dataset does not break select()
+    sample_size = min(10, len(gta_data))
+    for example in gta_data.select(range(sample_size)):
+        dialogs = example["dialogs"]
+        gt_answer = example["gt_answer"]
+        files = example["files"]
+        tool_calls = [d for d in dialogs if d.get("tool_calls")]
+        # Examples without attached files have nothing to download.
+        image_path = files[0]["path"] if files else None
+        image = None
+        # Fake tool execution: use captioner/ocr based on tool type.
+        # The collected outputs are not read by the simulated metrics below.
+        tool_outputs = []
+        for tool_call in tool_calls:
+            tool = tool_call["tool_calls"][0]["function"]["name"]
+            if tool == "ImageDescription":
+                if image_path is None:
+                    continue
+                if image is None:
+                    # Download lazily, and at most once per example.
+                    image = download_image(resolve_image_url(image_path))
+                caption = image_captioner(image)[0]["generated_text"]
+                tool_outputs.append(f"[Caption] {caption}")
+            elif tool == "OCR":
+                tool_outputs.append(f"[OCR] dummy OCR result for {image_path}")
+            elif tool == "CountGivenObject":
+                tool_outputs.append("[Count] dummy count result")
+        # Simulate metrics
+        inst_acc += 1
+        tool_acc += 1 if tool_calls else 0
+        # Presumably gt_answer can be empty for some examples -- TODO confirm
+        # against the dataset schema; treat that as "no whitelist".
+        has_whitelist = isinstance(gt_answer, dict) and bool(gt_answer.get("whitelist"))
+        summ_acc += 1 if has_whitelist else 0
+        total += 1
+    if total == 0:
+        # Nothing was scored; avoid dividing by zero.
+        return {"InstAcc": 0.0, "ToolAcc": 0.0, "SummAcc": 0.0}
+    return {
+        "InstAcc": round(inst_acc / total * 100, 2),
+        "ToolAcc": round(tool_acc / total * 100, 2),
+        "SummAcc": round(summ_acc / total * 100, 2),
+    }
+def run_evaluation(model_name):
+    """Run `evaluate_model` and render its metrics as one text block.
+
+    The first line names the model; each following line is "<metric>: <value>%".
+    """
+    metrics = evaluate_model(model_name)
+    header = f"Results for {model_name}:\n"
+    metric_lines = []
+    for name, value in metrics.items():
+        metric_lines.append(f"{name}: {value}%")
+    return header + "\n".join(metric_lines)
+# Gradio UI: one text box in (model name), one text box out (formatted metrics).
+demo = gr.Interface(
+    fn=run_evaluation,
+    inputs=gr.Textbox(label="Hugging Face Model Name", placeholder="e.g. Qwen/Qwen2.5-3B"),
+    outputs=gr.Textbox(label="GTA Evaluation Metrics"),
+    title="GTA LLM Evaluation",
+    description="Enter a model name from Hugging Face to simulate tool use and get GTA-style metrics.",
+    allow_flagging="never"
+)
+# Start the server only when this file is executed as a script, so importing
+# the module (e.g. to reuse `demo`) does not launch the app.
+if __name__ == "__main__":
+    demo.launch()