GAIA_Agent

Sleeping

App Files Files Community

ArturoNereu committed on Jun 5

Commit

5800fed

1 Parent(s): a168d8d

answer caching implemented via hf datasets

Browse files

Files changed (2) hide show

app.py +129 -36
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -4,6 +4,8 @@ import requests
 import inspect
 import pandas as pd
 import json
 from gaia_agent import GaiaAgent
 # (Keep Constants as is)
@@ -13,32 +15,108 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # To check if we are running locally
 running_on_hf = bool(os.getenv("SPACE_ID") or os.getenv("SPACE_HOST"))
-# Cache file for storing correct answers
-CACHE_FILE = "answers_cache.json"
 def load_answers_cache():
-    """Load cached answers from file"""
     try:
-        if os.path.exists(CACHE_FILE):
-            with open(CACHE_FILE, 'r') as f:
-                return json.load(f)
     except Exception as e:
-        print(f"Error loading cache: {e}")
-    return {}
 def save_answers_cache(cache):
-    """Save cached answers to file"""
     try:
-        with open(CACHE_FILE, 'w') as f:
-            json.dump(cache, f, indent=2)
         return True
     except Exception as e:
-        print(f"Error saving cache: {e}")
         return False
 def run_and_cache_answers(profile: gr.OAuthProfile | None):
     """
-    Runs agent on questions and caches correct answers for later submission
     """
     if not running_on_hf:
         return "Caching only available on HuggingFace Spaces", None
@@ -64,15 +142,14 @@ def run_and_cache_answers(profile: gr.OAuthProfile | None):
     except Exception as e:
         return f"Error fetching questions: {e}", None
-    # 3. Load existing cache
     cache = load_answers_cache()
-    # 4. Run agent on solvable questions
     results_log = []
-    solvable_indices = [0, 2, 4]  # Focus on proven questions
-    new_answers = 0
-    for idx in solvable_indices:
         if idx >= len(questions_data):
             continue
@@ -83,13 +160,13 @@ def run_and_cache_answers(profile: gr.OAuthProfile | None):
         if not task_id or question_text is None:
             continue
-        # Skip if already cached
         if task_id in cache:
             results_log.append({
                 "Task ID": task_id,
                 "Question": question_text[:100] + "...",
                 "Answer": cache[task_id],
-                "Status": "CACHED"
             })
             continue
@@ -97,15 +174,17 @@ def run_and_cache_answers(profile: gr.OAuthProfile | None):
             print(f"Processing question {idx+1}: {question_text[:100]}...")
             submitted_answer = agent(question_text)
-            # Cache the answer (we'll validate it later)
-            cache[task_id] = submitted_answer
-            new_answers += 1
             results_log.append({
                 "Task ID": task_id,
                 "Question": question_text[:100] + "...",
                 "Answer": submitted_answer,
-                "Status": "NEW"
             })
         except Exception as e:
@@ -113,17 +192,34 @@ def run_and_cache_answers(profile: gr.OAuthProfile | None):
                 "Task ID": task_id,
                 "Question": question_text[:100] + "...",
                 "Answer": f"ERROR: {e}",
-                "Status": "FAILED"
             })
-    # 5. Save updated cache
-    if new_answers > 0:
-        if save_answers_cache(cache):
-            status = f"✅ Processed {len(solvable_indices)} questions. Added {new_answers} new answers to cache."
         else:
-            status = f"⚠️ Generated {new_answers} answers but failed to save cache."
     else:
-        status = "All target questions already cached."
     return status, pd.DataFrame(results_log)
@@ -241,11 +337,8 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
     results_log = []
     answers_payload = []
-    # Focus on the 3 questions we know work correctly
-    solvable_indices = [0, 2, 4]  # Mercedes Sosa, Reversed text, Dinosaur Featured Article
-    print(f"Running agent on {len(solvable_indices)} solvable questions...")
-    for idx in solvable_indices:
         if idx >= len(questions_data):
             continue
         item = questions_data[idx]

 import inspect
 import pandas as pd
 import json
+from datasets import Dataset
+from huggingface_hub import HfApi
 from gaia_agent import GaiaAgent
 # (Keep Constants as is)
 # To check if we are running locally
 running_on_hf = bool(os.getenv("SPACE_ID") or os.getenv("SPACE_HOST"))
+# Questions the agent can reliably solve (no images, audio, video)
+SOLVABLE_INDICES = [0, 2, 4]  # Mercedes Sosa, Reversed text, Dinosaur Featured Article
def get_dataset_name():
    """Return the name of the Hub dataset used to cache answers for this Space.

    The name is derived from the ``SPACE_ID`` environment variable
    (``"owner/space"``) so that every Space gets its own cache dataset. The
    slash is replaced by a single hyphen because the Hub's repo-id validation
    rejects names containing ``--``.

    Returns:
        ``"<owner>-<space>-gaia-answers"`` when ``SPACE_ID`` is set, otherwise
        the fixed fallback ``"gaia-answers-cache"``.
    """
    space_id = os.getenv("SPACE_ID")
    if not space_id:
        return "gaia-answers-cache"
    return f"{space_id.replace('/', '-')}-gaia-answers"
 def load_answers_cache():
+    """Load cached answers from HuggingFace Dataset"""
+    if not running_on_hf:
+        return {}
     try:
+        dataset_name = get_dataset_name()
+        dataset = Dataset.load_from_hub(dataset_name, split="train")
+        # Convert back to dictionary
+        cache = {}
+        if len(dataset) > 0:
+            for item in dataset:
+                cache[item["task_id"]] = item["answer"]
+        print(f"✅ Loaded {len(cache)} cached answers from dataset: {dataset_name}")
+        return cache
     except Exception as e:
+        print(f"📝 No existing cache found (will create new): {e}")
+        return {}
 def save_answers_cache(cache):
+    """Save cached answers to HuggingFace Dataset"""
+    if not running_on_hf or not cache:
+        return False
     try:
+        dataset_name = get_dataset_name()
+        # Convert dictionary to dataset format
+        data = {
+            "task_id": list(cache.keys()),
+            "answer": list(cache.values())
+        }
+        dataset = Dataset.from_dict(data)
+        dataset.push_to_hub(dataset_name, private=True)
+        print(f"💾 Saved {len(cache)} answers to private dataset: {dataset_name}")
         return True
     except Exception as e:
+        print(f"Error saving cache to dataset: {e}")
         return False
def check_answers_correctness(answers_payload, questions_data):
    """Submit answers to the scoring API and return the ones reported correct.

    Args:
        answers_payload: list of dicts, each with a ``"task_id"`` and a
            ``"submitted_answer"`` key.
        questions_data: not used by this function; kept so existing callers
            keep working.

    Returns:
        A dict mapping task id to the submitted answer for every entry of the
        response's ``"detailed_results"`` whose ``"correct"`` flag is truthy.
        Empty when not running on a Space, when there is nothing to submit,
        when the request fails, or when the response carries no
        ``"detailed_results"``.
    """
    # Nothing to validate: avoid posting an empty submission to the scorer.
    if not running_on_hf or not answers_payload:
        return {}

    try:
        # Prepare a minimal submission for validation.
        space_id = os.getenv("SPACE_ID")
        agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

        submission_data = {
            "username": "validation_check",
            "agent_code": agent_code,
            "answers": answers_payload
        }

        submit_url = f"{DEFAULT_API_URL}/submit"

        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()

        # Index the payload once instead of rescanning it for every result;
        # setdefault keeps the first answer for a duplicated task id.
        submitted_by_task = {}
        for answer in answers_payload:
            submitted_by_task.setdefault(answer["task_id"], answer["submitted_answer"])

        # NOTE(review): assumes the API returns per-task "detailed_results"
        # entries with "task_id" and "correct" -- confirm against the scoring
        # service; if the field is absent nothing is ever reported correct.
        correct_answers = {}
        if "detailed_results" in result_data:
            for result in result_data["detailed_results"]:
                task_id = result.get("task_id")
                if result.get("correct", False) and task_id in submitted_by_task:
                    correct_answers[task_id] = submitted_by_task[task_id]

        return correct_answers

    except Exception as e:
        print(f"Error checking answer correctness: {e}")
        return {}
 def run_and_cache_answers(profile: gr.OAuthProfile | None):
     """
+    Runs agent on questions, validates answers, and caches only correct ones
     """
     if not running_on_hf:
         return "Caching only available on HuggingFace Spaces", None
     except Exception as e:
         return f"Error fetching questions: {e}", None
+    # 3. Load existing cache (verified correct answers)
     cache = load_answers_cache()
+    # 4. Run agent only on unsolved questions
     results_log = []
+    new_answers_payload = []
+    for idx in SOLVABLE_INDICES:
         if idx >= len(questions_data):
             continue
         if not task_id or question_text is None:
             continue
+        # Skip if already have correct answer cached
         if task_id in cache:
             results_log.append({
                 "Task ID": task_id,
                 "Question": question_text[:100] + "...",
                 "Answer": cache[task_id],
+                "Status": "✅ CORRECT (CACHED)"
             })
             continue
             print(f"Processing question {idx+1}: {question_text[:100]}...")
             submitted_answer = agent(question_text)
+            # Add to payload for validation
+            new_answers_payload.append({
+                "task_id": task_id,
+                "submitted_answer": submitted_answer
+            })
             results_log.append({
                 "Task ID": task_id,
                 "Question": question_text[:100] + "...",
                 "Answer": submitted_answer,
+                "Status": "🔄 VALIDATING..."
             })
         except Exception as e:
                 "Task ID": task_id,
                 "Question": question_text[:100] + "...",
                 "Answer": f"ERROR: {e}",
+                "Status": "❌ FAILED"
             })
+    # 5. Validate new answers and cache only correct ones
+    if new_answers_payload:
+        print(f"🔍 Validating {len(new_answers_payload)} new answers...")
+        correct_answers = check_answers_correctness(new_answers_payload, questions_data)
+        # Update cache with only correct answers
+        cache.update(correct_answers)
+        # Update results log with validation results
+        for log_entry in results_log:
+            if log_entry["Status"] == "🔄 VALIDATING...":
+                task_id = log_entry["Task ID"]
+                if task_id in correct_answers:
+                    log_entry["Status"] = "✅ CORRECT (NEW)"
+                else:
+                    log_entry["Status"] = "❌ INCORRECT"
+        # Save updated cache
+        if correct_answers:
+            save_answers_cache(cache)
+            status = f"🎉 Validated {len(new_answers_payload)} answers. Cached {len(correct_answers)} correct answers!"
         else:
+            status = f"😔 Validated {len(new_answers_payload)} answers. None were correct this time."
     else:
+        status = "All target questions already have correct answers cached!"
     return status, pd.DataFrame(results_log)
     results_log = []
     answers_payload = []
+    print(f"Running agent on {len(SOLVABLE_INDICES)} solvable questions...")
+    for idx in SOLVABLE_INDICES:
         if idx >= len(questions_data):
             continue
         item = questions_data[idx]

requirements.txt CHANGED Viewed

@@ -3,4 +3,6 @@ requests
 smolagents
 duckduckgo-search
 openai
-wikipedia

 smolagents
 duckduckgo-search
 openai
+wikipedia
+datasets
+huggingface_hub