Spaces:

mozilla-ai
/

surf-spot-finder

Running

Nathan Brake committed on Mar 24

Commit

ef766f7

unverified ·

1 Parent(s): ffb4e87

The test case no longer specifies which agent is involved (#30)

* The test case no longer specifies which agent is involved

* format

Files changed (3) hide show

src/surf_spot_finder/evaluation/evaluate.py CHANGED Viewed

@@ -115,7 +115,9 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
     logger.info("<green>=====================================</green>")
-def evaluate(test_case_path: str, telemetry_path: Optional[str] = None) -> None:
     """
     Evaluate agent performance using either a provided telemetry file or by running the agent.
@@ -123,7 +125,9 @@ def evaluate(test_case_path: str, telemetry_path: Optional[str] = None) -> None:
         telemetry_path: Optional path to an existing telemetry file. If not provided,
                         the agent will be run to generate one.
     """
-    test_case = TestCase.from_yaml(test_case_path)
     if telemetry_path is None:
         logger.info(

     logger.info("<green>=====================================</green>")
+def evaluate(
+    test_case_path: str, agent_config_path: str, telemetry_path: Optional[str] = None
+) -> None:
     """
     Evaluate agent performance using either a provided telemetry file or by running the agent.
         telemetry_path: Optional path to an existing telemetry file. If not provided,
                         the agent will be run to generate one.
     """
+    test_case = TestCase.from_yaml(
+        test_case_path=test_case_path, agent_config_path=agent_config_path
+    )
     if telemetry_path is None:
         logger.info(

src/surf_spot_finder/evaluation/test_case.py CHANGED Viewed

@@ -15,7 +15,7 @@ class InputModel(BaseModel):
 class AgentModel(BaseModel):
     model_id: str
-    api_key_var: str
     api_base: Optional[str] = None
     agent_type: str
     tools: Optional[List[str]] = None
@@ -38,10 +38,14 @@ class TestCase(BaseModel):
     final_answer_criteria: List[CheckpointCriteria] = Field(default_factory=list)
     @classmethod
-    def from_yaml(cls, case_path: str) -> "TestCase":
         """Load a test case from a YAML file and process it"""
-        with open(case_path, "r") as f:
             test_case_dict = yaml.safe_load(f)
         final_answer_criteria = []
         def add_gt_final_answer_criteria(ground_truth_list):

 class AgentModel(BaseModel):
     model_id: str
+    api_key_var: str = "OPENAI_API_KEY"
     api_base: Optional[str] = None
     agent_type: str
     tools: Optional[List[str]] = None
     final_answer_criteria: List[CheckpointCriteria] = Field(default_factory=list)
     @classmethod
+    def from_yaml(cls, test_case_path: str, agent_config_path: str) -> "TestCase":
         """Load a test case from a YAML file and process it"""
+        with open(test_case_path, "r") as f:
             test_case_dict = yaml.safe_load(f)
+        with open(agent_config_path, "r") as f:
+            agent_config_dict = yaml.safe_load(f)
+        test_case_dict["agent"] = agent_config_dict["agent"]
         final_answer_criteria = []
         def add_gt_final_answer_criteria(ground_truth_list):

src/surf_spot_finder/evaluation/test_cases/alpha.yaml CHANGED Viewed

@@ -7,21 +7,6 @@ input:
   date: "2025-03-27 22:00"
   max_driving_hours: 3
   json_tracer: true
-agent:
-  api_key_var: "OPENAI_API_KEY"
-  api_base: null
-  model_id: "openai/o1"
-  agent_type: "smolagents"
-  tools:
-  - "surf_spot_finder.tools.driving_hours_to_meters"
-  - "surf_spot_finder.tools.get_area_lat_lon"
-  - "surf_spot_finder.tools.get_surfing_spots"
-  - "surf_spot_finder.tools.get_wave_forecast"
-  - "surf_spot_finder.tools.get_wind_forecast"
-  - "surf_spot_finder.tools.search_web"
-  - "surf_spot_finder.tools.visit_webpage"
-  - "smolagents.PythonInterpreterTool"
-  - "smolagents.FinalAnswerTool"
 ground_truth:

   date: "2025-03-27 22:00"
   max_driving_hours: 3
   json_tracer: true
 ground_truth: