Nathan Brake committed
Commit 7c69831 · unverified · 1 Parent(s): 9d27e7a

Uniform Trace extraction and inferring agent_type (#27)


* Need to re-tool the eval files but at least now evidence extraction is possible

* Linting

* linting

* fix unit tests

* lint

* patch langchain and openai output format

* lint

* fix pyproj

pyproject.toml CHANGED
@@ -20,6 +20,7 @@ dependencies = [
 langchain = [
     "langchain",
     "langgraph",
+    "langchain-openai>=0.3.9",
     "openinference-instrumentation-langchain"
 ]
 smolagents = [
@@ -29,7 +30,7 @@ smolagents = [
 
 openai = [
     "openai-agents",
-    "openinference-instrumentation-openai-agents"
+    "openinference-instrumentation-openai-agents>=0.1.2"
 ]
 
 mcp = [
src/surf_spot_finder/agents/__init__.py CHANGED
@@ -1,15 +1,30 @@
+from enum import Enum
 from .langchain import run_lanchain_agent
 from .openai import run_openai_agent, run_openai_multi_agent
 from .smolagents import run_smolagent
 
+
+# Define the available agent type enums
+class AgentType(str, Enum):
+    LANGCHAIN = "langchain"
+    OPENAI = "openai"
+    OPENAI_MULTI_AGENT = "openai_multi_agent"
+    SMOLAGENTS = "smolagents"
+
+
 RUNNERS = {
-    "langchain": run_lanchain_agent,
-    "openai": run_openai_agent,
-    "smolagents": run_smolagent,
-    "openai_multi_agent": run_openai_multi_agent,
+    AgentType.LANGCHAIN: run_lanchain_agent,
+    AgentType.OPENAI: run_openai_agent,
+    AgentType.SMOLAGENTS: run_smolagent,
+    AgentType.OPENAI_MULTI_AGENT: run_openai_multi_agent,
 }
 
 
-def validate_agent_type(value) -> str:
-    if value not in RUNNERS:
-        raise ValueError(f"agent_type must be one of {RUNNERS.keys()}")
+def validate_agent_type(value: str) -> str:
+    try:
+        agent_type = AgentType(value)
+        if agent_type not in RUNNERS:
+            raise ValueError(f"agent_type {value} is valid but has no runner")
+        return value
+    except ValueError:
+        raise ValueError(f"agent_type must be one of {[e.value for e in AgentType]}")
src/surf_spot_finder/agents/langchain.py CHANGED
@@ -14,6 +14,8 @@ try:
 except ImportError:
     langchain_available = False
 
+DEFAULT_RECURSION_LIMIT = 50
+
 
 @logger.catch(reraise=True)
 def run_lanchain_agent(
@@ -52,13 +54,14 @@ def run_lanchain_agent(
 
     model = init_chat_model(model_id)
     agent = create_react_agent(
-        model=model,
-        tools=imported_tools,
-        checkpointer=MemorySaver(),
+        model=model, tools=imported_tools, checkpointer=MemorySaver()
     )
     for step in agent.stream(
         {"messages": [HumanMessage(content=prompt)]},
-        {"configurable": {"thread_id": "abc123"}},
+        {
+            "configurable": {"thread_id": "abc123"},
+            "recursion_limit": DEFAULT_RECURSION_LIMIT,
+        },
         stream_mode="values",
     ):
         step["messages"][-1].pretty_print()
src/surf_spot_finder/agents/openai.py CHANGED
@@ -24,6 +24,8 @@ try:
 except ImportError:
     agents_available = None
 
+DEFAULT_MAX_TURNS = 20
+
 
 @logger.catch(reraise=True)
 def run_openai_agent(
@@ -34,6 +36,7 @@ def run_openai_agent(
     api_key_var: Optional[str] = None,
     api_base: Optional[str] = None,
     tools: Optional[list[str]] = None,
+    max_turns: Optional[int] = DEFAULT_MAX_TURNS,
 ) -> RunResult:
     """Runs an OpenAI agent with the given prompt and configuration.
 
@@ -94,7 +97,7 @@ def run_openai_agent(
         name=name,
         tools=imported_tools,
     )
-    result = Runner.run_sync(agent, prompt)
+    result = Runner.run_sync(starting_agent=agent, input=prompt, max_turns=max_turns)
     logger.info(result.final_output)
     return result
 
@@ -105,6 +108,7 @@ def run_openai_multi_agent(
     prompt: str,
     name: str = "surf-spot-finder",
     instructions: Optional[str] = MULTI_AGENT_SYSTEM_PROMPT,
+    max_turns: Optional[int] = DEFAULT_MAX_TURNS,
     **kwargs,
 ) -> RunResult:
     """Runs multiple OpenAI agents orchestrated by a main agent.
@@ -176,6 +180,8 @@ def run_openai_multi_agent(
         ],
     )
 
-    result = Runner.run_sync(main_agent, prompt)
+    result = Runner.run_sync(
+        starting_agent=main_agent, input=prompt, max_turns=max_turns
+    )
     logger.info(result.final_output)
     return result
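
The switch to keyword arguments makes the `Runner.run_sync` calls self-documenting, and `max_turns` bounds the agent loop rather than letting it run indefinitely. In the openai-agents SDK, exceeding the cap raises an exception, so a caller that wants a soft failure has to catch it; a hedged sketch (the agent definition is illustrative, and the `MaxTurnsExceeded` import path is my reading of the SDK, treat it as an assumption):

```python
# Sketch: bounding the agent loop with max_turns, assuming the openai-agents
# package. MaxTurnsExceeded is the exception the SDK raises when the cap is
# hit (exact import path assumed).
from agents import Agent, Runner
from agents.exceptions import MaxTurnsExceeded

agent = Agent(name="surf-spot-finder", instructions="Answer briefly.")

try:
    result = Runner.run_sync(
        starting_agent=agent, input="Best surf near Vigo?", max_turns=20
    )
    print(result.final_output)
except MaxTurnsExceeded:
    print("Agent did not converge within 20 turns")
```
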
src/surf_spot_finder/cli.py CHANGED
@@ -26,7 +26,7 @@ def find_surf_spot(
     api_base: Optional[str] = None,
     tools: Optional[list[dict]] = None,
     from_config: Optional[str] = None,
-):
+) -> str:
     """Find the best surf spot based on the given criteria.
 
     Args:
@@ -71,8 +71,10 @@ def find_surf_spot(
     )
 
     logger.info("Setting up tracing")
-    tracer_provider, _ = get_tracer_provider(
-        project_name="surf-spot-finder", json_tracer=config.json_tracer
+    tracer_provider, tracing_path = get_tracer_provider(
+        project_name="surf-spot-finder",
+        json_tracer=config.json_tracer,
+        agent_type=config.agent_type,
     )
     setup_tracing(tracer_provider, config.agent_type)
 
@@ -88,6 +90,7 @@ def find_surf_spot(
         api_key_var=config.api_key_var,
         tools=config.tools,
     )
+    return tracing_path
 
 
 def main():
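
Returning `tracing_path` is what lets the evaluation harness below reuse the CLI entry point instead of re-implementing the agent run: the caller runs the agent and immediately knows where the telemetry JSON landed. A hedged sketch of the new call contract (argument values illustrative; remaining optional kwargs omitted):

```python
# find_surf_spot() runs the configured agent and returns the path of the
# JSON telemetry file written by the tracer (names from this PR).
from surf_spot_finder.cli import find_surf_spot

telemetry_path = find_surf_spot(
    location="Vigo",
    date="2025-03-27 22:00",
    max_driving_hours=3,
    agent_type="langchain",
    model_id="o1",
    api_key_var="OPENAI_API_KEY",
    json_tracer=True,
)
print(f"Telemetry written to {telemetry_path}")
```
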
src/surf_spot_finder/evaluation/evaluate.py CHANGED
@@ -4,17 +4,17 @@ from textwrap import dedent
 from typing import Any, Dict, List, Optional
 from loguru import logger
 from fire import Fire
-from surf_spot_finder.agents.smolagents import run_smolagent
+from surf_spot_finder.cli import find_surf_spot
 from surf_spot_finder.config import (
-    DEFAULT_PROMPT,
     Config,
 )
-from surf_spot_finder.tracing import get_tracer_provider, setup_tracing
+from surf_spot_finder.prompts.shared import INPUT_PROMPT
 from surf_spot_finder.evaluation.utils import (
-    extract_hypothesis_answer,
+    determine_agent_type,
     verify_checkpoints,
     verify_hypothesis_answer,
 )
+from surf_spot_finder.evaluation.telemetry_utils import extract_hypothesis_answer
 from surf_spot_finder.evaluation.test_case import TestCase
 
 logger.remove()
@@ -31,31 +31,23 @@ def run_agent(test_case: TestCase) -> str:
         max_driving_hours=input_data.max_driving_hours,
         model_id=input_data.model_id,
         api_key_var=input_data.api_key_var,
-        prompt=DEFAULT_PROMPT,
+        prompt=INPUT_PROMPT,
         json_tracer=input_data.json_tracer,
         api_base=input_data.api_base,
         agent_type=input_data.agent_type,
+        tools=input_data.tools,
     )
-    # project_name is a name + uuid
-    project_name = "surf-spot-finder"
-
-    logger.info("Setting up tracing")
-    tracer_provider, telemetry_path = get_tracer_provider(
-        project_name=project_name, json_tracer=config.json_tracer
-    )
-    setup_tracing(tracer_provider, agent_type=config.agent_type)
-    logger.info("Running agent")
-    run_smolagent(
+    return find_surf_spot(
+        location=config.location,
+        date=config.date,
+        max_driving_hours=config.max_driving_hours,
+        agent_type=config.agent_type,
         model_id=config.model_id,
        api_key_var=config.api_key_var,
+        json_tracer=config.json_tracer,
         api_base=config.api_base,
-        prompt=config.prompt.format(
-            LOCATION=config.location,
-            MAX_DRIVING_HOURS=config.max_driving_hours,
-            DATE=config.date,
-        ),
+        tools=config.tools,
     )
-    return telemetry_path
 
 
 def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
@@ -64,8 +56,12 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
         telemetry: List[Dict[str, Any]] = json.loads(f.read())
     logger.info(f"Telemetry loaded from {telemetry_path}")
 
+    agent_type = determine_agent_type(telemetry)
+
     # Extract the final answer from the telemetry
-    hypothesis_answer = extract_hypothesis_answer(telemetry)
+    hypothesis_answer = extract_hypothesis_answer(
+        trace=telemetry, agent_type=agent_type
+    )
     logger.info(
         f"""<yellow>Hypothesis Final answer extracted: {hypothesis_answer}</yellow>"""
     )
@@ -75,6 +71,7 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
         telemetry=telemetry,
         checkpoints=test_case.checkpoints,
         model=llm_judge,
+        agent_type=agent_type,
     )
 
     hypothesis_answer_results = verify_hypothesis_answer(
@@ -110,12 +107,8 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
         logger.error(message)
     else:
         logger.info("<green>All checkpoints passed!</green>")
-    logger.info(
-        f"<green>Passed checkpoints: {len(passed_checks)}/{len(verification_results)}</green>"
-    )
-    logger.info(
-        f"<red>Failed checkpoints: {len(failed_checks)}/{len(verification_results)}</red>"
-    )
+    logger.info(f"<green>Passed checkpoints: {len(passed_checks)}</green>")
+    logger.info(f"<red>Failed checkpoints: {len(failed_checks)}</red>")
     logger.info("<green>=====================================</green>")
     logger.info(f"<green>Score: {won_points}/{won_points + missed_points}</green>")
     logger.info("<green>=====================================</green>")
src/surf_spot_finder/evaluation/telemetry_utils.py ADDED
@@ -0,0 +1,301 @@
+from typing import Any, Dict, List
+import json
+from langchain_core.messages import BaseMessage
+import re
+
+from surf_spot_finder.agents import AgentType
+
+
+def extract_hypothesis_answer(
+    trace: List[Dict[str, Any]], agent_type: AgentType
+) -> str:
+    """Extract the hypothesis agent final answer from the trace"""
+    for span in reversed(trace):
+        if agent_type == AgentType.LANGCHAIN:
+            if span["attributes"]["openinference.span.kind"] == "AGENT":
+                content = span["attributes"]["output.value"]
+                # If it's langchain, the actual content is a serialized langchain message that we need to extract.
+                message = json.loads(content)["messages"][0]
+                message = parse_generic_key_value_string(message)
+                base_message = BaseMessage(**message, type="AGENT")
+                print(base_message.text())
+                return base_message.text()
+        elif agent_type == AgentType.SMOLAGENTS:
+            if span["attributes"]["openinference.span.kind"] == "AGENT":
+                content = span["attributes"]["output.value"]
+                # For smolagents, the output value is already plain text.
+                return content
+        elif agent_type == AgentType.OPENAI:
+            # Looking for the final response that has the summary answer
+            if (
+                "attributes" in span
+                and span.get("attributes", {}).get("openinference.span.kind") == "LLM"
+            ):
+                output_key = (
+                    "llm.output_messages.0.message.contents.0.message_content.text"
+                )
+                if output_key in span["attributes"]:
+                    return span["attributes"][output_key]
+        else:
+            raise ValueError(f"Unsupported agent type {agent_type}")
+    raise ValueError("No agent final answer found in trace")
+
+
+def parse_generic_key_value_string(text):
+    """
+    Parse a string that has items of a dict with key-value pairs separated by '='.
+    Only splits on '=' signs, handling quoted strings properly.
+    I think this is to compensate for a bug in openinference? https://github.com/Arize-ai/openinference/issues/1401
+    """
+
+    # Pattern to match key=value pairs, handling quoted values
+    # This regex looks for word characters followed by = and then captures everything
+    # until it finds another word character followed by = or the end of the string
+    # Claude helped me with this one, regex is hard
+    pattern = r"(\w+)=('.*?'|\".*?\"|[^'\"=]*?)(?=\s+\w+=|\s*$)"
+
+    result = {}
+
+    matches = re.findall(pattern, text)
+    for key, value in matches:
+        # Clean up the key
+        key = key.strip()
+
+        # Clean up the value - remove surrounding quotes if present
+        if (value.startswith("'") and value.endswith("'")) or (
+            value.startswith('"') and value.endswith('"')
+        ):
+            value = value[1:-1]
+
+        # Store in result dictionary
+        result[key] = value
+
+    return result
+
+
+def extract_evidence(telemetry: List[Dict[str, Any]], agent_type: AgentType) -> str:
+    """Extract relevant telemetry evidence based on the agent type."""
+    # Data extraction function for each agent type
+    extractors = {
+        AgentType.SMOLAGENTS: _extract_smolagents_data,
+        AgentType.LANGCHAIN: _extract_langchain_data,
+        AgentType.OPENAI: _extract_openai_data,
+    }
+
+    if agent_type not in extractors:
+        raise ValueError(f"Unsupported agent type {agent_type}")
+
+    # Extract raw data from telemetry
+    calls = extractors[agent_type](telemetry)
+
+    # Format data into a consistent structure
+    return _format_evidence(calls, agent_type)
+
+
+def _extract_smolagents_data(telemetry: List[Dict[str, Any]]) -> List[Dict]:
+    """Extract LLM calls and tool calls from smolagents telemetry."""
+    calls = []
+
+    for span in telemetry:
+        # Skip spans without attributes
+        if "attributes" not in span:
+            continue
+
+        attributes = span["attributes"]
+
+        # Extract tool information
+        if "tool.name" in attributes or span.get("name", "").startswith("SimpleTool"):
+            tool_info = {
+                "tool_name": attributes.get(
+                    "tool.name", span.get("name", "Unknown tool")
+                ),
+                "status": "success"
+                if span.get("status", {}).get("status_code") == "OK"
+                else "error",
+                "error": span.get("status", {}).get("description", None),
+            }
+
+            # Extract input if available
+            if "input.value" in attributes:
+                try:
+                    input_value = json.loads(attributes["input.value"])
+                    if "kwargs" in input_value:
+                        # For smolagents, the actual input is often in the kwargs field
+                        tool_info["input"] = input_value["kwargs"]
+                    else:
+                        tool_info["input"] = input_value
+                except (json.JSONDecodeError, TypeError):
+                    tool_info["input"] = attributes["input.value"]
+
+            # Extract output if available
+            if "output.value" in attributes:
+                try:
+                    # Try to parse JSON output
+                    output_value = (
+                        json.loads(attributes["output.value"])
+                        if isinstance(attributes["output.value"], str)
+                        else attributes["output.value"]
+                    )
+                    tool_info["output"] = output_value
+                except (json.JSONDecodeError, TypeError):
+                    tool_info["output"] = attributes["output.value"]
+            else:
+                tool_info["output"] = "No output found"
+
+            calls.append(tool_info)
+
+        # Extract LLM calls to see reasoning
+        elif "LiteLLMModel.__call__" in span.get("name", ""):
+            # The LLM output may be in different places depending on the implementation
+            output_content = None
+
+            # Try to get the output from the llm.output_messages.0.message.content attribute
+            if "llm.output_messages.0.message.content" in attributes:
+                output_content = attributes["llm.output_messages.0.message.content"]
+
+            # Or try to parse it from the output.value as JSON
+            elif "output.value" in attributes:
+                try:
+                    output_value = json.loads(attributes["output.value"])
+                    if "content" in output_value:
+                        output_content = output_value["content"]
+                except (json.JSONDecodeError, TypeError):
+                    pass
+
+            if output_content:
+                calls.append(
+                    {
+                        "model": attributes.get("llm.model_name", "Unknown model"),
+                        "output": output_content,
+                        "type": "reasoning",
+                    }
+                )
+
+    return calls
+
+
+def _extract_langchain_data(telemetry: List[Dict[str, Any]]) -> List:
+    """Extract LLM calls and tool calls from LangChain telemetry."""
+    calls = []
+
+    for span in telemetry:
+        if "attributes" not in span:
+            continue
+
+        attributes = span.get("attributes", {})
+        span_kind = attributes.get("openinference.span.kind", "")
+
+        # Collect LLM calls
+        if span_kind == "LLM" and "llm.output_messages.0.message.content" in attributes:
+            llm_info = {
+                "model": attributes.get("llm.model_name", "Unknown model"),
+                "input": attributes.get("llm.input_messages.0.message.content", ""),
+                "output": attributes.get("llm.output_messages.0.message.content", ""),
+                "type": "reasoning",
+            }
+            calls.append(llm_info)
+
+        # Try to find tool calls
+        if "tool.name" in attributes or span.get("name", "").endswith("Tool"):
+            tool_info = {
+                "tool_name": attributes.get(
+                    "tool.name", span.get("name", "Unknown tool")
+                ),
+                "status": "success"
+                if span.get("status", {}).get("status_code") == "OK"
+                else "error",
+                "error": span.get("status", {}).get("description", None),
+            }
+
+            if "input.value" in attributes:
+                try:
+                    input_value = json.loads(attributes["input.value"])
+                    tool_info["input"] = input_value
+                except Exception:
+                    tool_info["input"] = attributes["input.value"]
+
+            if "output.value" in attributes:
+                tool_info["output"] = parse_generic_key_value_string(
+                    json.loads(attributes["output.value"])["output"]
+                )["content"]
+
+            calls.append(tool_info)
+
+    return calls
+
+
+def _extract_openai_data(telemetry: List[Dict[str, Any]]) -> list:
+    """Extract LLM calls and tool calls from OpenAI telemetry."""
+    calls = []
+
+    for span in telemetry:
+        if "attributes" not in span:
+            continue
+
+        attributes = span.get("attributes", {})
+        span_kind = attributes.get("openinference.span.kind", "")
+
+        # Collect LLM interactions - look for direct message content first
+        if span_kind == "LLM":
+            # Initialize the LLM info dictionary
+            span_info = {}
+
+            # Try to get input message
+            input_key = "llm.input_messages.1.message.content"  # User message is usually at index 1
+            if input_key in attributes:
+                span_info["input"] = attributes[input_key]
+
+            # Try to get output message directly
+            output_content = None
+            # Try in multiple possible locations
+            for key in [
+                "llm.output_messages.0.message.content",
+                "llm.output_messages.0.message.contents.0.message_content.text",
+            ]:
+                if key in attributes:
+                    output_content = attributes[key]
+                    break
+
+            # If we found direct output content, use it
+            if output_content:
+                span_info["output"] = output_content
+                calls.append(span_info)
+        elif span_kind == "TOOL":
+            tool_name = attributes.get("tool.name", "Unknown tool")
+            tool_output = attributes.get("output.value", "")
+
+            span_info = {
+                "tool_name": tool_name,
+                "input": attributes.get("input.value", ""),
+                "output": tool_output,
+                "status": span.get("status", {}).get("status_code"),
+            }
+            span_info["input"] = json.loads(span_info["input"])
+
+            calls.append(span_info)
+
+    return calls
+
+
+def _format_evidence(calls: List[Dict], agent_type: AgentType) -> str:
+    """Format extracted data into a standardized output format."""
+    evidence = f"## {agent_type.name} Agent Execution\n\n"
+
+    for idx, call in enumerate(calls, start=1):
+        evidence += f"### Call {idx}\n"
+
+        # Truncate any values that are too long
+        max_length = 400
+        call = {
+            k: (
+                v[:max_length] + "..."
+                if isinstance(v, str) and len(v) > max_length
+                else v
+            )
+            for k, v in call.items()
+        }
+
+        # Use ensure_ascii=False to prevent escaping Unicode characters
+        evidence += json.dumps(call, indent=2, ensure_ascii=False) + "\n\n"
+
+    return evidence
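
To see what `parse_generic_key_value_string` compensates for: per the linked openinference issue, the LangChain instrumentation can serialize a message as a repr-style `key=value` string rather than JSON. A quick check against such a string (the sample payload is hypothetical but matches that shape):

```python
from surf_spot_finder.evaluation.telemetry_utils import parse_generic_key_value_string

# Repr-style payload of the kind the instrumentation emits for a message.
sample = "content='Playa de Samil looks best' additional_kwargs={} type='ai'"
parsed = parse_generic_key_value_string(sample)
assert parsed["content"] == "Playa de Samil looks best"
assert parsed["type"] == "ai"
print(parsed)
```
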
src/surf_spot_finder/evaluation/test_case.py CHANGED
@@ -15,6 +15,7 @@ class InputModel(BaseModel):
     json_tracer: bool
     api_base: Optional[str] = None
     agent_type: str
+    tools: Optional[List[str]] = None
 
 
 class CheckpointCriteria(BaseModel):
src/surf_spot_finder/evaluation/test_cases/alpha.yaml CHANGED
@@ -1,30 +1,65 @@
 # Test case for surf spot finder
+
+# You only need this input data if you want to run the test case; if you pass
+# in a path to a telemetry file, this is ignored
 input:
   location: "Vigo"
-  date: "2025-03-15 22:00"
+  date: "2025-03-27 22:00"
   max_driving_hours: 3
-  model_id: "openai/o3-mini"
   api_key_var: "OPENAI_API_KEY"
   json_tracer: true
   api_base: null
-  agent_type: "smolagents"
-
+  # model_id: "openai/o1"
+  # agent_type: "smolagents"
+  # tools:
+  #   - "surf_spot_finder.tools.driving_hours_to_meters"
+  #   - "surf_spot_finder.tools.get_area_lat_lon"
+  #   - "surf_spot_finder.tools.get_surfing_spots"
+  #   - "surf_spot_finder.tools.get_wave_forecast"
+  #   - "surf_spot_finder.tools.get_wind_forecast"
+  #   - "surf_spot_finder.tools.search_web"
+  #   - "surf_spot_finder.tools.visit_webpage"
+  #   - "smolagents.PythonInterpreterTool"
+  #   - "smolagents.FinalAnswerTool"
+  agent_type: langchain
+  model_id: o1
+  tools:
+    - "surf_spot_finder.tools.driving_hours_to_meters"
+    - "surf_spot_finder.tools.get_area_lat_lon"
+    - "surf_spot_finder.tools.get_surfing_spots"
+    - "surf_spot_finder.tools.get_wave_forecast"
+    - "surf_spot_finder.tools.get_wind_forecast"
+    - "surf_spot_finder.tools.search_web"
+    - "surf_spot_finder.tools.visit_webpage"
+  # model_id: o3-mini
+  # agent_type: openai
+  # tools:
+  #   - "surf_spot_finder.tools.driving_hours_to_meters"
+  #   - "surf_spot_finder.tools.get_area_lat_lon"
+  #   - "surf_spot_finder.tools.get_surfing_spots"
+  #   - "surf_spot_finder.tools.get_wave_forecast"
+  #   - "surf_spot_finder.tools.get_wind_forecast"
+  #   - "surf_spot_finder.tools.search_web"
+  #   - "surf_spot_finder.tools.show_plan"
+  #   - "surf_spot_finder.tools.visit_webpage"
 ground_truth:
   - name: "Surf location"
     points: 5
-    value: "Playa de Patos"
-  - name: "Water temperature"
-    points: 1
-    value: "about 14°C +-5°C"
-  - name: "Wave height"
-    points: 1
-    value: "about 1 meter"
+    value: "Playa de Samil"
 
 # Base checkpoints for agent behavior
 # These evaluators for these checkpoints
 # will not consider the hypothesis answer or final answer in their decision making
 checkpoints:
   - points: 1
-    criteria: "Check if the agent consulted DuckDuckGoSearchTool for locations near Vigo."
+    criteria: "Check if the agent did a web search for nearby surf locations."
+  - points: 1
+    criteria: "Check if the agent used the get_surfing_spots tool and it succeeded"
+  - points: 1
+    criteria: "Check if the agent used the get_wave_forecast tool and it succeeded"
+  - points: 1
+    criteria: "Check if the agent used the get_wind_forecast tool and it succeeded"
+  - points: 1
+    criteria: "Check if the agent used the get_area_lat_lon tool and it succeeded"
   - points: 1
-    criteria: "Check if the agent fetched a website for forecasting, not relying on text from a DuckDuckGo search."
+    criteria: "Check if the final answer contains any description about the weather at the chosen location"
src/surf_spot_finder/evaluation/utils.py CHANGED
@@ -4,10 +4,36 @@ import re
 
 from litellm import completion
 from textwrap import dedent
+from loguru import logger
 
 from pydantic import BaseModel, ConfigDict
+from surf_spot_finder.evaluation.telemetry_utils import extract_evidence
 from surf_spot_finder.evaluation.test_case import CheckpointCriteria
 
+from surf_spot_finder.agents import AgentType
+
+
+def determine_agent_type(trace: List[Dict[str, Any]]) -> AgentType:
+    """Determine the agent type based on the trace.
+    These checks are not really stable ways to find it; we're waiting on a
+    reliable method for determining the agent type. This is a temporary solution.
+    """
+    for span in trace:
+        if "langchain" in span.get("attributes", {}).get("input.value", ""):
+            logger.info("Agent type is LANGCHAIN")
+            return AgentType.LANGCHAIN
+        if span.get("attributes", {}).get("smolagents.max_steps"):
+            logger.info("Agent type is SMOLAGENTS")
+            return AgentType.SMOLAGENTS
+        # This is extremely fragile but there currently isn't
+        # any specific key to indicate the agent type
+        if span.get("name") == "response":
+            logger.info("Agent type is OPENAI")
+            return AgentType.OPENAI
+    raise ValueError(
+        "Could not determine agent type from trace, or agent type not supported"
+    )
+
 
 class EvaluationResult(BaseModel):
     """Represents the result of evaluating a criterion"""
@@ -19,15 +45,6 @@ class EvaluationResult(BaseModel):
     points: int
 
 
-def extract_hypothesis_answer(telemetry: List[Dict[str, Any]]) -> str | None:
-    """Extract the hypothesis agent final answer from the telemetry data"""
-    for span in reversed(telemetry):
-        if span.get("attributes", {}).get("openinference.span.kind") == "AGENT":
-            hypo = span.get("attributes", {}).get("output.value")
-            return hypo
-    raise ValueError("Final answer not found in telemetry")
-
-
 def evaluate_criterion(
     criteria: str,
     model: str,
@@ -109,6 +126,7 @@ def verify_checkpoints(
     telemetry: List[Dict[str, Any]],
     checkpoints: List[CheckpointCriteria],
     model: str,
+    agent_type: AgentType,
 ) -> List[EvaluationResult]:
     """Verify each checkpoint against the telemetry data using LLM
     These checkpoints do not take the ground truth or hypothesis
@@ -117,9 +135,10 @@ def verify_checkpoints(
     """
     results = []
 
+    evidence = extract_evidence(telemetry, agent_type)
+    print(evidence)
     for checkpoint in checkpoints:
         criteria = checkpoint.criteria
-        evidence = extract_relevant_evidence(telemetry, criteria)
 
         evaluation = evaluate_criterion(
             criteria=criteria,
@@ -156,62 +175,3 @@ def verify_hypothesis_answer(
         results.append(evaluation)
 
     return results
-
-
-def extract_relevant_evidence(telemetry: List[Dict[str, Any]], criteria: str) -> str:
-    """Extract relevant telemetry evidence based on the checkpoint criteria
-    TODO this is not a very robust implementation, since it requires knowledge about which tools have been
-    implemented. We should abstract this so that it can dynamically figure out what tools may have been used
-    and check for them appropriately. I understand that this tool should probably have some better way of abstracting
-    relevant information from the opentelemetry spans."""
-    evidence = ""
-
-    # Look for evidence of tool usage
-    if "DuckDuckGoSearchTool" in criteria:
-        search_spans = [
-            span for span in telemetry if span.get("name") == "DuckDuckGoSearchTool"
-        ]
-        evidence += f"Search tool was used {len(search_spans)} times.\n"
-        for i, span in enumerate(search_spans):  # Limit to first 3 searches
-            if "attributes" in span and "input.value" in span["attributes"]:
-                try:
-                    input_value = json.loads(span["attributes"]["input.value"])
-                    if "kwargs" in input_value and "query" in input_value["kwargs"]:
-                        evidence += (
-                            f"Search query {i + 1}: {input_value['kwargs']['query']}\n"
-                        )
-                except (json.JSONDecodeError, TypeError):
-                    pass
-
-    # Look for evidence of website fetching
-    if "fetched a website" in criteria:
-        fetch_spans = [
-            span
-            for span in telemetry
-            if span.get("attributes", {}).get("tool.name") == "fetch"
-        ]
-        evidence += f"Website fetch tool was used {len(fetch_spans)} times.\n"
-        for i, span in enumerate(fetch_spans):  # Limit to first 3 fetches
-            if "attributes" in span and "input.value" in span["attributes"]:
-                try:
-                    input_value = json.loads(span["attributes"]["input.value"])
-                    if "kwargs" in input_value and "url" in input_value["kwargs"]:
-                        evidence += (
-                            f"Fetched URL {i + 1}: {input_value['kwargs']['url']}\n"
-                        )
-                except (json.JSONDecodeError, TypeError):
-                    pass
-
-    # Add general evidence about all tool calls
-    tool_calls = {}
-    for span in telemetry:
-        if "name" in span and span["name"] not in tool_calls:
-            tool_calls[span["name"]] = 1
-        elif "name" in span:
-            tool_calls[span["name"]] += 1
-
-    evidence += "\nTool calls summary:\n"
-    for tool, count in tool_calls.items():
-        evidence += f"- {tool}: {count} call(s)\n"
-
-    return evidence
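
The heuristics in `determine_agent_type` key off framework-specific span fields: smolagents sets a `smolagents.max_steps` attribute, LangChain traces carry the string "langchain" inside serialized `input.value` payloads, and the OpenAI Agents instrumentation currently emits a span literally named `response`. A toy trace per framework, reduced to just the field each branch reads (fake spans, for illustration only):

```python
from surf_spot_finder.agents import AgentType
from surf_spot_finder.evaluation.utils import determine_agent_type

# Minimal fake spans showing which field trips each heuristic branch.
smolagents_trace = [{"attributes": {"smolagents.max_steps": 20}}]
langchain_trace = [{"attributes": {"input.value": '{"id": ["langchain", "schema"]}'}}]
openai_trace = [{"name": "response", "attributes": {}}]

assert determine_agent_type(smolagents_trace) == AgentType.SMOLAGENTS
assert determine_agent_type(langchain_trace) == AgentType.LANGCHAIN
assert determine_agent_type(openai_trace) == AgentType.OPENAI
```
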
src/surf_spot_finder/tracing.py CHANGED
@@ -1,11 +1,10 @@
 import os
 import json
 from datetime import datetime
-
-from opentelemetry import trace
 from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import SimpleSpanProcessor
-from opentelemetry.sdk.trace.export import SpanExporter
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExporter
+
+from surf_spot_finder.agents import AgentType
 
 
 class JsonFileSpanExporter(SpanExporter):
@@ -44,7 +43,10 @@
 
 
 def get_tracer_provider(
-    project_name: str, json_tracer: bool, output_dir: str = "telemetry_output"
+    project_name: str,
+    json_tracer: bool,
+    agent_type: AgentType,
+    output_dir: str = "telemetry_output",
 ) -> tuple[TracerProvider, str | None]:
     """
     Create a tracer_provider based on the selected mode.
@@ -52,6 +54,7 @@ def get_tracer_provider(
     Args:
         project_name: Name of the project for tracing
        json_tracer: Whether to use the custom JSON file exporter (True) or Phoenix (False)
+        agent_type: The type of agent being used.
        output_dir: The directory where the telemetry output will be stored.
            Only used if `json_tracer=True`.
            Defaults to "telemetry_output".
@@ -66,9 +69,7 @@ def get_tracer_provider(
        timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
 
        tracer_provider = TracerProvider()
-       trace.set_tracer_provider(tracer_provider)
-
-       file_name = f"{output_dir}/{project_name}-{timestamp}.json"
+       file_name = f"{output_dir}/{agent_type}-{project_name}-{timestamp}.json"
        json_file_exporter = JsonFileSpanExporter(file_name=file_name)
        span_processor = SimpleSpanProcessor(json_file_exporter)
        tracer_provider.add_span_processor(span_processor)
@@ -97,14 +98,17 @@ def setup_tracing(tracer_provider: TracerProvider, agent_type: str) -> None:
     validate_agent_type(agent_type)
 
     if "openai" in agent_type:
-        from openinference.instrumentation.openai_agents import OpenAIAgentsInstrumentor
-
-        OpenAIAgentsInstrumentor().instrument(tracer_provider=tracer_provider)
+        from openinference.instrumentation.openai_agents import (
+            OpenAIAgentsInstrumentor as Instrumentor,
+        )
     elif agent_type == "smolagents":
-        from openinference.instrumentation.smolagents import SmolagentsInstrumentor
-
-        SmolagentsInstrumentor().instrument(tracer_provider=tracer_provider)
+        from openinference.instrumentation.smolagents import (
+            SmolagentsInstrumentor as Instrumentor,
+        )
     elif agent_type == "langchain":
-        from openinference.instrumentation.langchain import LangChainInstrumentor
-
-        LangChainInstrumentor().instrument(tracer_provider=tracer_provider)
+        from openinference.instrumentation.langchain import (
+            LangChainInstrumentor as Instrumentor,
+        )
+    else:
+        raise ValueError(f"Unsupported agent type: {agent_type}")
+    Instrumentor().instrument(tracer_provider=tracer_provider)
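
Taken together, the tracing changes mean the caller now owns the provider (the global `trace.set_tracer_provider` call is gone) and the output filename is self-describing, since it embeds the agent type. A hedged usage sketch (values illustrative; the file path follows the `{output_dir}/{agent_type}-{project_name}-{timestamp}.json` template above):

```python
from surf_spot_finder.agents import AgentType
from surf_spot_finder.tracing import get_tracer_provider, setup_tracing

# With json_tracer=True, spans are written to a JSON file whose name encodes
# the agent type, so evaluation can tell runs apart by filename alone.
tracer_provider, tracing_path = get_tracer_provider(
    project_name="surf-spot-finder",
    json_tracer=True,
    agent_type=AgentType.LANGCHAIN,
)
setup_tracing(tracer_provider, AgentType.LANGCHAIN.value)
print(tracing_path)
```
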
tests/unit/test_unit_tracing.py CHANGED
@@ -2,31 +2,27 @@ from unittest.mock import patch, MagicMock
 
 import pytest
 
+from surf_spot_finder.agents import AgentType
 from surf_spot_finder.tracing import get_tracer_provider, setup_tracing
 
 
 @pytest.mark.parametrize("json_tracer", [True, False])
 def test_get_tracer_provider(tmp_path, json_tracer):
-    mock_trace = MagicMock()
     mock_tracer_provider = MagicMock()
     mock_register = MagicMock()
 
     with (
-        patch("surf_spot_finder.tracing.trace", mock_trace),
         patch("surf_spot_finder.tracing.TracerProvider", mock_tracer_provider),
         patch("phoenix.otel.register", mock_register),
     ):
        get_tracer_provider(
            project_name="test_project",
            json_tracer=json_tracer,
+            agent_type=AgentType.SMOLAGENTS,
            output_dir=tmp_path / "telemetry",
        )
    assert (tmp_path / "telemetry").exists() == json_tracer
-    if json_tracer:
-        mock_trace.set_tracer_provider.assert_called_once_with(
-            mock_tracer_provider.return_value
-        )
-    else:
+    if not json_tracer:
        mock_register.assert_called_once_with(
            project_name="test_project", set_global_tracer_provider=True
        )