Spaces:

mozilla-ai
/

surf-spot-finder

Building

App Files Files Community

Nathan Brake committed on Mar 24

Commit

fea07c2

unverified ·

1 Parent(s): ef766f7

Split telemetry processing into cleaner classes, support ollama (#31)

Browse files

Files changed (13) hide show

.pylintrc +2 -0
examples/langchain_single_agent_vertical.yaml +1 -0
pyproject.toml +1 -0
src/surf_spot_finder/agents/langchain.py +5 -2
src/surf_spot_finder/evaluation/evaluate.py +20 -27
src/surf_spot_finder/evaluation/telemetry/__init__.py +3 -0
src/surf_spot_finder/evaluation/telemetry/langchain_telemetry.py +80 -0
src/surf_spot_finder/evaluation/telemetry/openai_telemetry.py +102 -0
src/surf_spot_finder/evaluation/telemetry/smolagents_telemetry.py +103 -0
src/surf_spot_finder/evaluation/telemetry/telemetry.py +125 -0
src/surf_spot_finder/evaluation/telemetry_utils.py +0 -301
src/surf_spot_finder/evaluation/test_case.py +2 -15
src/surf_spot_finder/evaluation/utils.py +3 -29

.pylintrc ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [MESSAGES CONTROL]
2	+ disable=C0415

examples/langchain_single_agent_vertical.yaml CHANGED Viewed

@@ -5,6 +5,7 @@ input:
   # input_prompt_template:
 agent:
   model_id: o3-mini
   agent_type: langchain
   tools:
   - "surf_spot_finder.tools.driving_hours_to_meters"

   # input_prompt_template:
 agent:
   model_id: o3-mini
+  # model_id: ollama/llama3.2:3b
   agent_type: langchain
   tools:
   - "surf_spot_finder.tools.driving_hours_to_meters"

pyproject.toml CHANGED Viewed

@@ -21,6 +21,7 @@ langchain = [
   "langchain",
   "langgraph",
   "langchain-openai>=0.3.9",
   "openinference-instrumentation-langchain"
 ]
 smolagents = [

   "langchain",
   "langgraph",
   "langchain-openai>=0.3.9",
+  "langchain-ollama>=0.3.0",
   "openinference-instrumentation-langchain"
 ]
 smolagents = [

src/surf_spot_finder/agents/langchain.py CHANGED Viewed

@@ -51,8 +51,11 @@ def run_lanchain_agent(
         if not isinstance(imported_tool, BaseTool):
             imported_tool = tool(imported_tool)
         imported_tools.append((imported_tool))
-    model = init_chat_model(model_id)
     agent = create_react_agent(
         model=model, tools=imported_tools, checkpointer=MemorySaver()
     )

         if not isinstance(imported_tool, BaseTool):
             imported_tool = tool(imported_tool)
         imported_tools.append((imported_tool))
+    if "/" in model_id:
+        model_provider, model_id = model_id.split("/")
+        model = init_chat_model(model_id, model_provider=model_provider)
+    else:
+        model = init_chat_model(model_id)
     agent = create_react_agent(
         model=model, tools=imported_tools, checkpointer=MemorySaver()
     )

src/surf_spot_finder/evaluation/evaluate.py CHANGED Viewed

@@ -8,13 +8,11 @@ from surf_spot_finder.cli import find_surf_spot
 from surf_spot_finder.config import (
     Config,
 )
-from surf_spot_finder.prompts.shared import INPUT_PROMPT
 from surf_spot_finder.evaluation.utils import (
-    determine_agent_type,
     verify_checkpoints,
     verify_hypothesis_answer,
 )
-from surf_spot_finder.evaluation.telemetry_utils import extract_hypothesis_answer
 from surf_spot_finder.evaluation.test_case import TestCase
 logger.remove()
@@ -22,22 +20,15 @@ logger = logger.opt(ansi=True)
 logger.add(sys.stdout, colorize=True, format="{message}")
-def run_agent(test_case: TestCase) -> str:
     input_data = test_case.input
-    agent_config = test_case.agent
     logger.info("Loading config")
-    config = Config(
-        location=input_data.location,
-        date=input_data.date,
-        max_driving_hours=input_data.max_driving_hours,
-        model_id=agent_config.model_id,
-        api_key_var=agent_config.api_key_var,
-        prompt=INPUT_PROMPT,
-        json_tracer=input_data.json_tracer,
-        api_base=agent_config.api_base,
-        agent_type=agent_config.agent_type,
-        tools=agent_config.tools,
-    )
     return find_surf_spot(
         location=config.location,
         date=config.date,
@@ -57,12 +48,11 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
         telemetry: List[Dict[str, Any]] = json.loads(f.read())
     logger.info(f"Telemetry loaded from {telemetry_path}")
-    agent_type = determine_agent_type(telemetry)
     # Extract the final answer from the telemetry
-    hypothesis_answer = extract_hypothesis_answer(
-        trace=telemetry, agent_type=agent_type
-    )
     logger.info(
         f"""<yellow>Hypothesis Final answer extracted: {hypothesis_answer}</yellow>"""
     )
@@ -72,7 +62,7 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
         telemetry=telemetry,
         checkpoints=test_case.checkpoints,
         model=llm_judge,
-        agent_type=agent_type,
     )
     hypothesis_answer_results = verify_hypothesis_answer(
@@ -116,7 +106,9 @@ def evaluate_telemetry(test_case: TestCase, telemetry_path: str) -> bool:
 def evaluate(
-    test_case_path: str, agent_config_path: str, telemetry_path: Optional[str] = None
 ) -> None:
     """
     Evaluate agent performance using either a provided telemetry file or by running the agent.
@@ -125,15 +117,16 @@ def evaluate(
         telemetry_path: Optional path to an existing telemetry file. If not provided,
                         the agent will be run to generate one.
     """
-    test_case = TestCase.from_yaml(
-        test_case_path=test_case_path, agent_config_path=agent_config_path
-    )
     if telemetry_path is None:
         logger.info(
             "No telemetry path provided. Running agent to generate telemetry..."
         )
-        telemetry_path = run_agent(test_case)
     else:
         logger.info(f"Using provided telemetry file: {telemetry_path}")
         logger.info(

 from surf_spot_finder.config import (
     Config,
 )
+from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
 from surf_spot_finder.evaluation.utils import (
     verify_checkpoints,
     verify_hypothesis_answer,
 )
 from surf_spot_finder.evaluation.test_case import TestCase
 logger.remove()
 logger.add(sys.stdout, colorize=True, format="{message}")
+def run_agent(test_case: TestCase, agent_config_path: str) -> str:
     input_data = test_case.input
     logger.info("Loading config")
+    config = Config.from_yaml(agent_config_path)
+    config.location = input_data.location
+    config.date = input_data.date
+    config.max_driving_hours = input_data.max_driving_hours
+    config.json_tracer = input_data.json_tracer
     return find_surf_spot(
         location=config.location,
         date=config.date,
         telemetry: List[Dict[str, Any]] = json.loads(f.read())
     logger.info(f"Telemetry loaded from {telemetry_path}")
+    agent_type = TelemetryProcessor.determine_agent_type(telemetry)
     # Extract the final answer from the telemetry
+    processor = TelemetryProcessor.create(agent_type)
+    hypothesis_answer = processor.extract_hypothesis_answer(trace=telemetry)
     logger.info(
         f"""<yellow>Hypothesis Final answer extracted: {hypothesis_answer}</yellow>"""
     )
         telemetry=telemetry,
         checkpoints=test_case.checkpoints,
         model=llm_judge,
+        processor=processor,
     )
     hypothesis_answer_results = verify_hypothesis_answer(
 def evaluate(
+    test_case_path: str,
+    agent_config_path: str = None,
+    telemetry_path: Optional[str] = None,
 ) -> None:
     """
     Evaluate agent performance using either a provided telemetry file or by running the agent.
         telemetry_path: Optional path to an existing telemetry file. If not provided,
                         the agent will be run to generate one.
     """
+    test_case = TestCase.from_yaml(test_case_path=test_case_path)
     if telemetry_path is None:
         logger.info(
             "No telemetry path provided. Running agent to generate telemetry..."
         )
+        assert (
+            agent_config_path is not None
+        ), "Agent config path must be provided if running agent"
+        telemetry_path = run_agent(test_case, agent_config_path)
     else:
         logger.info(f"Using provided telemetry file: {telemetry_path}")
         logger.info(

src/surf_spot_finder/evaluation/telemetry/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .telemetry import TelemetryProcessor
2	+
3	+ __all__ = ["TelemetryProcessor"]

src/surf_spot_finder/evaluation/telemetry/langchain_telemetry.py ADDED Viewed

	@@ -0,0 +1,80 @@

+from typing import Any, Dict, List
+import json
+from langchain_core.messages import BaseMessage
+from surf_spot_finder.agents import AgentType
+from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
+class LangchainTelemetryProcessor(TelemetryProcessor):
+    """Processor for Langchain agent telemetry data.
+
+    Works on a list of span dicts whose ``attributes`` mapping uses
+    ``openinference.*``, ``llm.*``, ``tool.*``, ``input.value`` and
+    ``output.value`` keys.
+    """
+    def _get_agent_type(self) -> AgentType:
+        """Return the agent type handled by this processor."""
+        return AgentType.LANGCHAIN
+    def extract_hypothesis_answer(self, trace: List[Dict[str, Any]]) -> str:
+        """Return the text of the first output message of the last AGENT span.
+
+        Args:
+            trace: Telemetry spans, searched from last to first.
+
+        Raises:
+            ValueError: If no span of kind ``AGENT`` is found.
+        """
+        for span in reversed(trace):
+            # Spans without attributes (or without a span kind) cannot hold the
+            # agent answer; skip them instead of failing with a KeyError.
+            attributes = span.get("attributes", {})
+            if attributes.get("openinference.span.kind") != "AGENT":
+                continue
+            content = attributes["output.value"]
+            # Extract content from serialized langchain message
+            message = json.loads(content)["messages"][0]
+            message = self.parse_generic_key_value_string(message)
+            base_message = BaseMessage(content=message["content"], type="AGENT")
+            return base_message.text()
+        raise ValueError("No agent final answer found in trace")
+    def _extract_telemetry_data(self, telemetry: List[Dict[str, Any]]) -> List[Dict]:
+        """Extract LLM calls and tool calls from LangChain telemetry.
+
+        Args:
+            telemetry: Telemetry spans; spans without ``attributes`` are ignored.
+
+        Returns:
+            One dict per LLM call (``model``, ``input``, ``output``, ``type``)
+            and per tool call (``tool_name``, ``status``, ``error`` and, when
+            present, ``input`` / ``output``), in span order.
+        """
+        calls = []
+        for span in telemetry:
+            if "attributes" not in span:
+                continue
+            attributes = span["attributes"]
+            span_kind = attributes.get("openinference.span.kind", "")
+            # Collect LLM calls
+            if (
+                span_kind == "LLM"
+                and "llm.output_messages.0.message.content" in attributes
+            ):
+                calls.append(
+                    {
+                        "model": attributes.get("llm.model_name", "Unknown model"),
+                        "input": attributes.get(
+                            "llm.input_messages.0.message.content", ""
+                        ),
+                        "output": attributes["llm.output_messages.0.message.content"],
+                        "type": "reasoning",
+                    }
+                )
+            # Try to find tool calls
+            if "tool.name" in attributes or span.get("name", "").endswith("Tool"):
+                status = span.get("status", {})
+                tool_info = {
+                    "tool_name": attributes.get(
+                        "tool.name", span.get("name", "Unknown tool")
+                    ),
+                    "status": "success"
+                    if status.get("status_code") == "OK"
+                    else "error",
+                    "error": status.get("description", None),
+                }
+                if "input.value" in attributes:
+                    raw_input = attributes["input.value"]
+                    try:
+                        tool_info["input"] = json.loads(raw_input)
+                    except (ValueError, TypeError):
+                        # Not JSON: keep the raw value as evidence.
+                        tool_info["input"] = raw_input
+                if "output.value" in attributes:
+                    raw_output = attributes["output.value"]
+                    try:
+                        tool_info["output"] = self.parse_generic_key_value_string(
+                            json.loads(raw_output)["output"]
+                        )["content"]
+                    except (ValueError, TypeError, KeyError):
+                        # Output is not the expected serialized message; fall
+                        # back to the raw value rather than aborting extraction.
+                        tool_info["output"] = raw_output
+                calls.append(tool_info)
+        return calls

src/surf_spot_finder/evaluation/telemetry/openai_telemetry.py ADDED Viewed

	@@ -0,0 +1,102 @@

+from typing import Any, Dict, List
+import json
+from surf_spot_finder.agents import AgentType
+from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
+class OpenAITelemetryProcessor(TelemetryProcessor):
+    """Processor for OpenAI agent telemetry data.
+
+    Works on a list of span dicts whose ``attributes`` mapping uses
+    ``openinference.span.kind``, ``llm.*``, ``tool.name``, ``input.value``
+    and ``output.value`` keys.
+    """
+    def _get_agent_type(self) -> AgentType:
+        """Return the agent type handled by this processor."""
+        return AgentType.OPENAI
+    def extract_hypothesis_answer(self, trace: List[Dict[str, Any]]) -> str:
+        """Return the output text of the last LLM span that carries one.
+
+        Args:
+            trace: Telemetry spans, searched from last to first.
+
+        Raises:
+            ValueError: If no LLM span contains the expected output key.
+        """
+        output_key = "llm.output_messages.0.message.contents.0.message_content.text"
+        for span in reversed(trace):
+            # Looking for the final response that has the summary answer
+            attributes = span.get("attributes", {})
+            if (
+                attributes.get("openinference.span.kind") == "LLM"
+                and output_key in attributes
+            ):
+                return attributes[output_key]
+        raise ValueError("No agent final answer found in trace")
+    def _extract_telemetry_data(self, telemetry: List[Dict[str, Any]]) -> list:
+        """Extract LLM calls and tool calls from OpenAI telemetry.
+
+        Args:
+            telemetry: Telemetry spans; spans without ``attributes`` are ignored.
+
+        Returns:
+            One dict per LLM span with non-empty output (``input`` when
+            available, ``output``) and per TOOL span (``tool_name``, ``input``,
+            ``output``, ``status``), in span order.
+        """
+        calls = []
+        for span in telemetry:
+            if "attributes" not in span:
+                continue
+            attributes = span["attributes"]
+            span_kind = attributes.get("openinference.span.kind", "")
+            # Collect LLM interactions - look for direct message content first
+            if span_kind == "LLM":
+                span_info = {}
+                # User message is usually at index 1
+                input_key = "llm.input_messages.1.message.content"
+                if input_key in attributes:
+                    span_info["input"] = attributes[input_key]
+                # The output may live under either key; the first one present wins.
+                output_content = None
+                for key in (
+                    "llm.output_messages.0.message.content",
+                    "llm.output_messages.0.message.contents.0.message_content.text",
+                ):
+                    if key in attributes:
+                        output_content = attributes[key]
+                        break
+                # LLM spans without usable output are not recorded.
+                if output_content:
+                    span_info["output"] = output_content
+                    calls.append(span_info)
+            elif span_kind == "TOOL":
+                raw_input = attributes.get("input.value", "")
+                try:
+                    tool_input = json.loads(raw_input)
+                except (ValueError, TypeError):
+                    # Missing or non-JSON tool input: keep the raw value instead
+                    # of aborting the whole extraction.
+                    tool_input = raw_input
+                calls.append(
+                    {
+                        "tool_name": attributes.get("tool.name", "Unknown tool"),
+                        "input": tool_input,
+                        "output": attributes.get("output.value", ""),
+                        "status": span.get("status", {}).get("status_code"),
+                    }
+                )
+        return calls
+# Backward compatibility functions that use the new class structure
+def extract_hypothesis_answer(
+    trace: List[Dict[str, Any]], agent_type: AgentType
+) -> str:
+    """Return the agent's final answer found in ``trace``.
+
+    Thin wrapper: builds the processor for ``agent_type`` and delegates to its
+    ``extract_hypothesis_answer`` method.
+    """
+    return TelemetryProcessor.create(agent_type).extract_hypothesis_answer(trace)
+def parse_generic_key_value_string(text: str) -> Dict[str, str]:
+    """Parse ``key=value`` pairs out of ``text`` into a dict.
+
+    Thin wrapper around ``TelemetryProcessor.parse_generic_key_value_string``,
+    which splits only on '=' signs and handles quoted values.
+    """
+    parsed = TelemetryProcessor.parse_generic_key_value_string(text)
+    return parsed
+def extract_evidence(telemetry: List[Dict[str, Any]], agent_type: AgentType) -> str:
+    """Return formatted telemetry evidence for the given agent type.
+
+    Thin wrapper: builds the processor for ``agent_type`` and delegates to its
+    ``extract_evidence`` method.
+    """
+    return TelemetryProcessor.create(agent_type).extract_evidence(telemetry)

src/surf_spot_finder/evaluation/telemetry/smolagents_telemetry.py ADDED Viewed

	@@ -0,0 +1,103 @@

+from typing import Any, Dict, List
+import json
+from surf_spot_finder.agents import AgentType
+from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
+class SmolagentsTelemetryProcessor(TelemetryProcessor):
+    """Processor for SmoL Agents telemetry data.
+
+    Works on a list of span dicts whose ``attributes`` mapping uses
+    ``openinference.span.kind``, ``tool.name``, ``llm.*``, ``input.value``
+    and ``output.value`` keys.
+    """
+    def _get_agent_type(self) -> AgentType:
+        """Return the agent type handled by this processor."""
+        return AgentType.SMOLAGENTS
+    def extract_hypothesis_answer(self, trace: List[Dict[str, Any]]) -> str:
+        """Return the ``output.value`` of the last AGENT span.
+
+        Args:
+            trace: Telemetry spans, searched from last to first.
+
+        Raises:
+            ValueError: If no span of kind ``AGENT`` is found.
+        """
+        for span in reversed(trace):
+            # Spans without attributes (or without a span kind) are skipped,
+            # matching how _extract_telemetry_data treats them.
+            attributes = span.get("attributes", {})
+            if attributes.get("openinference.span.kind") == "AGENT":
+                return attributes["output.value"]
+        raise ValueError("No agent final answer found in trace")
+    def _extract_telemetry_data(self, telemetry: List[Dict[str, Any]]) -> List[Dict]:
+        """Extract LLM calls and tool calls from SmoL Agents telemetry.
+
+        Args:
+            telemetry: Telemetry spans; spans without ``attributes`` are ignored.
+
+        Returns:
+            One dict per tool call (``tool_name``, ``status``, ``error``,
+            ``output`` and, when present, ``input``) and per LLM call with
+            non-empty output (``model``, ``output``, ``type``), in span order.
+        """
+        calls = []
+        for span in telemetry:
+            # Skip spans without attributes
+            if "attributes" not in span:
+                continue
+            attributes = span["attributes"]
+            span_name = span.get("name", "")
+            # Extract tool information
+            if "tool.name" in attributes or span_name.startswith("SimpleTool"):
+                status = span.get("status", {})
+                tool_info = {
+                    "tool_name": attributes.get(
+                        "tool.name", span.get("name", "Unknown tool")
+                    ),
+                    "status": "success"
+                    if status.get("status_code") == "OK"
+                    else "error",
+                    "error": status.get("description", None),
+                }
+                # Extract input if available
+                if "input.value" in attributes:
+                    try:
+                        input_value = json.loads(attributes["input.value"])
+                        if "kwargs" in input_value:
+                            # For SmoLAgents, the actual input is often in the kwargs field
+                            tool_info["input"] = input_value["kwargs"]
+                        else:
+                            tool_info["input"] = input_value
+                    except (json.JSONDecodeError, TypeError):
+                        # Not JSON (or not a container): keep the raw value.
+                        tool_info["input"] = attributes["input.value"]
+                # Extract output if available
+                if "output.value" in attributes:
+                    raw_output = attributes["output.value"]
+                    try:
+                        # Only string outputs are JSON-decoded; other values
+                        # are stored as they are.
+                        tool_info["output"] = (
+                            json.loads(raw_output)
+                            if isinstance(raw_output, str)
+                            else raw_output
+                        )
+                    except (json.JSONDecodeError, TypeError):
+                        tool_info["output"] = raw_output
+                else:
+                    tool_info["output"] = "No output found"
+                calls.append(tool_info)
+            # Extract LLM calls to see reasoning
+            elif "LiteLLMModel.__call__" in span_name:
+                # The LLM output may be in different places depending on the implementation
+                output_content = None
+                # Try to get the output from the llm.output_messages.0.message.content attribute
+                if "llm.output_messages.0.message.content" in attributes:
+                    output_content = attributes["llm.output_messages.0.message.content"]
+                # Or try to parse it from the output.value as JSON
+                elif "output.value" in attributes:
+                    try:
+                        output_value = json.loads(attributes["output.value"])
+                        if "content" in output_value:
+                            output_content = output_value["content"]
+                    except (json.JSONDecodeError, TypeError):
+                        # Unparseable output: this LLM span is not recorded.
+                        pass
+                if output_content:
+                    calls.append(
+                        {
+                            "model": attributes.get("llm.model_name", "Unknown model"),
+                            "output": output_content,
+                            "type": "reasoning",
+                        }
+                    )
+        return calls

src/surf_spot_finder/evaluation/telemetry/telemetry.py ADDED Viewed

	@@ -0,0 +1,125 @@

+from typing import Any, Dict, List, ClassVar
+import json
+import re
+from abc import ABC, abstractmethod
+from loguru import logger
+from surf_spot_finder.agents import AgentType
+class TelemetryProcessor(ABC):
+    """Base class for processing telemetry data from different agent types.
+
+    Subclasses implement ``extract_hypothesis_answer``,
+    ``_extract_telemetry_data`` and ``_get_agent_type``; this class provides
+    the factory, agent-type detection, evidence formatting and a key=value
+    string parser.
+    """
+    # Maximum number of characters kept per string value in formatted evidence.
+    MAX_EVIDENCE_LENGTH: ClassVar[int] = 400
+    @classmethod
+    def create(cls, agent_type: AgentType) -> "TelemetryProcessor":
+        """Factory method to create the appropriate telemetry processor.
+
+        Raises:
+            ValueError: If ``agent_type`` has no matching processor.
+        """
+        # Processor modules import this module, so they are imported lazily
+        # here rather than at module level.
+        if agent_type == AgentType.LANGCHAIN:
+            from surf_spot_finder.evaluation.telemetry.langchain_telemetry import (
+                LangchainTelemetryProcessor,
+            )
+            return LangchainTelemetryProcessor()
+        if agent_type == AgentType.SMOLAGENTS:
+            from surf_spot_finder.evaluation.telemetry.smolagents_telemetry import (
+                SmolagentsTelemetryProcessor,
+            )
+            return SmolagentsTelemetryProcessor()
+        if agent_type == AgentType.OPENAI:
+            from surf_spot_finder.evaluation.telemetry.openai_telemetry import (
+                OpenAITelemetryProcessor,
+            )
+            return OpenAITelemetryProcessor()
+        raise ValueError(f"Unsupported agent type {agent_type}")
+    @staticmethod
+    def determine_agent_type(trace: List[Dict[str, Any]]) -> AgentType:
+        """Determine the agent type based on the trace.
+        These are not really stable ways to find it, because we're waiting on some
+        reliable method for determining the agent type. This is a temporary solution.
+
+        Raises:
+            ValueError: If no span matches any of the known heuristics.
+        """
+        for span in trace:
+            attributes = span.get("attributes", {})
+            input_value = attributes.get("input.value", "")
+            # Only search string inputs; a non-string value (e.g. None) would
+            # otherwise raise TypeError on the substring test.
+            if isinstance(input_value, str) and "langchain" in input_value:
+                logger.info("Agent type is LANGCHAIN")
+                return AgentType.LANGCHAIN
+            if attributes.get("smolagents.max_steps"):
+                logger.info("Agent type is SMOLAGENTS")
+                return AgentType.SMOLAGENTS
+            # This is extremely fragile but there currently isn't
+            # any specific key to indicate the agent type
+            if span.get("name") == "response":
+                logger.info("Agent type is OPENAI")
+                return AgentType.OPENAI
+        raise ValueError(
+            "Could not determine agent type from trace, or agent type not supported"
+        )
+    @abstractmethod
+    def extract_hypothesis_answer(self, trace: List[Dict[str, Any]]) -> str:
+        """Extract the hypothesis agent final answer from the trace."""
+    @abstractmethod
+    def _extract_telemetry_data(self, telemetry: List[Dict[str, Any]]) -> List[Dict]:
+        """Extract the agent-specific data from telemetry."""
+    def extract_evidence(self, telemetry: List[Dict[str, Any]]) -> str:
+        """Extract relevant telemetry evidence as a formatted string."""
+        calls = self._extract_telemetry_data(telemetry)
+        return self._format_evidence(calls)
+    def _format_evidence(self, calls: List[Dict]) -> str:
+        """Format extracted data into a standardized output format.
+
+        Emits a header naming the agent type, then one numbered section per
+        call containing the call serialized as indented JSON.
+        """
+        limit = self.MAX_EVIDENCE_LENGTH
+        sections = [f"## {self._get_agent_type().name} Agent Execution\n\n"]
+        for idx, call in enumerate(calls, start=1):
+            # Truncate any top-level string values that are too long
+            trimmed = {
+                k: (v[:limit] + "..." if isinstance(v, str) and len(v) > limit else v)
+                for k, v in call.items()
+            }
+            # Use ensure_ascii=False to prevent escaping Unicode characters
+            body = json.dumps(trimmed, indent=2, ensure_ascii=False)
+            sections.append(f"### Call {idx}\n{body}\n\n")
+        return "".join(sections)
+    @abstractmethod
+    def _get_agent_type(self) -> AgentType:
+        """Get the agent type associated with this processor."""
+    @staticmethod
+    def parse_generic_key_value_string(text: str) -> Dict[str, str]:
+        """
+        Parse a string that has items of a dict with key-value pairs separated by '='.
+        Only splits on '=' signs, handling quoted strings properly.
+
+        Returns:
+            Mapping of each key to its value, with one pair of surrounding
+            single or double quotes removed from the value when present.
+        """
+        # A key is a run of word characters; a value is a quoted string or an
+        # unquoted run, ending before the next `key=` or the end of the text.
+        pattern = r"(\w+)=('.*?'|\".*?\"|[^'\"=]*?)(?=\s+\w+=|\s*$)"
+        result = {}
+        for key, value in re.findall(pattern, text):
+            # Clean up the value - remove surrounding quotes if present
+            if (value.startswith("'") and value.endswith("'")) or (
+                value.startswith('"') and value.endswith('"')
+            ):
+                value = value[1:-1]
+            result[key.strip()] = value
+        return result

src/surf_spot_finder/evaluation/telemetry_utils.py DELETED Viewed

@@ -1,301 +0,0 @@
-from typing import Any, Dict, List
-import json
-from langchain_core.messages import BaseMessage
-import re
-from surf_spot_finder.agents import AgentType
-def extract_hypothesis_answer(
-    trace: List[Dict[str, Any]], agent_type: AgentType
-) -> str:
-    """Extract the hypothesis agent final answer from the trace"""
-    for span in reversed(trace):
-        if agent_type == AgentType.LANGCHAIN:
-            if span["attributes"]["openinference.span.kind"] == "AGENT":
-                content = span["attributes"]["output.value"]
-                # If it's langchain, the actual content is a serialized langchain message that we need to extract.
-                message = json.loads(content)["messages"][0]
-                message = parse_generic_key_value_string(message)
-                base_message = BaseMessage(**message, type="AGENT")
-                print(base_message.text())
-                return base_message.text()
-        elif agent_type == AgentType.SMOLAGENTS:
-            if span["attributes"]["openinference.span.kind"] == "AGENT":
-                content = span["attributes"]["output.value"]
-                # If it's langchain, the actual content is a serialized langchain message that we need to extract.
-                return content
-        elif agent_type == AgentType.OPENAI:
-            # Looking for the final response that has the summary answer
-            if (
-                "attributes" in span
-                and span.get("attributes", {}).get("openinference.span.kind") == "LLM"
-            ):
-                output_key = (
-                    "llm.output_messages.0.message.contents.0.message_content.text"
-                )
-                if output_key in span["attributes"]:
-                    return span["attributes"][output_key]
-        else:
-            raise ValueError(f"Unsupported agent type {agent_type}")
-    raise ValueError("No agent final answer found in trace")
-def parse_generic_key_value_string(text):
-    """
-    Parse a string that has items of a dict with key-value pairs separated by '='.
-    Only splits on '=' signs, handling quoted strings properly.
-    I think this is to compensate for a bug in openinference? https://github.com/Arize-ai/openinference/issues/1401
-    """
-    # Pattern to match key=value pairs, handling quoted values
-    # This regex looks for word characters followed by = and then captures everything
-    # until it finds another word character followed by = or the end of the string
-    # Claude helped me with this one, regex is hard
-    pattern = r"(\w+)=('.*?'|\".*?\"|[^'\"=]*?)(?=\s+\w+=|\s*$)"
-    result = {}
-    matches = re.findall(pattern, text)
-    for key, value in matches:
-        # Clean up the key
-        key = key.strip()
-        # Clean up the value - remove surrounding quotes if present
-        if (value.startswith("'") and value.endswith("'")) or (
-            value.startswith('"') and value.endswith('"')
-        ):
-            value = value[1:-1]
-        # Store in result dictionary
-        result[key] = value
-    return result
-def extract_evidence(telemetry: List[Dict[str, Any]], agent_type: AgentType) -> str:
-    """Extract relevant telemetry evidence based on the agent type."""
-    # Data extraction function for each agent type
-    extractors = {
-        AgentType.SMOLAGENTS: _extract_smolagents_data,
-        AgentType.LANGCHAIN: _extract_langchain_data,
-        AgentType.OPENAI: _extract_openai_data,
-    }
-    if agent_type not in extractors:
-        raise ValueError(f"Unsupported agent type {agent_type}")
-    # Extract raw data from telemetry
-    calls = extractors[agent_type](telemetry)
-    # Format data into a consistent structure
-    return _format_evidence(calls, agent_type)
-def _extract_smolagents_data(telemetry: List[Dict[str, Any]]) -> List[Dict]:
-    """Extract LLM calls and tool calls from SmoL Agents telemetry."""
-    calls = []
-    for span in telemetry:
-        # Skip spans without attributes
-        if "attributes" not in span:
-            continue
-        attributes = span["attributes"]
-        # Extract tool information
-        if "tool.name" in attributes or span.get("name", "").startswith("SimpleTool"):
-            tool_info = {
-                "tool_name": attributes.get(
-                    "tool.name", span.get("name", "Unknown tool")
-                ),
-                "status": "success"
-                if span.get("status", {}).get("status_code") == "OK"
-                else "error",
-                "error": span.get("status", {}).get("description", None),
-            }
-            # Extract input if available
-            if "input.value" in attributes:
-                try:
-                    input_value = json.loads(attributes["input.value"])
-                    if "kwargs" in input_value:
-                        # For SmoLAgents, the actual input is often in the kwargs field
-                        tool_info["input"] = input_value["kwargs"]
-                    else:
-                        tool_info["input"] = input_value
-                except (json.JSONDecodeError, TypeError):
-                    tool_info["input"] = attributes["input.value"]
-            # Extract output if available
-            if "output.value" in attributes:
-                try:
-                    # Try to parse JSON output
-                    output_value = (
-                        json.loads(attributes["output.value"])
-                        if isinstance(attributes["output.value"], str)
-                        else attributes["output.value"]
-                    )
-                    tool_info["output"] = output_value
-                except (json.JSONDecodeError, TypeError):
-                    tool_info["output"] = attributes["output.value"]
-            else:
-                tool_info["output"] = "No output found"
-            calls.append(tool_info)
-        # Extract LLM calls to see reasoning
-        elif "LiteLLMModel.__call__" in span.get("name", ""):
-            # The LLM output may be in different places depending on the implementation
-            output_content = None
-            # Try to get the output from the llm.output_messages.0.message.content attribute
-            if "llm.output_messages.0.message.content" in attributes:
-                output_content = attributes["llm.output_messages.0.message.content"]
-            # Or try to parse it from the output.value as JSON
-            elif "output.value" in attributes:
-                try:
-                    output_value = json.loads(attributes["output.value"])
-                    if "content" in output_value:
-                        output_content = output_value["content"]
-                except (json.JSONDecodeError, TypeError):
-                    pass
-            if output_content:
-                calls.append(
-                    {
-                        "model": attributes.get("llm.model_name", "Unknown model"),
-                        "output": output_content,
-                        "type": "reasoning",
-                    }
-                )
-    return calls
def _extract_langchain_data(telemetry: List[Dict[str, Any]]) -> List:
    """Extract LLM calls and tool calls from LangChain telemetry.

    Spans without an ``attributes`` key are skipped. A single span may
    contribute both an LLM entry and a tool entry, because the two checks
    are independent.

    Args:
        telemetry: Span dictionaries. Each span is read for its
            ``attributes``, ``name`` and ``status`` keys.

    Returns:
        Call dictionaries in span order. LLM entries carry ``model``,
        ``input``, ``output`` and ``type`` (always ``"reasoning"``). Tool
        entries carry ``tool_name``, ``status`` (``"success"`` or
        ``"error"``), ``error`` and, when the span provides them, ``input``
        and ``output``.
    """
    calls = []
    for span in telemetry:
        if "attributes" not in span:
            continue
        attributes = span["attributes"]

        # Collect LLM calls: only spans that actually recorded an output message.
        is_llm_span = attributes.get("openinference.span.kind", "") == "LLM"
        if is_llm_span and "llm.output_messages.0.message.content" in attributes:
            calls.append(
                {
                    "model": attributes.get("llm.model_name", "Unknown model"),
                    "input": attributes.get(
                        "llm.input_messages.0.message.content", ""
                    ),
                    "output": attributes.get(
                        "llm.output_messages.0.message.content", ""
                    ),
                    "type": "reasoning",
                }
            )

        # Tool spans are recognised by an explicit tool name or a "...Tool" span name.
        if "tool.name" in attributes or span.get("name", "").endswith("Tool"):
            status = span.get("status", {})
            tool_info = {
                "tool_name": attributes.get(
                    "tool.name", span.get("name", "Unknown tool")
                ),
                "status": "success" if status.get("status_code") == "OK" else "error",
                "error": status.get("description", None),
            }

            if "input.value" in attributes:
                raw_input = attributes["input.value"]
                try:
                    tool_info["input"] = json.loads(raw_input)
                except (ValueError, TypeError):
                    # Not JSON (JSONDecodeError is a ValueError) or not a
                    # string at all: keep the raw value.
                    tool_info["input"] = raw_input

            if "output.value" in attributes:
                raw_output = attributes["output.value"]
                try:
                    payload = json.loads(raw_output)["output"]
                except (ValueError, TypeError, KeyError):
                    # Output is not a JSON object with an "output" field;
                    # keep the raw value instead of aborting the extraction.
                    tool_info["output"] = raw_output
                else:
                    tool_info["output"] = parse_generic_key_value_string(payload)[
                        "content"
                    ]

            calls.append(tool_info)
    return calls
def _extract_openai_data(telemetry: List[Dict[str, Any]]) -> list:
    """Extract LLM calls and tool calls from OpenAI telemetry.

    Spans without an ``attributes`` key are skipped; the remaining spans are
    dispatched on their ``openinference.span.kind`` attribute.

    Args:
        telemetry: Span dictionaries. Each span is read for its
            ``attributes`` and ``status`` keys.

    Returns:
        Call dictionaries in span order. LLM entries carry ``output`` and,
        when the span provides it, ``input``; LLM spans with no (or empty)
        output are dropped. Tool entries carry ``tool_name``, ``input``,
        ``output`` and ``status``.
    """
    # Attribute keys that may hold the model's reply, in priority order.
    output_keys = (
        "llm.output_messages.0.message.content",
        "llm.output_messages.0.message.contents.0.message_content.text",
    )
    # The user message is usually at index 1 of the input messages.
    input_key = "llm.input_messages.1.message.content"

    calls = []
    for span in telemetry:
        if "attributes" not in span:
            continue
        attributes = span["attributes"]
        span_kind = attributes.get("openinference.span.kind", "")

        if span_kind == "LLM":
            # The first key that is present wins, even if its value is empty.
            output_content = next(
                (attributes[key] for key in output_keys if key in attributes),
                None,
            )
            if not output_content:
                continue
            span_info = {}
            if input_key in attributes:
                span_info["input"] = attributes[input_key]
            span_info["output"] = output_content
            calls.append(span_info)

        elif span_kind == "TOOL":
            raw_input = attributes.get("input.value", "")
            try:
                tool_input = json.loads(raw_input)
            except (ValueError, TypeError):
                # Missing or non-JSON input (JSONDecodeError is a
                # ValueError): keep the raw value rather than failing
                # the whole extraction.
                tool_input = raw_input
            calls.append(
                {
                    "tool_name": attributes.get("tool.name", "Unknown tool"),
                    "input": tool_input,
                    "output": attributes.get("output.value", ""),
                    "status": span.get("status", {}).get("status_code"),
                }
            )
    return calls
def _format_evidence(
    calls: List[Dict], agent_type: "AgentType", *, max_length: int = 400
) -> str:
    """Format extracted calls into a standardized Markdown-style report.

    Args:
        calls: Call dictionaries to render, one numbered section each.
        agent_type: Agent type whose ``name`` is used in the report heading.
        max_length: Top-level string values longer than this are cut to
            ``max_length`` characters and suffixed with ``"..."``. Nested
            values are not truncated.

    Returns:
        The heading followed by one ``### Call N`` section per call, each
        containing the call serialized as indented JSON.
    """

    def _truncate(value):
        # Only plain strings are shortened; every other value passes through.
        if isinstance(value, str) and len(value) > max_length:
            return value[:max_length] + "..."
        return value

    parts = [f"## {agent_type.name} Agent Execution\n\n"]
    for idx, call in enumerate(calls, start=1):
        shortened = {key: _truncate(value) for key, value in call.items()}
        parts.append(f"### Call {idx}\n")
        # ensure_ascii=False keeps Unicode characters readable instead of escaped.
        parts.append(json.dumps(shortened, indent=2, ensure_ascii=False) + "\n\n")
    return "".join(parts)

src/surf_spot_finder/evaluation/test_case.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Dict, List, Optional, Any
 from pydantic import BaseModel, Field, ConfigDict
 import yaml
@@ -13,14 +13,6 @@ class InputModel(BaseModel):
     json_tracer: bool
-class AgentModel(BaseModel):  # agent settings; TestCase exposes it as its `agent` field, filled from the agent config's "agent" key
-    model_id: str  # required string, no default (the example config uses values such as "o3-mini")
-    api_key_var: str = "OPENAI_API_KEY"  # presumably the name of the env var holding the API key — TODO confirm
-    api_base: Optional[str] = None  # optional string; None when not provided
-    agent_type: str  # required string; NOTE(review): likely corresponds to an AgentType member — confirm
-    tools: Optional[List[str]] = None  # optional list of strings; the example config lists dotted import paths of tools
 class CheckpointCriteria(BaseModel):
     """Represents a checkpoint criteria with a description"""
@@ -32,20 +24,15 @@ class CheckpointCriteria(BaseModel):
 class TestCase(BaseModel):
     model_config = ConfigDict(extra="forbid")
     input: InputModel
-    agent: AgentModel
     ground_truth: List[Dict[str, Any]] = Field(default_factory=list)
     checkpoints: List[CheckpointCriteria] = Field(default_factory=list)
     final_answer_criteria: List[CheckpointCriteria] = Field(default_factory=list)
     @classmethod
-    def from_yaml(cls, test_case_path: str, agent_config_path: str) -> "TestCase":
         """Load a test case from a YAML file and process it"""
         with open(test_case_path, "r") as f:
             test_case_dict = yaml.safe_load(f)
-        with open(agent_config_path, "r") as f:
-            agent_config_dict = yaml.safe_load(f)
-        test_case_dict["agent"] = agent_config_dict["agent"]
         final_answer_criteria = []
         def add_gt_final_answer_criteria(ground_truth_list):

+from typing import Dict, List, Any
 from pydantic import BaseModel, Field, ConfigDict
 import yaml
     json_tracer: bool
 class CheckpointCriteria(BaseModel):
     """Represents a checkpoint criteria with a description"""
 class TestCase(BaseModel):
     model_config = ConfigDict(extra="forbid")
     input: InputModel
     ground_truth: List[Dict[str, Any]] = Field(default_factory=list)
     checkpoints: List[CheckpointCriteria] = Field(default_factory=list)
     final_answer_criteria: List[CheckpointCriteria] = Field(default_factory=list)
     @classmethod
+    def from_yaml(cls, test_case_path: str) -> "TestCase":
         """Load a test case from a YAML file and process it"""
         with open(test_case_path, "r") as f:
             test_case_dict = yaml.safe_load(f)
         final_answer_criteria = []
         def add_gt_final_answer_criteria(ground_truth_list):

src/surf_spot_finder/evaluation/utils.py CHANGED Viewed

@@ -4,36 +4,11 @@ import re
 from litellm import completion
 from textwrap import dedent
-from loguru import logger
 from pydantic import BaseModel, ConfigDict
-from surf_spot_finder.evaluation.telemetry_utils import extract_evidence
 from surf_spot_finder.evaluation.test_case import CheckpointCriteria
-from surf_spot_finder.agents import AgentType
def determine_agent_type(trace: List[Dict[str, Any]]) -> AgentType:
    """Guess which agent framework produced a trace.

    This is a temporary, admittedly unstable heuristic: there is not yet a
    reliable marker for the agent type, so a few tell-tale span fields are
    inspected instead.

    Spans are examined in order, and the first span matching any rule decides
    the result. Within a span the rules are tried in this order:

    1. ``"langchain"`` occurs in the ``input.value`` attribute -> LANGCHAIN
    2. the ``smolagents.max_steps`` attribute is truthy -> SMOLAGENTS
    3. the span's ``name`` equals ``"response"`` -> OPENAI

    Raises:
        ValueError: If no span matches any rule.
    """
    for span in trace:
        attributes = span.get("attributes", {})

        if "langchain" in attributes.get("input.value", ""):
            logger.info("Agent type is LANGCHAIN")
            return AgentType.LANGCHAIN

        if attributes.get("smolagents.max_steps"):
            logger.info("Agent type is SMOLAGENTS")
            return AgentType.SMOLAGENTS

        # Extremely fragile: no dedicated key currently identifies the
        # agent type, so the span name is the only hint available here.
        if span.get("name") == "response":
            logger.info("Agent type is OPENAI")
            return AgentType.OPENAI

    raise ValueError(
        "Could not determine agent type from trace, or agent type not supported"
    )
 class EvaluationResult(BaseModel):
     """Represents the result of evaluating a criterion"""
@@ -126,7 +101,7 @@ def verify_checkpoints(
     telemetry: List[Dict[str, Any]],
     checkpoints: List[CheckpointCriteria],
     model: str,
-    agent_type: AgentType,
 ) -> List[EvaluationResult]:
     """Verify each checkpoint against the telemetry data using LLM
     These checkpoints do not take the ground truth or hypothesis
@@ -134,8 +109,7 @@ def verify_checkpoints(
     the specific criteria mentioned.
     """
     results = []
-    evidence = extract_evidence(telemetry, agent_type)
     print(evidence)
     for checkpoint in checkpoints:
         criteria = checkpoint.criteria

 from litellm import completion
 from textwrap import dedent
 from pydantic import BaseModel, ConfigDict
+from surf_spot_finder.evaluation.telemetry import TelemetryProcessor
 from surf_spot_finder.evaluation.test_case import CheckpointCriteria
 class EvaluationResult(BaseModel):
     """Represents the result of evaluating a criterion"""
     telemetry: List[Dict[str, Any]],
     checkpoints: List[CheckpointCriteria],
     model: str,
+    processor: TelemetryProcessor,
 ) -> List[EvaluationResult]:
     """Verify each checkpoint against the telemetry data using LLM
     These checkpoints do not take the ground truth or hypothesis
     the specific criteria mentioned.
     """
     results = []
+    evidence = processor.extract_evidence(telemetry)
     print(evidence)
     for checkpoint in checkpoints:
         criteria = checkpoint.criteria