github-actions[bot] committed
Commit 763ec84 · 1 Parent(s): 5ddb0f0

Sync with https://github.com/mozilla-ai/any-agent-demo

.streamlit/config.toml ADDED
@@ -0,0 +1,6 @@
+ [theme]
+ primaryColor="#00d230"
+ backgroundColor="#FFFFFF"
+ secondaryBackgroundColor="#F0F2F6"
+ textColor="#161616"
+ font="sans serif"
Dockerfile CHANGED
@@ -1,4 +1,4 @@
- FROM python:3.9-slim
+ FROM python:3.12-slim

  WORKDIR /app

@@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y \
  && rm -rf /var/lib/apt/lists/*

  COPY requirements.txt ./
- COPY src/ ./src/
+ COPY . ./demo/

  RUN pip3 install -r requirements.txt

@@ -18,4 +18,4 @@ EXPOSE 8501

  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md CHANGED
@@ -1,19 +1,20 @@
  ---
- title: Any Agent Demo
- emoji: 🚀
- colorFrom: red
- colorTo: red
+ title: Surf Spot Finder
+ emoji: 🏄🏼‍♂️
+ colorFrom: blue
+ colorTo: indigo
  sdk: docker
  app_port: 8501
  tags:
  - streamlit
  pinned: false
- short_description: Streamlit template space
+ short_description: Find a surf spot near you
+ license: apache-2.0
  ---

  # Welcome to Streamlit!

- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
+ Edit `/src/app.py` to customize this app to your heart's desire. :heart:

  If you have any questions, check out our [documentation](https://docs.streamlit.io) and [community
  forums](https://discuss.streamlit.io).
app.py ADDED
@@ -0,0 +1,150 @@
+ from components.sidebar import ssf_sidebar
+ from constants import DEFAULT_TOOLS
+ import streamlit as st
+ import asyncio
+ import nest_asyncio
+ from services.agent import (
+     configure_agent,
+     display_evaluation_results,
+     display_output,
+     evaluate_agent,
+     run_agent,
+ )
+
+ nest_asyncio.apply()
+
+ # Set page config
+ st.set_page_config(page_title="Surf Spot Finder", page_icon="🏄", layout="wide")
+
+ # Allow a user to resize the sidebar to take up most of the screen to make editing eval cases easier
+ st.markdown(
+     """
+     <style>
+     /* When sidebar is expanded, adjust main content */
+     section[data-testid="stSidebar"][aria-expanded="true"] {
+         max-width: 99% !important;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True,
+ )
+
+ with st.sidebar:
+     user_inputs = ssf_sidebar()
+     is_valid = user_inputs is not None
+     run_button = st.button("Run Agent 🤖", disabled=not is_valid, type="primary")
+
+
+ # Main content
+ async def main():
+     # Handle agent execution button click
+     if run_button:
+         agent, agent_config = await configure_agent(user_inputs)
+         agent_trace = await run_agent(agent, agent_config)
+
+         await display_output(agent_trace)
+
+         evaluation_result = await evaluate_agent(agent_config, agent_trace)
+
+         await display_evaluation_results(evaluation_result)
+     else:
+         st.title("🏄 Surf Spot Finder")
+         st.markdown(
+             "Find the best surfing spots based on your location and preferences! [Github Repo](https://github.com/mozilla-ai/surf-spot-finder)"
+         )
+         st.info(
+             "👈 Configure your search parameters in the sidebar and click Run to start!"
+         )
+
+         # Display tools in a more organized way
+         st.markdown("### 🛠️ Available Tools")
+
+         st.markdown("""
+         The AI Agent built for this project has a few tools available for use in order to find the perfect surf spot.
+         The agent is given the freedom to use (or not use) these tools in order to accomplish the task.
+         """)
+
+         weather_tools = [
+             tool
+             for tool in DEFAULT_TOOLS
+             if "forecast" in tool.__name__ or "weather" in tool.__name__
+         ]
+         for tool in weather_tools:
+             with st.expander(f"🌤️ {tool.__name__}"):
+                 st.markdown(tool.__doc__ or "No description available")
+         location_tools = [
+             tool
+             for tool in DEFAULT_TOOLS
+             if "lat" in tool.__name__
+             or "lon" in tool.__name__
+             or "area" in tool.__name__
+         ]
+         for tool in location_tools:
+             with st.expander(f"📍 {tool.__name__}"):
+                 st.markdown(tool.__doc__ or "No description available")
+
+         web_tools = [
+             tool
+             for tool in DEFAULT_TOOLS
+             if "web" in tool.__name__ or "search" in tool.__name__
+         ]
+         for tool in web_tools:
+             with st.expander(f"🌐 {tool.__name__}"):
+                 st.markdown(tool.__doc__ or "No description available")
+
+         # add a check that all tools were listed
+         if len(weather_tools) + len(location_tools) + len(web_tools) != len(
+             DEFAULT_TOOLS
+         ):
+             st.warning(
+                 "Some tools are not listed. Please check the code for more details."
+             )
+
+         # Add Custom Evaluation explanation section
+         st.markdown("### 📊 Custom Evaluation")
+         st.markdown("""
+         The Surf Spot Finder includes a powerful evaluation system that allows you to customize how the agent's performance is assessed.
+         You can find these settings in the sidebar under the "Custom Evaluation" expander.
+         """)
+
+         with st.expander("Learn more about Custom Evaluation"):
+             st.markdown("""
+             #### What is Custom Evaluation?
+             The Custom Evaluation feature uses an LLM-as-a-Judge approach to evaluate how well the agent performs its task.
+             An LLM will be given the complete agent trace (not just the final answer), and will assess the agent's performance based on the criteria you set.
+             You can customize:
+
+             - **Evaluation Model**: Choose which LLM should act as the judge
+             - **Evaluation Criteria**: Define specific checkpoints that the agent should meet
+             - **Scoring System**: Assign points to each criterion
+
+             #### How to Use Custom Evaluation
+
+             1. **Select an Evaluation Model**: Choose which LLM you want to use as the judge
+             2. **Edit Checkpoints**: Use the data editor to:
+                - Add new evaluation criteria
+                - Modify existing criteria
+                - Adjust point values
+                - Remove criteria you don't want to evaluate
+
+             #### Example Criteria
+             You can evaluate things like:
+             - Tool usage and success
+             - Order of operations
+             - Quality of final recommendations
+             - Response completeness
+             - Number of steps taken
+
+             #### Tips for Creating Good Evaluation Criteria
+             - Be specific about what you want to evaluate
+             - Use clear, unambiguous language
+             - Consider both process (how the agent works) and outcome (what it produces)
+             - Assign appropriate point values based on importance
+
+             The evaluation results will be displayed after each agent run, showing how well the agent met your custom criteria.
+             """)
+
+
+ if __name__ == "__main__":
+     loop = asyncio.new_event_loop()
+     loop.run_until_complete(main())
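Note: the Custom Evaluation help text above describes point-based scoring of LLM-judged checkpoints. A minimal sketch of that arithmetic, separate from the demo's actual implementation in `services/agent.py` below (the checkpoint values here are made-up examples):

```python
# Illustrative only: each judged checkpoint carries points and a pass/fail verdict;
# the overall score is the passed share of all available points.
checkpoint_results = [
    {"criteria": "Considered at least three surf spot options", "points": 1, "passed": True},
    {"criteria": "Gathered wave forecasts for each spot", "points": 1, "passed": False},
]

total_points = sum(c["points"] for c in checkpoint_results)
passed_points = sum(c["points"] for c in checkpoint_results if c["passed"])
print(f"Score: {passed_points}/{total_points} ({passed_points / total_points:.0%})")
```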
components/__init__.py ADDED
File without changes
components/agent_status.py ADDED
@@ -0,0 +1,47 @@
+ from any_agent import AnyAgent
+ from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+ from collections.abc import Sequence
+ from typing import TYPE_CHECKING, Callable
+
+ from opentelemetry.sdk.trace.export import (
+     SpanExporter,
+     SpanExportResult,
+ )
+
+ from any_agent import AgentFramework
+
+ from any_agent.tracing import TracingProcessor
+ from any_agent.tracing.trace import AgentSpan
+
+ if TYPE_CHECKING:
+     from opentelemetry.sdk.trace import ReadableSpan
+
+
+ class StreamlitExporter(SpanExporter):
+     """Build an `AgentTrace` and export to the different outputs."""
+
+     def __init__(  # noqa: D107
+         self, agent_framework: AgentFramework, callback: Callable
+     ):
+         self.agent_framework = agent_framework
+         self.processor: TracingProcessor | None = TracingProcessor.create(
+             agent_framework
+         )
+         self.callback = callback
+
+     def export(self, spans: Sequence["ReadableSpan"]) -> SpanExportResult:  # noqa: D102
+         if not self.processor:
+             return SpanExportResult.SUCCESS
+
+         for readable_span in spans:
+             # Check if this span belongs to our run
+             span = AgentSpan.from_readable_span(readable_span)
+             self.callback(span)
+
+         return SpanExportResult.SUCCESS
+
+
+ def export_logs(agent: AnyAgent, callback: Callable) -> None:
+     exporter = StreamlitExporter(agent.framework, callback)
+     span_processor = SimpleSpanProcessor(exporter)
+     agent._tracer_provider.add_span_processor(span_processor)
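Note: `export_logs` wires a `StreamlitExporter` into the agent's tracer provider so every exported span is handed to a callback; the demo's real callback is `update_span` in `services/agent.py`. A minimal sketch with a hypothetical stdout callback, assuming an already-created `AnyAgent` instance:

```python
from any_agent import AnyAgent
from any_agent.tracing.trace import AgentSpan
from components.agent_status import export_logs


def attach_stdout_logging(agent: AnyAgent) -> None:
    """Hypothetical helper: print each exported span's name instead of updating a widget."""

    def print_span(span: AgentSpan) -> None:
        print(f"span finished: {span.name}")

    export_logs(agent, print_span)
```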
components/inputs.py ADDED
@@ -0,0 +1,152 @@
+ from datetime import datetime, timedelta
+ import json
+ import requests
+ import streamlit as st
+ from any_agent import AgentFramework
+ from any_agent.tracing.trace import _is_tracing_supported
+ from any_agent.evaluation import EvaluationCase
+ from any_agent.evaluation.schemas import CheckpointCriteria
+ import pandas as pd
+ from constants import DEFAULT_EVALUATION_CASE, MODEL_OPTIONS
+ import copy
+
+ from pydantic import BaseModel, ConfigDict
+
+
+ class UserInputs(BaseModel):
+     model_config = ConfigDict(extra="forbid")
+     model_id: str
+     location: str
+     max_driving_hours: int
+     date: datetime
+     framework: str
+     evaluation_case: EvaluationCase
+     run_evaluation: bool
+
+
+ @st.cache_resource
+ def get_area(area_name: str) -> dict:
+     """Get the area from Nominatim.
+
+     Uses the [Nominatim API](https://nominatim.org/release-docs/develop/api/Search/).
+
+     Args:
+         area_name (str): The name of the area.
+
+     Returns:
+         dict: The area found.
+     """
+     response = requests.get(
+         f"https://nominatim.openstreetmap.org/search?q={area_name}&format=json",
+         headers={"User-Agent": "Mozilla/5.0"},
+         timeout=5,
+     )
+     response.raise_for_status()
+     response_json = json.loads(response.content.decode())
+     return response_json
+
+
+ def get_user_inputs() -> UserInputs:
+     default_val = "Los Angeles California, US"
+
+     location = st.text_input("Enter a location", value=default_val)
+     if location:
+         location_check = get_area(location)
+         if not location_check:
+             st.error("❌ Invalid location")
+
+     max_driving_hours = st.number_input(
+         "Enter the maximum driving hours", min_value=1, value=2
+     )
+
+     col_date, col_time = st.columns([2, 1])
+     with col_date:
+         date = st.date_input(
+             "Select a date in the future", value=datetime.now() + timedelta(days=1)
+         )
+     with col_time:
+         # default to 9am
+         time = st.selectbox(
+             "Select a time",
+             [datetime.strptime(f"{i:02d}:00", "%H:%M").time() for i in range(24)],
+             index=9,
+         )
+     date = datetime.combine(date, time)
+
+     supported_frameworks = [
+         framework for framework in AgentFramework if _is_tracing_supported(framework)
+     ]
+
+     framework = st.selectbox(
+         "Select the agent framework to use",
+         supported_frameworks,
+         index=2,
+         format_func=lambda x: x.name,
+     )
+
+     model_id = st.selectbox(
+         "Select the model to use",
+         MODEL_OPTIONS,
+         index=1,
+         format_func=lambda x: "/".join(x.split("/")[-3:]),
+     )
+
+     # Add evaluation case section
+     with st.expander("Custom Evaluation"):
+         evaluation_model_id = st.selectbox(
+             "Select the model to use for LLM-as-a-Judge evaluation",
+             MODEL_OPTIONS,
+             index=2,
+             format_func=lambda x: "/".join(x.split("/")[-3:]),
+         )
+         evaluation_case = copy.deepcopy(DEFAULT_EVALUATION_CASE)
+         evaluation_case.llm_judge = evaluation_model_id
+         # make this an editable json section
+         # convert the checkpoints to a df series so that it can be edited
+         checkpoints = evaluation_case.checkpoints
+         checkpoints_df = pd.DataFrame(
+             [checkpoint.model_dump() for checkpoint in checkpoints]
+         )
+         checkpoints_df = st.data_editor(
+             checkpoints_df,
+             column_config={
+                 "points": st.column_config.NumberColumn(label="Points"),
+                 "criteria": st.column_config.TextColumn(label="Criteria"),
+             },
+             hide_index=True,
+             num_rows="dynamic",
+         )
+         # for each checkpoint, convert it back to a CheckpointCriteria object
+         new_ckpts = []
+
+         # don't let a user add more than 20 checkpoints
+         if len(checkpoints_df) > 20:
+             st.error(
+                 "You can only add up to 20 checkpoints for the purpose of this demo."
+             )
+             checkpoints_df = checkpoints_df[:20]
+
+         for _, row in checkpoints_df.iterrows():
+             if row["criteria"] == "":
+                 continue
+             try:
+                 # Don't let people write essays for criteria in this demo
+                 if len(row["criteria"].split(" ")) > 100:
+                     raise ValueError("Criteria is too long")
+                 new_crit = CheckpointCriteria(
+                     criteria=row["criteria"], points=row["points"]
+                 )
+                 new_ckpts.append(new_crit)
+             except Exception as e:
+                 st.error(f"Error creating checkpoint: {e}")
+         evaluation_case.checkpoints = new_ckpts
+
+     return UserInputs(
+         model_id=model_id,
+         location=location,
+         max_driving_hours=max_driving_hours,
+         date=date,
+         framework=framework,
+         evaluation_case=evaluation_case,
+         run_evaluation=st.checkbox("Run Evaluation", value=True),
+     )
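Note: `get_area` backs the sidebar's location validation: an empty Nominatim result marks the input as invalid. A small sketch of the same check outside the Streamlit UI (the location string is just an example, and `st.cache_resource` is assumed to degrade gracefully outside a Streamlit session):

```python
from components.inputs import get_area


def is_valid_location(location: str) -> bool:
    """Mirror the sidebar check: valid if Nominatim returns at least one match."""
    return bool(get_area(location))


if __name__ == "__main__":
    print(is_valid_location("Los Angeles California, US"))
```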
components/sidebar.py ADDED
@@ -0,0 +1,9 @@
+ from components.inputs import UserInputs, get_user_inputs
+ import streamlit as st
+
+
+ def ssf_sidebar() -> UserInputs:
+     st.markdown("### Configuration")
+     st.markdown("Built using [Any-Agent](https://github.com/mozilla-ai/any-agent)")
+     user_inputs = get_user_inputs()
+     return user_inputs
constants.py ADDED
@@ -0,0 +1,74 @@
+ import os
+
+ from any_agent.evaluation import EvaluationCase
+ from surf_spot_finder.tools import (
+     get_area_lat_lon,
+     get_wave_forecast,
+     get_wind_forecast,
+ )
+ from any_agent.logging import logger
+ from any_agent.tools.web_browsing import search_web, visit_webpage, search_tavily
+
+ MODEL_OPTIONS = [
+     # "huggingface/novita/deepseek-ai/DeepSeek-V3",
+     # "huggingface/novita/meta-llama/Llama-3.3-70B-Instruct",
+     "openai/gpt-4.1-nano",
+     "openai/gpt-4.1-mini",
+     "openai/gpt-4o",
+     "gemini/gemini-2.0-flash-lite",
+     "gemini/gemini-2.0-flash",
+     # "huggingface/Qwen/Qwen3-32B",  # right now throwing an internal error, but novita qwen isn't supporting tool calling
+ ]
+
+ # Novita was the only HF based provider that worked.
+
+ # Huggingface API Provider Error:
+ # Must alternate between assistant/user, which meant that the 'tool' role made it puke
+
+
+ DEFAULT_EVALUATION_CASE = EvaluationCase(
+     llm_judge=MODEL_OPTIONS[0],
+     checkpoints=[
+         {
+             "criteria": "Check if the agent considered at least three surf spot options",
+             "points": 1,
+         },
+         {
+             "criteria": "Check if the agent gathered wind forecasts for each surf spot being evaluated.",
+             "points": 1,
+         },
+         {
+             "criteria": "Check if the agent gathered wave forecasts for each surf spot being evaluated.",
+             "points": 1,
+         },
+         {
+             "criteria": "Check if the agent used any web search tools to explore which surf spots should be considered",
+             "points": 1,
+         },
+         {
+             "criteria": "Check if the final answer contains any description about the weather (air temp, chance of rain, etc) at the chosen location",
+             "points": 1,
+         },
+         {
+             "criteria": "Check if the final answer includes one of the surf spots evaluated by tools",
+             "points": 1,
+         },
+         {
+             "criteria": "Check if the final answer includes information about some alternative surf spots if the user is not satisfied with the chosen one",
+             "points": 1,
+         },
+     ],
+ )
+
+
+ DEFAULT_TOOLS = [
+     get_wind_forecast,
+     get_wave_forecast,
+     get_area_lat_lon,
+     search_web,
+     visit_webpage,
+ ]
+ if os.getenv("TAVILY_API_KEY"):
+     DEFAULT_TOOLS.append(search_tavily)
+ else:
+     logger.warning("TAVILY_API_KEY not set, skipping Tavily search tool")
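Note: `DEFAULT_EVALUATION_CASE` is the template that the sidebar deep-copies and exposes as editable criteria/points rows. A hedged sketch of extending it programmatically with the same `CheckpointCriteria` model the editor round-trips through (the added criterion is only an example):

```python
import copy

from any_agent.evaluation.schemas import CheckpointCriteria
from constants import DEFAULT_EVALUATION_CASE

# Work on a copy, as components/inputs.py does, so the module-level default stays untouched.
case = copy.deepcopy(DEFAULT_EVALUATION_CASE)
case.checkpoints.append(
    CheckpointCriteria(
        criteria="Check if the final answer mentions the expected water temperature",  # example criterion
        points=1,
    )
)
```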
requirements.txt CHANGED
@@ -1,3 +1,5 @@
- altair
- pandas
- streamlit
+ streamlit
+ openai-agents>=0.0.14
+ any-agent[all]==0.15.0
+ surf-spot-finder @ git+https://github.com/mozilla-ai/surf-spot-finder@7953016f71e7a96870233524b7a75878bd38f214
+ nest_asyncio
services/__init__.py ADDED
File without changes
services/agent.py ADDED
@@ -0,0 +1,227 @@
+ import json
+ from components.inputs import UserInputs
+ from constants import DEFAULT_TOOLS
+ from components.agent_status import export_logs
+ import streamlit as st
+ from surf_spot_finder.config import Config
+ from any_agent import AgentConfig, AnyAgent, TracingConfig, AgentFramework
+ from any_agent.tracing.trace import AgentTrace, AgentSpan
+ from any_agent.tracing.otel_types import StatusCode
+ from any_agent.evaluation import evaluate, TraceEvaluationResult
+
+
+ async def display_evaluation_results(result: TraceEvaluationResult):
+     if result.ground_truth_result is not None:
+         all_results = [*result.checkpoint_results, result.ground_truth_result]
+     else:
+         all_results = result.checkpoint_results
+
+     # Create columns for better layout
+     col1, col2 = st.columns(2)
+
+     with col1:
+         st.markdown("#### Criteria Results")
+         for checkpoint in all_results:
+             if checkpoint.passed:
+                 st.success(f"✅ {checkpoint.criteria}")
+             else:
+                 st.error(f"❌ {checkpoint.criteria}")
+
+     with col2:
+         st.markdown("#### Overall Score")
+         total_points = sum([result.points for result in all_results])
+         if total_points == 0:
+             msg = "Total points is 0, cannot calculate score."
+             raise ValueError(msg)
+         passed_points = sum([result.points for result in all_results if result.passed])
+
+         # Create a nice score display
+         st.markdown(f"### {passed_points}/{total_points}")
+         percentage = (passed_points / total_points) * 100
+         st.progress(percentage / 100)
+         st.markdown(f"**{percentage:.1f}%**")
+
+
+ async def evaluate_agent(
+     config: Config, agent_trace: AgentTrace
+ ) -> TraceEvaluationResult:
+     assert (
+         len(config.evaluation_cases) == 1
+     ), "Only one evaluation case is supported in the demo"
+     st.markdown("### 📊 Evaluation Results")
+
+     with st.spinner("Evaluating results..."):
+         case = config.evaluation_cases[0]
+         result: TraceEvaluationResult = evaluate(
+             evaluation_case=case,
+             trace=agent_trace,
+             agent_framework=config.framework,
+         )
+     return result
+
+
+ async def configure_agent(user_inputs: UserInputs) -> tuple[AnyAgent, Config]:
+     if "huggingface" in user_inputs.model_id:
+         model_args = {
+             "extra_headers": {"X-HF-Bill-To": "mozilla-ai"},
+             "temperature": 0.0,
+         }
+     else:
+         model_args = {}
+
+     if user_inputs.framework == AgentFramework.AGNO:
+         agent_args = {"tool_call_limit": 20}
+     else:
+         agent_args = {}
+
+     agent_config = AgentConfig(
+         model_id=user_inputs.model_id,
+         model_args=model_args,
+         agent_args=agent_args,
+         tools=DEFAULT_TOOLS,
+     )
+
+     config = Config(
+         location=user_inputs.location,
+         max_driving_hours=user_inputs.max_driving_hours,
+         date=user_inputs.date,
+         framework=user_inputs.framework,
+         main_agent=agent_config,
+         managed_agents=[],
+         evaluation_cases=[user_inputs.evaluation_case],
+     )
+
+     agent = await AnyAgent.create_async(
+         agent_framework=config.framework,
+         agent_config=config.main_agent,
+         managed_agents=config.managed_agents,
+         tracing=TracingConfig(console=True, cost_info=True),
+     )
+     return agent, config
+
+
+ async def display_output(agent_trace: AgentTrace):
+     # Display the agent trace in a more organized way
+     with st.expander("### 🧩 Agent Trace"):
+         for span in agent_trace.spans:
+             # Header with name and status
+             col1, col2 = st.columns([4, 1])
+             with col1:
+                 st.markdown(f"**{span.name}**")
+                 if span.attributes:
+                     # st.json(span.attributes, expanded=False)
+                     if "input.value" in span.attributes:
+                         try:
+                             input_value = json.loads(span.attributes["input.value"])
+                             if isinstance(input_value, list) and len(input_value) > 0:
+                                 st.write(f"Input: {input_value[-1]}")
+                             else:
+                                 st.write(f"Input: {input_value}")
+                         except Exception:  # noqa: E722
+                             st.write(f"Input: {span.attributes['input.value']}")
+                     if "output.value" in span.attributes:
+                         try:
+                             output_value = json.loads(span.attributes["output.value"])
+                             if isinstance(output_value, list) and len(output_value) > 0:
+                                 st.write(f"Output: {output_value[-1]}")
+                             else:
+                                 st.write(f"Output: {output_value}")
+                         except Exception:  # noqa: E722
+                             st.write(f"Output: {span.attributes['output.value']}")
+             with col2:
+                 status_color = (
+                     "green" if span.status.status_code == StatusCode.OK else "red"
+                 )
+                 st.markdown(
+                     f"<span style='color: {status_color}'>● {span.status.status_code.name}</span>",
+                     unsafe_allow_html=True,
+                 )
+
+     with st.expander("### 🏄 Results", expanded=True):
+         time_col, cost_col, tokens_col = st.columns(3)
+         duration = agent_trace.duration.total_seconds()
+         with time_col:
+             st.info(f"⏱️ Execution Time: {duration:0.2f} seconds")
+         with cost_col:
+             st.info(f"💰 Estimated Cost: ${agent_trace.cost.total_cost:.6f}")
+         with tokens_col:
+             st.info(f"📦 Total Tokens: {agent_trace.usage.total_tokens:,}")
+         st.markdown("#### Final Output")
+         st.info(agent_trace.final_output)
+
+
+ async def run_agent(agent, config) -> AgentTrace:
+     st.markdown("#### 🔍 Running Surf Spot Finder with query")
+
+     query = config.input_prompt_template.format(
+         LOCATION=config.location,
+         MAX_DRIVING_HOURS=config.max_driving_hours,
+         DATE=config.date,
+     )
+
+     st.code(query, language="text")
+     kwargs = {}
+     if (
+         config.framework == AgentFramework.OPENAI
+         or config.framework == AgentFramework.TINYAGENT
+     ):
+         kwargs["max_turns"] = 20
+     elif config.framework == AgentFramework.SMOLAGENTS:
+         kwargs["max_steps"] = 20
+     if config.framework == AgentFramework.LANGCHAIN:
+         from langchain_core.runnables import RunnableConfig
+
+         kwargs["config"] = RunnableConfig(recursion_limit=20)
+     elif config.framework == AgentFramework.GOOGLE:
+         from google.adk.agents.run_config import RunConfig
+
+         kwargs["run_config"] = RunConfig(max_llm_calls=20)
+
+     with st.status("Agent is running...", expanded=False, state="running") as status:
+
+         def update_span(span: AgentSpan):
+             # Process input value
+             input_value = span.attributes.get("input.value", "")
+             if input_value:
+                 try:
+                     parsed_input = json.loads(input_value)
+                     if isinstance(parsed_input, list) and len(parsed_input) > 0:
+                         input_value = str(parsed_input[-1])
+                 except Exception:
+                     pass
+
+             # Process output value
+             output_value = span.attributes.get("output.value", "")
+             if output_value:
+                 try:
+                     parsed_output = json.loads(output_value)
+                     if isinstance(parsed_output, list) and len(parsed_output) > 0:
+                         output_value = str(parsed_output[-1])
+                 except Exception:
+                     pass
+
+             # Truncate long values
+             max_length = 800
+             if len(input_value) > max_length:
+                 input_value = f"[Truncated]...{input_value[-max_length:]}"
+             if len(output_value) > max_length:
+                 output_value = f"[Truncated]...{output_value[-max_length:]}"
+
+             # Create a cleaner message format
+             if input_value or output_value:
+                 message = f"Step: {span.name}\n"
+                 if input_value:
+                     message += f"Input: {input_value}\n"
+                 if output_value:
+                     message += f"Output: {output_value}"
+             else:
+                 message = f"Step: {span.name}\n{span}"
+
+             status.update(label=message, expanded=False, state="running")
+
+         export_logs(agent, update_span)
+         agent_trace: AgentTrace = await agent.run_async(query, **kwargs)
+         status.update(label="Finished!", expanded=False, state="complete")
+
+     agent.exit()
+     return agent_trace
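Note: end to end, the demo's run path is `configure_agent` (build the `AgentConfig`/`Config` pair and create the agent), `run_agent` (stream spans while awaiting `run_async`), then `evaluate_agent` (judge the resulting trace). A condensed, Streamlit-free sketch of that flow using the same any-agent calls shown above; the framework choice and prompt are placeholders, and the optional `model_args`/`agent_args` are omitted:

```python
import asyncio

from any_agent import AgentConfig, AgentFramework, AnyAgent, TracingConfig
from any_agent.evaluation import evaluate
from constants import DEFAULT_EVALUATION_CASE, DEFAULT_TOOLS, MODEL_OPTIONS


async def run_once() -> None:
    framework = AgentFramework.TINYAGENT  # placeholder; any supported framework should work
    agent = await AnyAgent.create_async(
        agent_framework=framework,
        agent_config=AgentConfig(model_id=MODEL_OPTIONS[0], tools=DEFAULT_TOOLS),
        tracing=TracingConfig(console=True, cost_info=True),
    )
    trace = await agent.run_async("Find a surf spot near Los Angeles for tomorrow.")
    result = evaluate(
        evaluation_case=DEFAULT_EVALUATION_CASE,
        trace=trace,
        agent_framework=framework,
    )
    print(trace.final_output)
    print(result)
    agent.exit()


if __name__ == "__main__":
    asyncio.run(run_once())
```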