Commit 763ec84 · github-actions[bot] committed · 1 Parent(s): 5ddb0f0

Sync with https://github.com/mozilla-ai/any-agent-demo

Files changed:
- .streamlit/config.toml +6 -0
- Dockerfile +3 -3
- README.md +7 -6
- app.py +150 -0
- components/__init__.py +0 -0
- components/agent_status.py +47 -0
- components/inputs.py +152 -0
- components/sidebar.py +9 -0
- constants.py +74 -0
- requirements.txt +5 -3
- services/__init__.py +0 -0
- services/agent.py +227 -0
.streamlit/config.toml
ADDED
@@ -0,0 +1,6 @@
+[theme]
+primaryColor="#00d230"
+backgroundColor="#FFFFFF"
+secondaryBackgroundColor="#F0F2F6"
+textColor="#161616"
+font="sans serif"
Dockerfile
CHANGED
@@ -1,4 +1,4 @@
-FROM python:3.
+FROM python:3.12-slim
 
 WORKDIR /app
 
@@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y \
     && rm -rf /var/lib/apt/lists/*
 
 COPY requirements.txt ./
-COPY
+COPY . ./demo/
 
 RUN pip3 install -r requirements.txt
 
@@ -18,4 +18,4 @@ EXPOSE 8501
 
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
-ENTRYPOINT ["streamlit", "run", "
+ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md
CHANGED
@@ -1,19 +1,20 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Surf Spot Finder
+emoji: 🏄🏼‍♀️
+colorFrom: blue
+colorTo: indigo
 sdk: docker
 app_port: 8501
 tags:
 - streamlit
 pinned: false
-short_description:
+short_description: Find a surf spot near you
+license: apache-2.0
 ---
 
 # Welcome to Streamlit!
 
-Edit `/src/
+Edit `/src/app.py` to customize this app to your heart's desire. :heart:
 
 If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
 forums](https://discuss.streamlit.io).
app.py
ADDED
@@ -0,0 +1,150 @@
+from components.sidebar import ssf_sidebar
+from constants import DEFAULT_TOOLS
+import streamlit as st
+import asyncio
+import nest_asyncio
+from services.agent import (
+    configure_agent,
+    display_evaluation_results,
+    display_output,
+    evaluate_agent,
+    run_agent,
+)
+
+nest_asyncio.apply()
+
+# Set page config
+st.set_page_config(page_title="Surf Spot Finder", page_icon="🏄", layout="wide")
+
+# Allow a user to resize the sidebar to take up most of the screen to make editing eval cases easier
+st.markdown(
+    """
+    <style>
+    /* When sidebar is expanded, adjust main content */
+    section[data-testid="stSidebar"][aria-expanded="true"] {
+        max-width: 99% !important;
+    }
+    </style>
+    """,
+    unsafe_allow_html=True,
+)
+
+with st.sidebar:
+    user_inputs = ssf_sidebar()
+    is_valid = user_inputs is not None
+    run_button = st.button("Run Agent 🤖", disabled=not is_valid, type="primary")
+
+
+# Main content
+async def main():
+    # Handle agent execution button click
+    if run_button:
+        agent, agent_config = await configure_agent(user_inputs)
+        agent_trace = await run_agent(agent, agent_config)
+
+        await display_output(agent_trace)
+
+        evaluation_result = await evaluate_agent(agent_config, agent_trace)
+
+        await display_evaluation_results(evaluation_result)
+    else:
+        st.title("🏄 Surf Spot Finder")
+        st.markdown(
+            "Find the best surfing spots based on your location and preferences! [Github Repo](https://github.com/mozilla-ai/surf-spot-finder)"
+        )
+        st.info(
+            "👈 Configure your search parameters in the sidebar and click Run to start!"
+        )
+
+        # Display tools in a more organized way
+        st.markdown("### 🛠️ Available Tools")
+
+        st.markdown("""
+        The AI Agent built for this project has a few tools available for use in order to find the perfect surf spot.
+        The agent is given the freedom to use (or not use) these tools in order to accomplish the task.
+        """)
+
+        weather_tools = [
+            tool
+            for tool in DEFAULT_TOOLS
+            if "forecast" in tool.__name__ or "weather" in tool.__name__
+        ]
+        for tool in weather_tools:
+            with st.expander(f"🌤️ {tool.__name__}"):
+                st.markdown(tool.__doc__ or "No description available")
+        location_tools = [
+            tool
+            for tool in DEFAULT_TOOLS
+            if "lat" in tool.__name__
+            or "lon" in tool.__name__
+            or "area" in tool.__name__
+        ]
+        for tool in location_tools:
+            with st.expander(f"📍 {tool.__name__}"):
+                st.markdown(tool.__doc__ or "No description available")
+
+        web_tools = [
+            tool
+            for tool in DEFAULT_TOOLS
+            if "web" in tool.__name__ or "search" in tool.__name__
+        ]
+        for tool in web_tools:
+            with st.expander(f"🌐 {tool.__name__}"):
+                st.markdown(tool.__doc__ or "No description available")
+
+        # add a check that all tools were listed
+        if len(weather_tools) + len(location_tools) + len(web_tools) != len(
+            DEFAULT_TOOLS
+        ):
+            st.warning(
+                "Some tools are not listed. Please check the code for more details."
+            )
+
+        # Add Custom Evaluation explanation section
+        st.markdown("### 📊 Custom Evaluation")
+        st.markdown("""
+        The Surf Spot Finder includes a powerful evaluation system that allows you to customize how the agent's performance is assessed.
+        You can find these settings in the sidebar under the "Custom Evaluation" expander.
+        """)
+
+        with st.expander("Learn more about Custom Evaluation"):
+            st.markdown("""
+            #### What is Custom Evaluation?
+            The Custom Evaluation feature uses an LLM-as-a-Judge approach to evaluate how well the agent performs its task.
+            An LLM will be given the complete agent trace (not just the final answer), and will assess the agent's performance based on the criteria you set.
+            You can customize:
+
+            - **Evaluation Model**: Choose which LLM should act as the judge
+            - **Evaluation Criteria**: Define specific checkpoints that the agent should meet
+            - **Scoring System**: Assign points to each criterion
+
+            #### How to Use Custom Evaluation
+
+            1. **Select an Evaluation Model**: Choose which LLM you want to use as the judge
+            2. **Edit Checkpoints**: Use the data editor to:
+               - Add new evaluation criteria
+               - Modify existing criteria
+               - Adjust point values
+               - Remove criteria you don't want to evaluate
+
+            #### Example Criteria
+            You can evaluate things like:
+            - Tool usage and success
+            - Order of operations
+            - Quality of final recommendations
+            - Response completeness
+            - Number of steps taken
+
+            #### Tips for Creating Good Evaluation Criteria
+            - Be specific about what you want to evaluate
+            - Use clear, unambiguous language
+            - Consider both process (how the agent works) and outcome (what it produces)
+            - Assign appropriate point values based on importance
+
+            The evaluation results will be displayed after each agent run, showing how well the agent met your custom criteria.
+            """)
+
+
+if __name__ == "__main__":
+    loop = asyncio.new_event_loop()
+    loop.run_until_complete(main())
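
Note: app.py applies nest_asyncio before creating its own event loop because Streamlit can already have a loop running in the script thread, and a plain run_until_complete() would otherwise fail. A minimal, self-contained sketch of the same pattern (the demo coroutine is illustrative, not part of this commit):

    import asyncio
    import nest_asyncio

    # Patch asyncio so run_until_complete() also works when another
    # event loop is already running (as it can be under Streamlit).
    nest_asyncio.apply()

    async def demo():
        await asyncio.sleep(0.1)
        return "done"

    loop = asyncio.new_event_loop()
    print(loop.run_until_complete(demo()))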
components/__init__.py
ADDED
File without changes
components/agent_status.py
ADDED
@@ -0,0 +1,47 @@
+from any_agent import AnyAgent
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Callable
+
+from opentelemetry.sdk.trace.export import (
+    SpanExporter,
+    SpanExportResult,
+)
+
+from any_agent import AgentFramework
+
+from any_agent.tracing import TracingProcessor
+from any_agent.tracing.trace import AgentSpan
+
+if TYPE_CHECKING:
+    from opentelemetry.sdk.trace import ReadableSpan
+
+
+class StreamlitExporter(SpanExporter):
+    """Build an `AgentTrace` and export to the different outputs."""
+
+    def __init__(  # noqa: D107
+        self, agent_framework: AgentFramework, callback: Callable
+    ):
+        self.agent_framework = agent_framework
+        self.processor: TracingProcessor | None = TracingProcessor.create(
+            agent_framework
+        )
+        self.callback = callback
+
+    def export(self, spans: Sequence["ReadableSpan"]) -> SpanExportResult:  # noqa: D102
+        if not self.processor:
+            return SpanExportResult.SUCCESS
+
+        for readable_span in spans:
+            # Check if this span belongs to our run
+            span = AgentSpan.from_readable_span(readable_span)
+            self.callback(span)
+
+        return SpanExportResult.SUCCESS
+
+
+def export_logs(agent: AnyAgent, callback: Callable) -> None:
+    exporter = StreamlitExporter(agent.framework, callback)
+    span_processor = SimpleSpanProcessor(exporter)
+    agent._tracer_provider.add_span_processor(span_processor)
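
Note: export_logs() attaches the StreamlitExporter to the agent's (private) _tracer_provider, so each finished span is pushed synchronously through the callback. A hypothetical wiring outside Streamlit, reusing only names introduced in this commit; the framework, model, and query are illustrative, and create_async() is called the same way as in services/agent.py:

    import asyncio

    from any_agent import AgentConfig, AgentFramework, AnyAgent, TracingConfig
    from any_agent.tracing.trace import AgentSpan
    from components.agent_status import export_logs
    from constants import DEFAULT_TOOLS

    def print_span(span: AgentSpan) -> None:
        # Invoked once per span by StreamlitExporter.export().
        print(f"step: {span.name}")

    async def demo() -> None:
        agent = await AnyAgent.create_async(
            agent_framework=AgentFramework.TINYAGENT,
            agent_config=AgentConfig(model_id="openai/gpt-4.1-nano", tools=DEFAULT_TOOLS),
            tracing=TracingConfig(console=False, cost_info=True),
        )
        export_logs(agent, print_span)
        trace = await agent.run_async("Find me a surf spot near Los Angeles")
        print(trace.final_output)

    asyncio.run(demo())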
components/inputs.py
ADDED
@@ -0,0 +1,152 @@
+from datetime import datetime, timedelta
+import json
+import requests
+import streamlit as st
+from any_agent import AgentFramework
+from any_agent.tracing.trace import _is_tracing_supported
+from any_agent.evaluation import EvaluationCase
+from any_agent.evaluation.schemas import CheckpointCriteria
+import pandas as pd
+from constants import DEFAULT_EVALUATION_CASE, MODEL_OPTIONS
+import copy
+
+from pydantic import BaseModel, ConfigDict
+
+
+class UserInputs(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+    model_id: str
+    location: str
+    max_driving_hours: int
+    date: datetime
+    framework: str
+    evaluation_case: EvaluationCase
+    run_evaluation: bool
+
+
+@st.cache_resource
+def get_area(area_name: str) -> dict:
+    """Get the area from Nominatim.
+
+    Uses the [Nominatim API](https://nominatim.org/release-docs/develop/api/Search/).
+
+    Args:
+        area_name (str): The name of the area.
+
+    Returns:
+        dict: The area found.
+    """
+    response = requests.get(
+        f"https://nominatim.openstreetmap.org/search?q={area_name}&format=json",
+        headers={"User-Agent": "Mozilla/5.0"},
+        timeout=5,
+    )
+    response.raise_for_status()
+    response_json = json.loads(response.content.decode())
+    return response_json
+
+
+def get_user_inputs() -> UserInputs:
+    default_val = "Los Angeles California, US"
+
+    location = st.text_input("Enter a location", value=default_val)
+    if location:
+        location_check = get_area(location)
+        if not location_check:
+            st.error("❌ Invalid location")
+
+    max_driving_hours = st.number_input(
+        "Enter the maximum driving hours", min_value=1, value=2
+    )
+
+    col_date, col_time = st.columns([2, 1])
+    with col_date:
+        date = st.date_input(
+            "Select a date in the future", value=datetime.now() + timedelta(days=1)
+        )
+    with col_time:
+        # default to 9am
+        time = st.selectbox(
+            "Select a time",
+            [datetime.strptime(f"{i:02d}:00", "%H:%M").time() for i in range(24)],
+            index=9,
+        )
+    date = datetime.combine(date, time)
+
+    supported_frameworks = [
+        framework for framework in AgentFramework if _is_tracing_supported(framework)
+    ]
+
+    framework = st.selectbox(
+        "Select the agent framework to use",
+        supported_frameworks,
+        index=2,
+        format_func=lambda x: x.name,
+    )
+
+    model_id = st.selectbox(
+        "Select the model to use",
+        MODEL_OPTIONS,
+        index=1,
+        format_func=lambda x: "/".join(x.split("/")[-3:]),
+    )
+
+    # Add evaluation case section
+    with st.expander("Custom Evaluation"):
+        evaluation_model_id = st.selectbox(
+            "Select the model to use for LLM-as-a-Judge evaluation",
+            MODEL_OPTIONS,
+            index=2,
+            format_func=lambda x: "/".join(x.split("/")[-3:]),
+        )
+        evaluation_case = copy.deepcopy(DEFAULT_EVALUATION_CASE)
+        evaluation_case.llm_judge = evaluation_model_id
+        # make this an editable json section
+        # convert the checkpoints to a df series so that it can be edited
+        checkpoints = evaluation_case.checkpoints
+        checkpoints_df = pd.DataFrame(
+            [checkpoint.model_dump() for checkpoint in checkpoints]
+        )
+        checkpoints_df = st.data_editor(
+            checkpoints_df,
+            column_config={
+                "points": st.column_config.NumberColumn(label="Points"),
+                "criteria": st.column_config.TextColumn(label="Criteria"),
+            },
+            hide_index=True,
+            num_rows="dynamic",
+        )
+        # for each checkpoint, convert it back to a CheckpointCriteria object
+        new_ckpts = []
+
+        # don't let a user add more than 20 checkpoints
+        if len(checkpoints_df) > 20:
+            st.error(
+                "You can only add up to 20 checkpoints for the purpose of this demo."
+            )
+            checkpoints_df = checkpoints_df[:20]
+
+        for _, row in checkpoints_df.iterrows():
+            if row["criteria"] == "":
+                continue
+            try:
+                # Don't let people write essays for criteria in this demo
+                if len(row["criteria"].split(" ")) > 100:
+                    raise ValueError("Criteria is too long")
+                new_crit = CheckpointCriteria(
+                    criteria=row["criteria"], points=row["points"]
+                )
+                new_ckpts.append(new_crit)
+            except Exception as e:
+                st.error(f"Error creating checkpoint: {e}")
+        evaluation_case.checkpoints = new_ckpts
+
+    return UserInputs(
+        model_id=model_id,
+        location=location,
+        max_driving_hours=max_driving_hours,
+        date=date,
+        framework=framework,
+        evaluation_case=evaluation_case,
+        run_evaluation=st.checkbox("Run Evaluation", value=True),
+    )
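
Note: get_area() doubles as the location-validity check above: Nominatim's search endpoint returns a JSON list of candidate places, and an empty list means the location was not found. A quick sketch of calling it directly (field names follow the Nominatim API docs; values vary):

    from components.inputs import get_area

    results = get_area("Los Angeles California, US")
    if results:
        # Each match includes, among other fields, "display_name", "lat", "lon".
        top = results[0]
        print(top["display_name"], top["lat"], top["lon"])
    else:
        print("Invalid location")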
components/sidebar.py
ADDED
@@ -0,0 +1,9 @@
+from components.inputs import UserInputs, get_user_inputs
+import streamlit as st
+
+
+def ssf_sidebar() -> UserInputs:
+    st.markdown("### Configuration")
+    st.markdown("Built using [Any-Agent](https://github.com/mozilla-ai/any-agent)")
+    user_inputs = get_user_inputs()
+    return user_inputs
constants.py
ADDED
@@ -0,0 +1,74 @@
+import os
+
+from any_agent.evaluation import EvaluationCase
+from surf_spot_finder.tools import (
+    get_area_lat_lon,
+    get_wave_forecast,
+    get_wind_forecast,
+)
+from any_agent.logging import logger
+from any_agent.tools.web_browsing import search_web, visit_webpage, search_tavily
+
+MODEL_OPTIONS = [
+    # "huggingface/novita/deepseek-ai/DeepSeek-V3",
+    # "huggingface/novita/meta-llama/Llama-3.3-70B-Instruct",
+    "openai/gpt-4.1-nano",
+    "openai/gpt-4.1-mini",
+    "openai/gpt-4o",
+    "gemini/gemini-2.0-flash-lite",
+    "gemini/gemini-2.0-flash",
+    # "huggingface/Qwen/Qwen3-32B",  # right now throwing an internal error, but novita qwen isn't supporting tool calling
+]
+
+# Novita was the only HF based provider that worked.
+
+# Huggingface API Provider Error:
+# Must alternate between assistant/user, which meant that the 'tool' role made it puke
+
+
+DEFAULT_EVALUATION_CASE = EvaluationCase(
+    llm_judge=MODEL_OPTIONS[0],
+    checkpoints=[
+        {
+            "criteria": "Check if the agent considered at least three surf spot options",
+            "points": 1,
+        },
+        {
+            "criteria": "Check if the agent gathered wind forecasts for each surf spot being evaluated.",
+            "points": 1,
+        },
+        {
+            "criteria": "Check if the agent gathered wave forecasts for each surf spot being evaluated.",
+            "points": 1,
+        },
+        {
+            "criteria": "Check if the agent used any web search tools to explore which surf spots should be considered",
+            "points": 1,
+        },
+        {
+            "criteria": "Check if the final answer contains any description about the weather (air temp, chance of rain, etc) at the chosen location",
+            "points": 1,
+        },
+        {
+            "criteria": "Check if the final answer includes one of the surf spots evaluated by tools",
+            "points": 1,
+        },
+        {
+            "criteria": "Check if the final answer includes information about some alternative surf spots if the user is not satisfied with the chosen one",
+            "points": 1,
+        },
+    ],
+)
+
+
+DEFAULT_TOOLS = [
+    get_wind_forecast,
+    get_wave_forecast,
+    get_area_lat_lon,
+    search_web,
+    visit_webpage,
+]
+if os.getenv("TAVILY_API_KEY"):
+    DEFAULT_TOOLS.append(search_tavily)
+else:
+    logger.warning("TAVILY_API_KEY not set, skipping Tavily search tool")
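
Note: checkpoints are plain criteria/points pairs, validated as CheckpointCriteria when edited in components/inputs.py, so extending the default case programmatically is just an append. A sketch with a made-up extra criterion:

    import copy

    from any_agent.evaluation.schemas import CheckpointCriteria
    from constants import DEFAULT_EVALUATION_CASE

    case = copy.deepcopy(DEFAULT_EVALUATION_CASE)
    # Hypothetical extra checkpoint, same shape as the defaults above.
    case.checkpoints.append(
        CheckpointCriteria(
            criteria="Check if the final answer mentions the expected wave height",
            points=1,
        )
    )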
requirements.txt
CHANGED
@@ -1,3 +1,5 @@
-
-
-
+streamlit
+openai-agents>=0.0.14
+any-agent[all]==0.15.0
+surf-spot-finder @ git+https://github.com/mozilla-ai/surf-spot-finder@7953016f71e7a96870233524b7a75878bd38f214
+nest_asyncio
services/__init__.py
ADDED
File without changes
services/agent.py
ADDED
@@ -0,0 +1,227 @@
+import json
+from components.inputs import UserInputs
+from constants import DEFAULT_TOOLS
+from components.agent_status import export_logs
+import streamlit as st
+from surf_spot_finder.config import Config
+from any_agent import AgentConfig, AnyAgent, TracingConfig, AgentFramework
+from any_agent.tracing.trace import AgentTrace, AgentSpan
+from any_agent.tracing.otel_types import StatusCode
+from any_agent.evaluation import evaluate, TraceEvaluationResult
+
+
+async def display_evaluation_results(result: TraceEvaluationResult):
+    if result.ground_truth_result is not None:
+        all_results = [*result.checkpoint_results, result.ground_truth_result]
+    else:
+        all_results = result.checkpoint_results
+
+    # Create columns for better layout
+    col1, col2 = st.columns(2)
+
+    with col1:
+        st.markdown("#### Criteria Results")
+        for checkpoint in all_results:
+            if checkpoint.passed:
+                st.success(f"✅ {checkpoint.criteria}")
+            else:
+                st.error(f"❌ {checkpoint.criteria}")
+
+    with col2:
+        st.markdown("#### Overall Score")
+        total_points = sum([result.points for result in all_results])
+        if total_points == 0:
+            msg = "Total points is 0, cannot calculate score."
+            raise ValueError(msg)
+        passed_points = sum([result.points for result in all_results if result.passed])
+
+        # Create a nice score display
+        st.markdown(f"### {passed_points}/{total_points}")
+        percentage = (passed_points / total_points) * 100
+        st.progress(percentage / 100)
+        st.markdown(f"**{percentage:.1f}%**")
+
+
+async def evaluate_agent(
+    config: Config, agent_trace: AgentTrace
+) -> TraceEvaluationResult:
+    assert (
+        len(config.evaluation_cases) == 1
+    ), "Only one evaluation case is supported in the demo"
+    st.markdown("### 📊 Evaluation Results")
+
+    with st.spinner("Evaluating results..."):
+        case = config.evaluation_cases[0]
+        result: TraceEvaluationResult = evaluate(
+            evaluation_case=case,
+            trace=agent_trace,
+            agent_framework=config.framework,
+        )
+    return result
+
+
+async def configure_agent(user_inputs: UserInputs) -> tuple[AnyAgent, Config]:
+    if "huggingface" in user_inputs.model_id:
+        model_args = {
+            "extra_headers": {"X-HF-Bill-To": "mozilla-ai"},
+            "temperature": 0.0,
+        }
+    else:
+        model_args = {}
+
+    if user_inputs.framework == AgentFramework.AGNO:
+        agent_args = {"tool_call_limit": 20}
+    else:
+        agent_args = {}
+
+    agent_config = AgentConfig(
+        model_id=user_inputs.model_id,
+        model_args=model_args,
+        agent_args=agent_args,
+        tools=DEFAULT_TOOLS,
+    )
+
+    config = Config(
+        location=user_inputs.location,
+        max_driving_hours=user_inputs.max_driving_hours,
+        date=user_inputs.date,
+        framework=user_inputs.framework,
+        main_agent=agent_config,
+        managed_agents=[],
+        evaluation_cases=[user_inputs.evaluation_case],
+    )
+
+    agent = await AnyAgent.create_async(
+        agent_framework=config.framework,
+        agent_config=config.main_agent,
+        managed_agents=config.managed_agents,
+        tracing=TracingConfig(console=True, cost_info=True),
+    )
+    return agent, config
+
+
+async def display_output(agent_trace: AgentTrace):
+    # Display the agent trace in a more organized way
+    with st.expander("### 🧩 Agent Trace"):
+        for span in agent_trace.spans:
+            # Header with name and status
+            col1, col2 = st.columns([4, 1])
+            with col1:
+                st.markdown(f"**{span.name}**")
+                if span.attributes:
+                    # st.json(span.attributes, expanded=False)
+                    if "input.value" in span.attributes:
+                        try:
+                            input_value = json.loads(span.attributes["input.value"])
+                            if isinstance(input_value, list) and len(input_value) > 0:
+                                st.write(f"Input: {input_value[-1]}")
+                            else:
+                                st.write(f"Input: {input_value}")
+                        except Exception:  # noqa: E722
+                            st.write(f"Input: {span.attributes['input.value']}")
+                    if "output.value" in span.attributes:
+                        try:
+                            output_value = json.loads(span.attributes["output.value"])
+                            if isinstance(output_value, list) and len(output_value) > 0:
+                                st.write(f"Output: {output_value[-1]}")
+                            else:
+                                st.write(f"Output: {output_value}")
+                        except Exception:  # noqa: E722
+                            st.write(f"Output: {span.attributes['output.value']}")
+            with col2:
+                status_color = (
+                    "green" if span.status.status_code == StatusCode.OK else "red"
+                )
+                st.markdown(
+                    f"<span style='color: {status_color}'>● {span.status.status_code.name}</span>",
+                    unsafe_allow_html=True,
+                )
+
+    with st.expander("### 📊 Results", expanded=True):
+        time_col, cost_col, tokens_col = st.columns(3)
+        duration = agent_trace.duration.total_seconds()
+        with time_col:
+            st.info(f"⏱️ Execution Time: {duration:0.2f} seconds")
+        with cost_col:
+            st.info(f"💰 Estimated Cost: ${agent_trace.cost.total_cost:.6f}")
+        with tokens_col:
+            st.info(f"📦 Total Tokens: {agent_trace.usage.total_tokens:,}")
+        st.markdown("#### Final Output")
+        st.info(agent_trace.final_output)
+
+
+async def run_agent(agent, config) -> AgentTrace:
+    st.markdown("#### 🏄 Running Surf Spot Finder with query")
+
+    query = config.input_prompt_template.format(
+        LOCATION=config.location,
+        MAX_DRIVING_HOURS=config.max_driving_hours,
+        DATE=config.date,
+    )
+
+    st.code(query, language="text")
+    kwargs = {}
+    if (
+        config.framework == AgentFramework.OPENAI
+        or config.framework == AgentFramework.TINYAGENT
+    ):
+        kwargs["max_turns"] = 20
+    elif config.framework == AgentFramework.SMOLAGENTS:
+        kwargs["max_steps"] = 20
+    if config.framework == AgentFramework.LANGCHAIN:
+        from langchain_core.runnables import RunnableConfig
+
+        kwargs["config"] = RunnableConfig(recursion_limit=20)
+    elif config.framework == AgentFramework.GOOGLE:
+        from google.adk.agents.run_config import RunConfig
+
+        kwargs["run_config"] = RunConfig(max_llm_calls=20)
+
+    with st.status("Agent is running...", expanded=False, state="running") as status:
+
+        def update_span(span: AgentSpan):
+            # Process input value
+            input_value = span.attributes.get("input.value", "")
+            if input_value:
+                try:
+                    parsed_input = json.loads(input_value)
+                    if isinstance(parsed_input, list) and len(parsed_input) > 0:
+                        input_value = str(parsed_input[-1])
+                except Exception:
+                    pass
+
+            # Process output value
+            output_value = span.attributes.get("output.value", "")
+            if output_value:
+                try:
+                    parsed_output = json.loads(output_value)
+                    if isinstance(parsed_output, list) and len(parsed_output) > 0:
+                        output_value = str(parsed_output[-1])
+                except Exception:
+                    pass
+
+            # Truncate long values
+            max_length = 800
+            if len(input_value) > max_length:
+                input_value = f"[Truncated]...{input_value[-max_length:]}"
+            if len(output_value) > max_length:
+                output_value = f"[Truncated]...{output_value[-max_length:]}"
+
+            # Create a cleaner message format
+            if input_value or output_value:
+                message = f"Step: {span.name}\n"
+                if input_value:
+                    message += f"Input: {input_value}\n"
+                if output_value:
+                    message += f"Output: {output_value}"
+            else:
+                message = f"Step: {span.name}\n{span}"
+
+            status.update(label=message, expanded=False, state="running")
+
+        export_logs(agent, update_span)
+        agent_trace: AgentTrace = await agent.run_async(query, **kwargs)
+        status.update(label="Finished!", expanded=False, state="complete")
+
+    agent.exit()
+    return agent_trace
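
Note: the score shown by display_evaluation_results() is just passed points over total points across all checkpoint results, plus the ground-truth result when present. The same number as a standalone helper, for reference:

    from any_agent.evaluation import TraceEvaluationResult

    def overall_score(result: TraceEvaluationResult) -> float:
        # Mirrors display_evaluation_results(): passed points / total points.
        all_results = list(result.checkpoint_results)
        if result.ground_truth_result is not None:
            all_results.append(result.ground_truth_result)
        total = sum(r.points for r in all_results)
        if total == 0:
            raise ValueError("Total points is 0, cannot calculate score.")
        return sum(r.points for r in all_results if r.passed) / total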