Spaces:
Running
Running
File size: 2,180 Bytes
27f8cfc 6fdc19a 62cf4ef 6fdc19a 6c3142a 6fdc19a 27f8cfc 178738b 27f8cfc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
from any_agent.evaluation import EvaluationCase
from surf_spot_finder.tools import (
get_area_lat_lon,
get_wave_forecast,
get_wind_forecast,
)
from any_agent.tools.web_browsing import search_web, visit_webpage
# Model identifiers selectable for a run, written as "<provider>/<model>".
# Order matters: DEFAULT_EVALUATION_CASE uses the first entry as its LLM judge.
MODEL_OPTIONS = [
    # --- Hugging Face (Novita) entries, currently disabled ---
    # "huggingface/novita/deepseek-ai/DeepSeek-V3",
    # "huggingface/novita/meta-llama/Llama-3.3-70B-Instruct",
    # --- OpenAI ---
    "openai/gpt-4.1-nano",
    "openai/gpt-4.1-mini",
    "openai/gpt-4o",
    # --- Gemini ---
    "gemini/gemini-2.0-flash-lite",
    "gemini/gemini-2.0-flash",
    # Disabled: currently throws an internal error, and the Novita-hosted
    # Qwen does not support tool calling.
    # "huggingface/Qwen/Qwen3-32B",
]
# Novita was the only Hugging Face-based provider that worked.
# Hugging Face API provider error:
# messages must alternate between assistant/user, so a message with the 'tool' role made the request fail.
# Default evaluation case: the first entry of MODEL_OPTIONS acts as the LLM
# judge, and each criterion below becomes one checkpoint worth a single point.
DEFAULT_EVALUATION_CASE = EvaluationCase(
    llm_judge=MODEL_OPTIONS[0],
    checkpoints=[
        {"criteria": criterion, "points": 1}
        for criterion in (
            # How the agent gathered its information.
            "Check if the agent considered at least three surf spot options",
            "Check if the agent gathered wind forecasts for each surf spot being evaluated.",
            "Check if the agent gathered wave forecasts for each surf spot being evaluated.",
            "Check if the agent used any web search tools to explore which surf spots should be considered",
            # What the final answer contains.
            "Check if the final answer contains any description about the weather (air temp, chance of rain, etc) at the chosen location",
            "Check if the final answer includes one of the surf spots evaluated by tools",
            "Check if the final answer includes information about some alternative surf spots if the user is not satisfied with the chosen one",
        )
    ],
)
# Default tool list, in a fixed order.
# NOTE(review): presumably the tool set handed to the agent - confirm against callers.
DEFAULT_TOOLS = [
    # Helpers imported from surf_spot_finder.tools.
    get_wind_forecast,
    get_wave_forecast,
    get_area_lat_lon,
    # Helpers imported from any_agent.tools.web_browsing.
    search_web,
    visit_webpage,
]
|