File size: 2,180 Bytes
27f8cfc
 
 
 
 
 
 
 
6fdc19a
 
 
62cf4ef
 
 
6fdc19a
6c3142a
6fdc19a
 
 
 
 
 
 
27f8cfc
 
 
 
 
 
 
 
 
 
178738b
 
 
 
 
27f8cfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from any_agent.evaluation import EvaluationCase
from surf_spot_finder.tools import (
    get_area_lat_lon,
    get_wave_forecast,
    get_wind_forecast,
)
from any_agent.tools.web_browsing import search_web, visit_webpage

# Model identifiers, written as "<provider>/<model>" strings.
# Order matters: index 0 is used as the `llm_judge` of DEFAULT_EVALUATION_CASE
# further down in this module.
MODEL_OPTIONS = [
    # Disabled Hugging Face entries routed through Novita, kept for reference:
    # "huggingface/novita/deepseek-ai/DeepSeek-V3",
    # "huggingface/novita/meta-llama/Llama-3.3-70B-Instruct",
    "openai/gpt-4.1-nano",
    "openai/gpt-4.1-mini",
    "openai/gpt-4o",
    "gemini/gemini-2.0-flash-lite",
    "gemini/gemini-2.0-flash",
    # "huggingface/Qwen/Qwen3-32B",  # currently throws an internal error, and the Novita-hosted Qwen does not support tool calling
]

# Novita was the only Hugging Face-based provider that worked.

# Hugging Face API provider error:
# "Must alternate between assistant/user" -- messages with the 'tool' role break that alternation, so the request failed.


# Default evaluation rubric: seven checkpoints worth one point each, judged by
# the first model listed in MODEL_OPTIONS. Every checkpoint carries the same
# score, so the dicts are generated from the ordered criteria strings.
DEFAULT_EVALUATION_CASE = EvaluationCase(
    llm_judge=MODEL_OPTIONS[0],
    checkpoints=[
        {"criteria": criterion, "points": 1}
        for criterion in (
            # Tool-usage checks
            "Check if the agent considered at least three surf spot options",
            "Check if the agent gathered wind forecasts for each surf spot being evaluated.",
            "Check if the agent gathered wave forecasts for each surf spot being evaluated.",
            "Check if the agent used any web search tools to explore which surf spots should be considered",
            # Final-answer checks
            "Check if the final answer contains any description about the weather (air temp, chance of rain, etc) at the chosen location",
            "Check if the final answer includes one of the surf spots evaluated by tools",
            "Check if the final answer includes information about some alternative surf spots if the user is not satisfied with the chosen one",
        )
    ],
)


# Tools collected in one list: the three helpers imported from
# surf_spot_finder.tools followed by the two web-browsing helpers imported
# from any_agent.tools.web_browsing.
# NOTE(review): presumably the default tool set handed to the agent -- confirm
# against the callers of this constant.
DEFAULT_TOOLS = [
    get_wind_forecast,
    get_wave_forecast,
    get_area_lat_lon,
    search_web,
    visit_webpage,
]