Spaces:
Sleeping
Sleeping
Commit
·
e1255d1
1
Parent(s):
b623f54
Attempt to fix temperature handling (but it does not seem to work yet?)
Browse files
- benchmark.py +3 -0
- geo_bot.py +9 -1
- main.py +14 -6
benchmark.py
CHANGED
@@ -71,6 +71,7 @@ class MapGuesserBenchmark:
|
|
71 |
self,
|
72 |
models: Optional[List[str]] = None,
|
73 |
max_samples: Optional[int] = None,
|
|
|
74 |
**kwargs,
|
75 |
) -> Dict:
|
76 |
if not self.golden_labels:
|
@@ -88,6 +89,7 @@ class MapGuesserBenchmark:
|
|
88 |
print(f"π Starting LIVE benchmark:")
|
89 |
print(f" Models: {models_to_test}")
|
90 |
print(f" Samples: {len(test_samples)}")
|
|
|
91 |
|
92 |
all_results = []
|
93 |
for model_name in models_to_test:
|
@@ -100,6 +102,7 @@ class MapGuesserBenchmark:
|
|
100 |
model_name=model_class_name,
|
101 |
use_selenium=True,
|
102 |
headless=self.headless,
|
|
|
103 |
) as bot:
|
104 |
for i, sample in enumerate(test_samples):
|
105 |
print('########################################################')
|
|
|
71 |
self,
|
72 |
models: Optional[List[str]] = None,
|
73 |
max_samples: Optional[int] = None,
|
74 |
+
temperature: float = 0.0,
|
75 |
**kwargs,
|
76 |
) -> Dict:
|
77 |
if not self.golden_labels:
|
|
|
89 |
print(f"π Starting LIVE benchmark:")
|
90 |
print(f" Models: {models_to_test}")
|
91 |
print(f" Samples: {len(test_samples)}")
|
92 |
+
print(f" Temperature: {temperature}")
|
93 |
|
94 |
all_results = []
|
95 |
for model_name in models_to_test:
|
|
|
102 |
model_name=model_class_name,
|
103 |
use_selenium=True,
|
104 |
headless=self.headless,
|
105 |
+
temperature=temperature,
|
106 |
) as bot:
|
107 |
for i, sample in enumerate(test_samples):
|
108 |
print('########################################################')
|
geo_bot.py
CHANGED
@@ -63,9 +63,17 @@ class GeoBot:
|
|
63 |
model_name: str,
|
64 |
use_selenium: bool = True,
|
65 |
headless: bool = False,
|
|
|
66 |
):
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
self.model_name = model_name
|
|
|
69 |
self.use_selenium = use_selenium
|
70 |
self.controller = (
|
71 |
MapCrunchController(headless=headless) if use_selenium else None
|
|
|
63 |
model_name: str,
|
64 |
use_selenium: bool = True,
|
65 |
headless: bool = False,
|
66 |
+
temperature: float = 0.0,
|
67 |
):
|
68 |
+
# Initialize model with temperature parameter
|
69 |
+
model_kwargs = {
|
70 |
+
"model": model_name,
|
71 |
+
"temperature": temperature,
|
72 |
+
}
|
73 |
+
|
74 |
+
self.model = model(**model_kwargs)
|
75 |
self.model_name = model_name
|
76 |
+
self.temperature = temperature
|
77 |
self.use_selenium = use_selenium
|
78 |
self.controller = (
|
79 |
MapCrunchController(headless=headless) if use_selenium else None
|
main.py
CHANGED
@@ -12,13 +12,13 @@ from benchmark import MapGuesserBenchmark
|
|
12 |
from config import MODELS_CONFIG, DATA_PATHS, SUCCESS_THRESHOLD_KM
|
13 |
|
14 |
|
15 |
-
def agent_mode(model_name: str, steps: int, headless: bool, samples: int):
|
16 |
"""
|
17 |
Runs the AI Agent in a benchmark loop over multiple samples,
|
18 |
using multi-step exploration for each.
|
19 |
"""
|
20 |
print(
|
21 |
-
f"Starting Agent Mode (as a benchmark): model={model_name}, steps={steps}, samples={samples}"
|
22 |
)
|
23 |
|
24 |
try:
|
@@ -44,7 +44,7 @@ def agent_mode(model_name: str, steps: int, headless: bool, samples: int):
|
|
44 |
all_results = []
|
45 |
|
46 |
with GeoBot(
|
47 |
-
model=model_class, model_name=model_instance_name, headless=headless
|
48 |
) as bot:
|
49 |
for i, sample in enumerate(test_samples):
|
50 |
print(
|
@@ -107,11 +107,11 @@ def agent_mode(model_name: str, steps: int, headless: bool, samples: int):
|
|
107 |
print("\nAgent Mode finished.")
|
108 |
|
109 |
|
110 |
-
def benchmark_mode(models: list, samples: int, headless: bool):
|
111 |
"""Runs the benchmark on pre-collected data."""
|
112 |
-
print(f"Starting Benchmark Mode: models={models}, samples={samples}")
|
113 |
benchmark = MapGuesserBenchmark(headless=headless)
|
114 |
-
summary = benchmark.run_benchmark(models=models, max_samples=samples)
|
115 |
if summary:
|
116 |
print("\n--- Benchmark Complete! Summary ---")
|
117 |
for model, stats in summary.items():
|
@@ -152,6 +152,12 @@ def main():
|
|
152 |
choices=list(MODELS_CONFIG.keys()),
|
153 |
help="[Benchmark] Models to benchmark.",
|
154 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
|
156 |
args = parser.parse_args()
|
157 |
|
@@ -161,12 +167,14 @@ def main():
|
|
161 |
steps=args.steps,
|
162 |
headless=args.headless,
|
163 |
samples=args.samples,
|
|
|
164 |
)
|
165 |
elif args.mode == "benchmark":
|
166 |
benchmark_mode(
|
167 |
models=args.models or [args.model],
|
168 |
samples=args.samples,
|
169 |
headless=args.headless,
|
|
|
170 |
)
|
171 |
|
172 |
|
|
|
12 |
from config import MODELS_CONFIG, DATA_PATHS, SUCCESS_THRESHOLD_KM
|
13 |
|
14 |
|
15 |
+
def agent_mode(model_name: str, steps: int, headless: bool, samples: int, temperature: float = 0.0):
|
16 |
"""
|
17 |
Runs the AI Agent in a benchmark loop over multiple samples,
|
18 |
using multi-step exploration for each.
|
19 |
"""
|
20 |
print(
|
21 |
+
f"Starting Agent Mode (as a benchmark): model={model_name}, steps={steps}, samples={samples}, temperature={temperature}"
|
22 |
)
|
23 |
|
24 |
try:
|
|
|
44 |
all_results = []
|
45 |
|
46 |
with GeoBot(
|
47 |
+
model=model_class, model_name=model_instance_name, headless=headless, temperature=temperature
|
48 |
) as bot:
|
49 |
for i, sample in enumerate(test_samples):
|
50 |
print(
|
|
|
107 |
print("\nAgent Mode finished.")
|
108 |
|
109 |
|
110 |
+
def benchmark_mode(models: list, samples: int, headless: bool, temperature: float = 0.0):
|
111 |
"""Runs the benchmark on pre-collected data."""
|
112 |
+
print(f"Starting Benchmark Mode: models={models}, samples={samples}, temperature={temperature}")
|
113 |
benchmark = MapGuesserBenchmark(headless=headless)
|
114 |
+
summary = benchmark.run_benchmark(models=models, max_samples=samples, temperature=temperature)
|
115 |
if summary:
|
116 |
print("\n--- Benchmark Complete! Summary ---")
|
117 |
for model, stats in summary.items():
|
|
|
152 |
choices=list(MODELS_CONFIG.keys()),
|
153 |
help="[Benchmark] Models to benchmark.",
|
154 |
)
|
155 |
+
parser.add_argument(
|
156 |
+
"--temperature",
|
157 |
+
type=float,
|
158 |
+
default=0.0,
|
159 |
+
help="Temperature parameter for LLM sampling (0.0 = deterministic, higher = more random). Default: 0.0",
|
160 |
+
)
|
161 |
|
162 |
args = parser.parse_args()
|
163 |
|
|
|
167 |
steps=args.steps,
|
168 |
headless=args.headless,
|
169 |
samples=args.samples,
|
170 |
+
temperature=args.temperature,
|
171 |
)
|
172 |
elif args.mode == "benchmark":
|
173 |
benchmark_mode(
|
174 |
models=args.models or [args.model],
|
175 |
samples=args.samples,
|
176 |
headless=args.headless,
|
177 |
+
temperature=args.temperature,
|
178 |
)
|
179 |
|
180 |
|