Chrisyichuan committed on
Commit
e1255d1
·
1 Parent(s): b623f54

fix temp? but seems not work?

Browse files
Files changed (3) hide show
  1. benchmark.py +3 -0
  2. geo_bot.py +9 -1
  3. main.py +14 -6
benchmark.py CHANGED
@@ -71,6 +71,7 @@ class MapGuesserBenchmark:
71
  self,
72
  models: Optional[List[str]] = None,
73
  max_samples: Optional[int] = None,
 
74
  **kwargs,
75
  ) -> Dict:
76
  if not self.golden_labels:
@@ -88,6 +89,7 @@ class MapGuesserBenchmark:
88
  print(f"πŸš€ Starting LIVE benchmark:")
89
  print(f" Models: {models_to_test}")
90
  print(f" Samples: {len(test_samples)}")
 
91
 
92
  all_results = []
93
  for model_name in models_to_test:
@@ -100,6 +102,7 @@ class MapGuesserBenchmark:
100
  model_name=model_class_name,
101
  use_selenium=True,
102
  headless=self.headless,
 
103
  ) as bot:
104
  for i, sample in enumerate(test_samples):
105
  print('########################################################')
 
71
  self,
72
  models: Optional[List[str]] = None,
73
  max_samples: Optional[int] = None,
74
+ temperature: float = 0.0,
75
  **kwargs,
76
  ) -> Dict:
77
  if not self.golden_labels:
 
89
  print(f"πŸš€ Starting LIVE benchmark:")
90
  print(f" Models: {models_to_test}")
91
  print(f" Samples: {len(test_samples)}")
92
+ print(f" Temperature: {temperature}")
93
 
94
  all_results = []
95
  for model_name in models_to_test:
 
102
  model_name=model_class_name,
103
  use_selenium=True,
104
  headless=self.headless,
105
+ temperature=temperature,
106
  ) as bot:
107
  for i, sample in enumerate(test_samples):
108
  print('########################################################')
geo_bot.py CHANGED
@@ -63,9 +63,17 @@ class GeoBot:
63
  model_name: str,
64
  use_selenium: bool = True,
65
  headless: bool = False,
 
66
  ):
67
- self.model = model(model=model_name)
 
 
 
 
 
 
68
  self.model_name = model_name
 
69
  self.use_selenium = use_selenium
70
  self.controller = (
71
  MapCrunchController(headless=headless) if use_selenium else None
 
63
  model_name: str,
64
  use_selenium: bool = True,
65
  headless: bool = False,
66
+ temperature: float = 0.0,
67
  ):
68
+ # Initialize model with temperature parameter
69
+ model_kwargs = {
70
+ "model": model_name,
71
+ "temperature": temperature,
72
+ }
73
+
74
+ self.model = model(**model_kwargs)
75
  self.model_name = model_name
76
+ self.temperature = temperature
77
  self.use_selenium = use_selenium
78
  self.controller = (
79
  MapCrunchController(headless=headless) if use_selenium else None
main.py CHANGED
@@ -12,13 +12,13 @@ from benchmark import MapGuesserBenchmark
12
  from config import MODELS_CONFIG, DATA_PATHS, SUCCESS_THRESHOLD_KM
13
 
14
 
15
- def agent_mode(model_name: str, steps: int, headless: bool, samples: int):
16
  """
17
  Runs the AI Agent in a benchmark loop over multiple samples,
18
  using multi-step exploration for each.
19
  """
20
  print(
21
- f"Starting Agent Mode (as a benchmark): model={model_name}, steps={steps}, samples={samples}"
22
  )
23
 
24
  try:
@@ -44,7 +44,7 @@ def agent_mode(model_name: str, steps: int, headless: bool, samples: int):
44
  all_results = []
45
 
46
  with GeoBot(
47
- model=model_class, model_name=model_instance_name, headless=headless
48
  ) as bot:
49
  for i, sample in enumerate(test_samples):
50
  print(
@@ -107,11 +107,11 @@ def agent_mode(model_name: str, steps: int, headless: bool, samples: int):
107
  print("\nAgent Mode finished.")
108
 
109
 
110
- def benchmark_mode(models: list, samples: int, headless: bool):
111
  """Runs the benchmark on pre-collected data."""
112
- print(f"Starting Benchmark Mode: models={models}, samples={samples}")
113
  benchmark = MapGuesserBenchmark(headless=headless)
114
- summary = benchmark.run_benchmark(models=models, max_samples=samples)
115
  if summary:
116
  print("\n--- Benchmark Complete! Summary ---")
117
  for model, stats in summary.items():
@@ -152,6 +152,12 @@ def main():
152
  choices=list(MODELS_CONFIG.keys()),
153
  help="[Benchmark] Models to benchmark.",
154
  )
 
 
 
 
 
 
155
 
156
  args = parser.parse_args()
157
 
@@ -161,12 +167,14 @@ def main():
161
  steps=args.steps,
162
  headless=args.headless,
163
  samples=args.samples,
 
164
  )
165
  elif args.mode == "benchmark":
166
  benchmark_mode(
167
  models=args.models or [args.model],
168
  samples=args.samples,
169
  headless=args.headless,
 
170
  )
171
 
172
 
 
12
  from config import MODELS_CONFIG, DATA_PATHS, SUCCESS_THRESHOLD_KM
13
 
14
 
15
+ def agent_mode(model_name: str, steps: int, headless: bool, samples: int, temperature: float = 0.0):
16
  """
17
  Runs the AI Agent in a benchmark loop over multiple samples,
18
  using multi-step exploration for each.
19
  """
20
  print(
21
+ f"Starting Agent Mode (as a benchmark): model={model_name}, steps={steps}, samples={samples}, temperature={temperature}"
22
  )
23
 
24
  try:
 
44
  all_results = []
45
 
46
  with GeoBot(
47
+ model=model_class, model_name=model_instance_name, headless=headless, temperature=temperature
48
  ) as bot:
49
  for i, sample in enumerate(test_samples):
50
  print(
 
107
  print("\nAgent Mode finished.")
108
 
109
 
110
+ def benchmark_mode(models: list, samples: int, headless: bool, temperature: float = 0.0):
111
  """Runs the benchmark on pre-collected data."""
112
+ print(f"Starting Benchmark Mode: models={models}, samples={samples}, temperature={temperature}")
113
  benchmark = MapGuesserBenchmark(headless=headless)
114
+ summary = benchmark.run_benchmark(models=models, max_samples=samples, temperature=temperature)
115
  if summary:
116
  print("\n--- Benchmark Complete! Summary ---")
117
  for model, stats in summary.items():
 
152
  choices=list(MODELS_CONFIG.keys()),
153
  help="[Benchmark] Models to benchmark.",
154
  )
155
+ parser.add_argument(
156
+ "--temperature",
157
+ type=float,
158
+ default=0.0,
159
+ help="Temperature parameter for LLM sampling (0.0 = deterministic, higher = more random). Default: 0.0",
160
+ )
161
 
162
  args = parser.parse_args()
163
 
 
167
  steps=args.steps,
168
  headless=args.headless,
169
  samples=args.samples,
170
+ temperature=args.temperature,
171
  )
172
  elif args.mode == "benchmark":
173
  benchmark_mode(
174
  models=args.models or [args.model],
175
  samples=args.samples,
176
  headless=args.headless,
177
+ temperature=args.temperature,
178
  )
179
 
180