MrSimple01 committed
Commit 050fdc5 · verified · 1 Parent(s): fb27dda

Update app.py

Files changed (1): app.py +194 -39
app.py CHANGED
@@ -1,5 +1,6 @@
 import warnings
 import time
+import os
 from typing import Dict, Tuple, List
 from dataclasses import dataclass
 
@@ -226,10 +227,32 @@ class BenchmarkEvaluator:
             EvaluationConfig(api_key=gemini_api_key)
         )
         self.stability_evaluator = StabilityEvaluator()
+        self.results_history = []
 
-    def evaluate_model(self, df, model_name, prompt_col='rus_prompt'):
+        # Create results directory if it doesn't exist
+        os.makedirs('results', exist_ok=True)
+
+        # Load previous benchmark results if available
+        self.benchmark_file = 'results/benchmark_results.csv'
+        if os.path.exists(self.benchmark_file):
+            try:
+                self.leaderboard_df = pd.read_csv(self.benchmark_file)
+            except:
+                self.leaderboard_df = pd.DataFrame(columns=[
+                    'model', 'creativity_score', 'stability_score',
+                    'combined_score', 'evaluation_timestamp'
+                ])
+        else:
+            self.leaderboard_df = pd.DataFrame(columns=[
+                'model', 'creativity_score', 'stability_score',
+                'combined_score', 'evaluation_timestamp'
+            ])
+
+    def evaluate_model(self, df, model_name, prompt_col='rus_prompt', answer_col=None):
         """Evaluate a single model's responses"""
-        answer_col = f"{model_name}_answers"
+        # Use direct answer column if provided, otherwise derive from model name
+        if answer_col is None:
+            answer_col = f"{model_name}_answers"
 
         if answer_col not in df.columns:
             raise ValueError(f"Column {answer_col} not found in dataframe")
@@ -247,11 +270,15 @@ class BenchmarkEvaluator:
         stability_score = stability_results['stability_score']
         combined_score = (creative_score + stability_score) / 2
 
+        # Add timestamp
+        timestamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
+
         results = {
             'model': model_name,
             'creativity_score': creative_score,
             'stability_score': stability_score,
             'combined_score': combined_score,
+            'evaluation_timestamp': timestamp,
             'creative_details': {
                 'creativity': creative_df["Креативность"].mean(),
                 'diversity': creative_df["Разнообразие"].mean(),
@@ -261,36 +288,96 @@
         }
 
         # Save detailed results
-        output_file = f'evaluated_responses_{model_name}.csv'
+        output_file = f'results/evaluated_responses_{model_name}_{timestamp.replace(":", "-").replace(" ", "_")}.csv'
         creative_df.to_csv(output_file, index=False)
         print(f"Detailed results saved to {output_file}")
 
-        return results
-
-    def evaluate_all_models(self, df, models=None, prompt_col='rus_prompt'):
-        """Evaluate multiple models from the dataframe"""
-        if models is None:
-            # Find all columns ending with _answers
-            answer_cols = [col for col in df.columns if col.endswith('_answers')]
-            models = [col.replace('_answers', '') for col in answer_cols]
-
-        results = []
-        for model in models:
-            try:
-                model_results = self.evaluate_model(df, model, prompt_col)
-                results.append(model_results)
-                print(f"Completed evaluation for {model}")
-            except Exception as e:
-                print(f"Error evaluating {model}: {str(e)}")
-
-        benchmark_df = pd.DataFrame(results)
-        benchmark_df.to_csv('benchmark_results.csv', index=False)
-        print("Benchmark completed. Results saved to benchmark_results.csv")
+        # Update leaderboard
+        result_row = {
+            'model': model_name,
+            'creativity_score': creative_score,
+            'stability_score': stability_score,
+            'combined_score': combined_score,
+            'evaluation_timestamp': timestamp
+        }
+        self.leaderboard_df = pd.concat([self.leaderboard_df, pd.DataFrame([result_row])], ignore_index=True)
+        self.leaderboard_df.to_csv(self.benchmark_file, index=False)
 
-        return benchmark_df
+        self.results_history.append(results)
+        return results, creative_df
+
+    def evaluate_all_models(self, df, models=None, model_columns=None, prompt_col='rus_prompt'):
+        """Evaluate multiple models from the dataframe"""
+        if models is not None and model_columns is not None:
+            model_mapping = dict(zip(models, model_columns))
+        elif models is not None:
+            model_mapping = {model: f"{model}_answers" for model in models}
+        else:
+            answer_cols = [col for col in df.columns if col.endswith('_answers')]
+            models = [col.replace('_answers', '') for col in answer_cols]
+            model_mapping = dict(zip(models, answer_cols))
+
+        results = []
+        detail_dfs = []
+
+        for model, column in model_mapping.items():
+            try:
+                model_results, detail_df = self.evaluate_model(df, model, prompt_col, column)
+                results.append(model_results)
+                detail_dfs.append(detail_df)
+                print(f"Completed evaluation for {model}")
+            except Exception as e:
+                print(f"Error evaluating {model}: {str(e)}")
+
+        # Create combined results DataFrame
+        benchmark_df = pd.DataFrame([{
+            'model': r['model'],
+            'creativity_score': r['creativity_score'],
+            'stability_score': r['stability_score'],
+            'combined_score': r['combined_score'],
+            'evaluation_timestamp': r['evaluation_timestamp']
+        } for r in results])
+
+        timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
+        benchmark_df.to_csv(f'results/benchmark_results_{timestamp}.csv', index=False)
+        print(f"Benchmark completed. Results saved to results/benchmark_results_{timestamp}.csv")
+
+        if detail_dfs:
+            combined_details = pd.concat(detail_dfs)
+            combined_details.to_csv(f'results/detailed_evaluation_{timestamp}.csv', index=False)
+            print(f"Detailed evaluation saved to results/detailed_evaluation_{timestamp}.csv")
+
+        return benchmark_df, self.leaderboard_df
+
+    def get_leaderboard(self):
+        """Return the current leaderboard"""
+        if self.leaderboard_df.empty:
+            return pd.DataFrame(columns=['model', 'creativity_score', 'stability_score', 'combined_score', 'evaluation_timestamp'])
+
+        # Sort by combined score (descending)
+        sorted_df = self.leaderboard_df.sort_values(by='combined_score', ascending=False)
+        return sorted_df
 
 
 def create_gradio_interface():
+    os.makedirs('results', exist_ok=True)
+
+    state = {
+        'evaluator': None,
+        'last_results': None,
+        'leaderboard': None
+    }
+
+    # Load existing leaderboard if available
+    leaderboard_path = 'results/benchmark_results.csv'
+    if os.path.exists(leaderboard_path):
+        try:
+            state['leaderboard'] = pd.read_csv(leaderboard_path)
+        except:
+            state['leaderboard'] = pd.DataFrame(columns=['model', 'creativity_score', 'stability_score', 'combined_score', 'evaluation_timestamp'])
+    else:
+        state['leaderboard'] = pd.DataFrame(columns=['model', 'creativity_score', 'stability_score', 'combined_score', 'evaluation_timestamp'])
+
     with gr.Blocks(title="Model Response Evaluator") as app:
         gr.Markdown("# Model Response Evaluator")
         gr.Markdown("Upload a CSV file with prompts and model responses to evaluate and benchmark models.")
@@ -301,37 +388,105 @@ def create_gradio_interface():
         with gr.Row():
             csv_file = gr.File(label="Upload CSV with responses")
             prompt_col = gr.Textbox(label="Prompt Column Name", value="rus_prompt")
-            models_input = gr.Textbox(label="Model names (comma-separated, leave blank for auto-detection)")
+
+        with gr.Row():
+            model_input_method = gr.Radio(
+                choices=["Auto-detect from columns", "Specify models and columns"],
+                label="Model Input Method",
+                value="Auto-detect from columns"
+            )
+
+        with gr.Row(visible=False) as model_config_row:
+            models_input = gr.Textbox(label="Model names (comma-separated)")
+            answer_cols_input = gr.Textbox(label="Answer column names (comma-separated, matching model order)")
 
         evaluate_btn = gr.Button("Run Benchmark")
 
-        with gr.Row():
-            benchmark_output = gr.DataFrame(label="Benchmark Results")
+        with gr.Tabs():
+            with gr.Tab("Current Results"):
+                current_results = gr.DataFrame(label="Current Benchmark Results")
+                download_btn = gr.Button("Download Results CSV")
+                current_results_file = gr.File(label="Download Results")
+
+            with gr.Tab("Leaderboard"):
+                leaderboard_table = gr.DataFrame(value=state['leaderboard'], label="Model Leaderboard")
+                refresh_btn = gr.Button("Refresh Leaderboard")
+
+        def toggle_model_input(choice):
+            return gr.Row(visible=(choice == "Specify models and columns"))
+
+        model_input_method.change(toggle_model_input, model_input_method, model_config_row)
 
-        def evaluate_batch(api_key, file, prompt_column, models_text):
+        def evaluate_batch(api_key, file, prompt_column, input_method, models_text, answer_cols_text):
             try:
+                if not api_key:
+                    return None, None, gr.DataFrame(), gr.File()
+
                 # Load the CSV file
                 file_path = file.name
                 df = pd.read_csv(file_path)
 
-                # Process model names if provided
-                models = None
-                if models_text.strip():
+                # Initialize evaluator
+                state['evaluator'] = BenchmarkEvaluator(api_key)
+
+                # Process model names and columns if provided
+                if input_method == "Specify models and columns":
+                    if not models_text.strip() or not answer_cols_text.strip():
+                        return None, None, gr.DataFrame(), gr.File()
+
                     models = [m.strip() for m in models_text.split(',')]
+                    answer_cols = [c.strip() for c in answer_cols_text.split(',')]
+
+                    if len(models) != len(answer_cols):
+                        return None, None, gr.DataFrame(pd.DataFrame({'Error': ['Number of models and answer columns must match']})), gr.File()
+
+                    results_df, leaderboard_df = state['evaluator'].evaluate_all_models(
+                        df, models=models, model_columns=answer_cols, prompt_col=prompt_column
+                    )
+                else:
+                    # Auto-detect mode
+                    results_df, leaderboard_df = state['evaluator'].evaluate_all_models(
+                        df, prompt_col=prompt_column
+                    )
 
-                # Run the evaluation
-                evaluator = BenchmarkEvaluator(api_key)
-                results = evaluator.evaluate_all_models(df, models, prompt_column)
+                timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
+                results_path = f'results/benchmark_results_{timestamp}.csv'
+                results_df.to_csv(results_path, index=False)
 
-                return results
+                # Update state
+                state['last_results'] = results_df
+                state['leaderboard'] = leaderboard_df
+
+                return results_df, leaderboard_df, results_path, leaderboard_df
             except Exception as e:
-                return pd.DataFrame({'Error': [str(e)]})
+                error_df = pd.DataFrame({'Error': [str(e)]})
+                return error_df, state['leaderboard'], gr.DataFrame(), gr.File()
+
+        def download_results():
+            if state['last_results'] is not None:
+                timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
+                file_path = f'results/benchmark_download_{timestamp}.csv'
+                state['last_results'].to_csv(file_path, index=False)
+                return file_path
+            return None
+
+        def refresh_leaderboard():
+            # Reload leaderboard from file
+            if os.path.exists('results/benchmark_results.csv'):
+                state['leaderboard'] = pd.read_csv('results/benchmark_results.csv')
            return state['leaderboard']
 
         evaluate_btn.click(
             evaluate_batch,
-            inputs=[gemini_api_key, csv_file, prompt_col, models_input],
-            outputs=benchmark_output
+            inputs=[gemini_api_key, csv_file, prompt_col, model_input_method, models_input, answer_cols_input],
+            outputs=[current_results, leaderboard_table, gr.DataFrame(), current_results_file]
         )
+
+        download_btn.click(download_results, inputs=[], outputs=[current_results_file])
+        refresh_btn.click(refresh_leaderboard, inputs=[], outputs=[leaderboard_table])
+
+        # Initialize the leaderboard
+        leaderboard_table.value = state['leaderboard']
 
     return app
 
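
For reference, a minimal sketch of driving the updated `BenchmarkEvaluator` API from this commit outside the Gradio UI. The CSV path, model names, answer-column names, and the `GEMINI_API_KEY` environment variable are illustrative assumptions, not part of the commit.

```python
# A minimal sketch of using the evaluator programmatically.
# Assumptions (not from the commit): app.py only launches Gradio under a
# __main__ guard, "responses.csv" has a 'rus_prompt' column plus one answer
# column per model, and GEMINI_API_KEY is set in the environment.
import os

import pandas as pd

from app import BenchmarkEvaluator

df = pd.read_csv("responses.csv")  # hypothetical input file
evaluator = BenchmarkEvaluator(os.environ["GEMINI_API_KEY"])

# Auto-detect mode: every column ending in '_answers' is treated as a model.
benchmark_df, leaderboard_df = evaluator.evaluate_all_models(df, prompt_col="rus_prompt")

# New in this commit: map model names to explicitly named answer columns.
benchmark_df, leaderboard_df = evaluator.evaluate_all_models(
    df,
    models=["model_a", "model_b"],  # hypothetical model names
    model_columns=["model_a_responses", "model_b_responses"],
    prompt_col="rus_prompt",
)

# Leaderboard sorted by combined_score, highest first.
print(evaluator.get_leaderboard())
```

Note that after this change `evaluate_all_models` returns both the per-run benchmark table and the persistent leaderboard, and `evaluate_model` accepts an explicit `answer_col`, so answer columns no longer have to follow the `<model>_answers` naming convention.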