MrSimple01 committed
Commit e2b92e5 · verified · 1 Parent(s): 21711d5

Update app.py

Files changed (1)
  1. app.py +50 -50
app.py CHANGED
@@ -417,58 +417,58 @@ def create_gradio_interface():
 
     model_input_method.change(toggle_model_input, model_input_method, model_config_row)
 
-    def evaluate_batch(api_key, file, prompt_column, input_method, models_text, answer_cols_text):
-        try:
-            if not api_key:
-                return None, None, gr.DataFrame(), gr.File()
-
-            # Load the CSV file
-            file_path = file.name
-            df = pd.read_csv(file_path)
-
-            # Initialize evaluator
-            state['evaluator'] = BenchmarkEvaluator(api_key)
-
-            # Process model names and columns if provided
-            if input_method == "Specify models and columns":
-                if not models_text.strip() or not answer_cols_text.strip():
-                    return None, None, gr.DataFrame(), gr.File()
-
-                models = [m.strip() for m in models_text.split(',')]
-                answer_cols = [c.strip() for c in answer_cols_text.split(',')]
-
-                if len(models) != len(answer_cols):
-                    return None, None, gr.DataFrame(pd.DataFrame({'Error': ['Number of models and answer columns must match']})), gr.File()
-
-                results_df, leaderboard_df = state['evaluator'].evaluate_all_models(
-                    df, models=models, model_columns=answer_cols, prompt_col=prompt_column
-                )
-            else:
-                # Auto-detect mode
-                results_df, leaderboard_df = state['evaluator'].evaluate_all_models(
-                    df, prompt_col=prompt_column
-                )
-
-            timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
-            results_path = f'results/benchmark_results_{timestamp}.csv'
-            results_df.to_csv(results_path, index=False)
-
-            # Update state
-            state['last_results'] = results_df
-            state['leaderboard'] = leaderboard_df
-
-            return results_df, leaderboard_df, results_path, leaderboard_df
-        except Exception as e:
-            error_df = pd.DataFrame({'Error': [str(e)]})
-            return error_df, state['leaderboard'], gr.DataFrame(), gr.File()
-
-    def download_results():
-        if state['last_results'] is not None:
-            timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
-            file_path = f'results/benchmark_download_{timestamp}.csv'
-            state['last_results'].to_csv(file_path, index=False)
-            return file_path
-        return None
+    def evaluate_batch(api_key, file, prompt_column, input_method, models_text, answer_cols_text):
+        try:
+            if not api_key:
+                return None, None, None
+
+            # Load the CSV file
+            file_path = file.name
+            df = pd.read_csv(file_path)
+
+            # Initialize evaluator
+            state['evaluator'] = BenchmarkEvaluator(api_key)
+
+            # Process model names and columns if provided
+            if input_method == "Specify models and columns":
+                if not models_text.strip() or not answer_cols_text.strip():
+                    return None, None, None
+
+                models = [m.strip() for m in models_text.split(',')]
+                answer_cols = [c.strip() for c in answer_cols_text.split(',')]
+
+                if len(models) != len(answer_cols):
+                    return pd.DataFrame({'Error': ['Number of models and answer columns must match']}), state['leaderboard'], None
+
+                results_df, leaderboard_df = state['evaluator'].evaluate_all_models(
+                    df, models=models, model_columns=answer_cols, prompt_col=prompt_column
+                )
+            else:
+                # Auto-detect mode
+                results_df, leaderboard_df = state['evaluator'].evaluate_all_models(
+                    df, prompt_col=prompt_column
+                )
+
+            timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
+            results_path = f'results/benchmark_results_{timestamp}.csv'
+            results_df.to_csv(results_path, index=False)
+
+            # Update state
+            state['last_results'] = results_df
+            state['leaderboard'] = leaderboard_df
+
+            return results_df, leaderboard_df, results_path
+        except Exception as e:
+            error_df = pd.DataFrame({'Error': [str(e)]})
+            return error_df, state['leaderboard'], None
+
+    def download_results():
+        if state['last_results'] is not None:
+            timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
+            file_path = f'results/benchmark_download_{timestamp}.csv'
+            state['last_results'].to_csv(file_path, index=False)
+            return file_path
+        return None
 
     def refresh_leaderboard():
         # Reload leaderboard from file
@@ -479,7 +479,7 @@ def create_gradio_interface():
     evaluate_btn.click(
         evaluate_batch,
         inputs=[gemini_api_key, csv_file, prompt_col, model_input_method, models_input, answer_cols_input],
-        outputs=[current_results, leaderboard_table, gr.DataFrame(), current_results_file]
+        outputs=[current_results, leaderboard_table, current_results_file]
     )
 
     download_btn.click(download_results, inputs=[], outputs=[current_results_file])
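
With this change, evaluate_batch returns exactly three values (results dataframe, leaderboard dataframe, results file path), one per component in the trimmed outputs list, instead of four. Below is a minimal wiring sketch of that pattern, with a hypothetical stub in place of BenchmarkEvaluator and only the API-key/file/prompt-column inputs; the component names mirror the diff, everything else is illustrative rather than the project's code.

import os

import gradio as gr
import pandas as pd


def evaluate_batch_stub(api_key, file, prompt_column):
    # One return value per output component: results table, leaderboard table, file path.
    if not api_key or file is None:
        return None, None, None
    # gr.File may hand back a path string or a file-like object, depending on Gradio version.
    csv_path = file if isinstance(file, str) else file.name
    df = pd.read_csv(csv_path)
    results_df = df.head()  # stand-in for real per-model scores
    leaderboard_df = pd.DataFrame({'model': ['stub'], 'score': [0.0]})
    os.makedirs('results', exist_ok=True)  # the handler in the diff assumes results/ exists
    timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
    results_path = f'results/benchmark_results_{timestamp}.csv'
    results_df.to_csv(results_path, index=False)
    return results_df, leaderboard_df, results_path


with gr.Blocks() as demo:
    gemini_api_key = gr.Textbox(label="Gemini API key")
    csv_file = gr.File(label="CSV file")
    prompt_col = gr.Textbox(label="Prompt column", value="prompt")
    evaluate_btn = gr.Button("Evaluate")
    current_results = gr.DataFrame(label="Results")
    leaderboard_table = gr.DataFrame(label="Leaderboard")
    current_results_file = gr.File(label="Results file")

    # Three returned values map positionally onto three output components,
    # mirroring the move from four outputs to three in this commit.
    evaluate_btn.click(
        evaluate_batch_stub,
        inputs=[gemini_api_key, csv_file, prompt_col],
        outputs=[current_results, leaderboard_table, current_results_file],
    )

if __name__ == '__main__':
    demo.launch()

Returning the CSV path as the third value is enough for the gr.File output to expose it for download.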