Commit 21711d5 (verified) by MrSimple01 · 1 parent: 050fdc5

Update app.py

Files changed (1)
  1. app.py +44 -44
app.py CHANGED
@@ -25,7 +25,7 @@ class EvaluationConfig:
 class EvaluationPrompts:
     @staticmethod
     def get_first_check(original_prompt: str, response: str) -> str:
-        return f"""Оцените следующий ответ по шкале от 0 до 100:
+        return f"""Оцените следующий ответ по шкале от 0 до 10:
 Оригинальный запрос: {original_prompt}
 Ответ: {response}
 Оцените по критериям:
@@ -48,7 +48,7 @@ class EvaluationPrompts:
 {response}
 
 ## **Инструкция по оценке**
-Оцените ответ по шкале от 0 до 100 по трем критериям:
+Оцените ответ по шкале от 0 до 10 по трем критериям:
 
 1. **Креативность** – Насколько ответ уникален и оригинален? Есть ли неожиданные, но уместные идеи?
 2. **Разнообразие** – Использует ли ответ различные стилистические приемы, примеры, аналогии, синонимы? Насколько он выразителен?
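
The two hunks above only change the judging scale in the prompt text from 0–100 to 0–10; the diff does not show how the judge's numeric reply is parsed afterwards. The snippet below is a minimal, hypothetical sketch of what a caller might do under the new range — the helper name parse_judge_score, the clamping, and the normalization to 0..1 are assumptions, not code from app.py.

import re

def parse_judge_score(reply: str, max_score: int = 10) -> float:
    """Hypothetical helper: extract the first number from a judge reply
    and normalize it to 0..1 under the new 0-10 scale (assumption)."""
    match = re.search(r"\d+(?:\.\d+)?", reply)
    if not match:
        return 0.0  # no number found; treat as the lowest score
    value = min(max(float(match.group()), 0.0), max_score)  # clamp to [0, max_score]
    return value / max_score

# Example reply under the new prompt wording
print(parse_judge_score("Оценка: 8 из 10"))  # -> 0.8
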
@@ -306,48 +306,48 @@ class BenchmarkEvaluator:
         self.results_history.append(results)
         return results, creative_df
 
-    def evaluate_all_models(self, df, models=None, model_columns=None, prompt_col='rus_prompt'):
-        """Evaluate multiple models from the dataframe"""
-        if models is not None and model_columns is not None:
-            model_mapping = dict(zip(models, model_columns))
-        elif models is not None:
-            model_mapping = {model: f"{model}_answers" for model in models}
-        else:
-            answer_cols = [col for col in df.columns if col.endswith('_answers')]
-            models = [col.replace('_answers', '') for col in answer_cols]
-            model_mapping = dict(zip(models, answer_cols))
-
-        results = []
-        detail_dfs = []
-
-        for model, column in model_mapping.items():
-            try:
-                model_results, detail_df = self.evaluate_model(df, model, prompt_col, column)
-                results.append(model_results)
-                detail_dfs.append(detail_df)
-                print(f"Completed evaluation for {model}")
-            except Exception as e:
-                print(f"Error evaluating {model}: {str(e)}")
-
-        # Create combined results DataFrame
-        benchmark_df = pd.DataFrame([{
-            'model': r['model'],
-            'creativity_score': r['creativity_score'],
-            'stability_score': r['stability_score'],
-            'combined_score': r['combined_score'],
-            'evaluation_timestamp': r['evaluation_timestamp']
-        } for r in results])
-
-        timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
-        benchmark_df.to_csv(f'results/benchmark_results_{timestamp}.csv', index=False)
-        print(f"Benchmark completed. Results saved to results/benchmark_results_{timestamp}.csv")
-
-        if detail_dfs:
-            combined_details = pd.concat(detail_dfs)
-            combined_details.to_csv(f'results/detailed_evaluation_{timestamp}.csv', index=False)
-            print(f"Detailed evaluation saved to results/detailed_evaluation_{timestamp}.csv")
-
-        return benchmark_df, self.leaderboard_df
+    def evaluate_all_models(self, df, models=None, model_columns=None, prompt_col='rus_prompt'):
+        """Evaluate multiple models from the dataframe"""
+        if models is not None and model_columns is not None:
+            model_mapping = dict(zip(models, model_columns))
+        elif models is not None:
+            model_mapping = {model: f"{model}_answers" for model in models}
+        else:
+            answer_cols = [col for col in df.columns if col.endswith('_answers')]
+            models = [col.replace('_answers', '') for col in answer_cols]
+            model_mapping = dict(zip(models, answer_cols))
+
+        results = []
+        detail_dfs = []
+
+        for model, column in model_mapping.items():
+            try:
+                model_results, detail_df = self.evaluate_model(df, model, prompt_col, column)
+                results.append(model_results)
+                detail_dfs.append(detail_df)
+                print(f"Completed evaluation for {model}")
+            except Exception as e:
+                print(f"Error evaluating {model}: {str(e)}")
+
+        # Create combined results DataFrame
+        benchmark_df = pd.DataFrame([{
+            'model': r['model'],
+            'creativity_score': r['creativity_score'],
+            'stability_score': r['stability_score'],
+            'combined_score': r['combined_score'],
+            'evaluation_timestamp': r['evaluation_timestamp']
+        } for r in results])
+
+        timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
+        benchmark_df.to_csv(f'results/benchmark_results_{timestamp}.csv', index=False)
+        print(f"Benchmark completed. Results saved to results/benchmark_results_{timestamp}.csv")
+
+        if detail_dfs:
+            combined_details = pd.concat(detail_dfs)
+            combined_details.to_csv(f'results/detailed_evaluation_{timestamp}.csv', index=False)
+            print(f"Detailed evaluation saved to results/detailed_evaluation_{timestamp}.csv")
+
+        return benchmark_df, self.leaderboard_df
 
     def get_leaderboard(self):
         """Return the current leaderboard"""