MrSimple01 committed on
Commit abb648a · verified · 1 Parent(s): 5465a38

Update app.py

Files changed (1)
  1. app.py +175 -473
app.py CHANGED
@@ -1,500 +1,202 @@
- import warnings
- import time
  import os
- from typing import Dict, Tuple, List
- from dataclasses import dataclass
-
- import numpy as np
  import pandas as pd
- from tqdm.auto import tqdm
- import google.generativeai as genai
- from tenacity import retry, stop_after_attempt, wait_exponential
- from sentence_transformers import SentenceTransformer
- from sklearn.metrics.pairwise import cosine_similarity
  import gradio as gr
-
- # Suppress warnings
- warnings.filterwarnings("ignore")
-
- @dataclass
- class EvaluationConfig:
-     api_key: str
-     model_name: str = "gemini-1.5-flash"
-     batch_size: int = 5

- class EvaluationPrompts:
-     @staticmethod
-     def get_first_check(original_prompt: str, response: str) -> str:
-         return f"""Оцените следующий ответ по шкале от 0 до 10:
- Оригинальный запрос: {original_prompt}
- Ответ: {response}
- Оцените по критериям:
- 1. Креативность (уникальность и оригинальность ответа)
- 2. Разнообразие (использование разных языковых средств)
- 3. Релевантность (соответствие запросу)
- Дайте только числовые оценки в формате:
- Креативность: [число]
- Разнообразие: [число]
- Релевантность: [число]"""
-
-     @staticmethod
-     def get_second_check(original_prompt: str, response: str) -> str:
-         return f"""Вы — эксперт по оценке качества текстов, обладающий глубокими знаниями в области лингвистики, креативного письма и искусственного интеллекта. Ваша задача — объективно оценить представленный ответ по следующим критериям.
-
- ### **Оригинальный запрос:**
- {original_prompt}
-
- ### **Ответ:**
- {response}
-
- ## **Инструкция по оценке**
- Оцените ответ по шкале от 0 до 10 по трем критериям:
-
- 1. **Креативность** – Насколько ответ уникален и оригинален? Есть ли неожиданные, но уместные идеи?
- 2. **Разнообразие** – Использует ли ответ различные стилистические приемы, примеры, аналогии, синонимы? Насколько он выразителен?
- 3. **Релевантность** – Насколько ответ соответствует запросу? Полностью ли он отвечает на поставленный вопрос?
-
- ### **Формат ответа:**
- Выведите оценки в точном формате:
- Креативность: [число]
- Разнообразие: [число]
- Релевантность: [число]"""
-
-     @staticmethod
-     def get_third_check(original_prompt: str, response: str) -> str:
-         return f"""Вы — эксперт по анализу текстов. Ваша задача — оценить ответ на запрос по шкале от 0 до 100 по трем критериям.
-
- ### **Оригинальный запрос:**
- {original_prompt}
-
- ### **Ответ:**
- {response}
-
- ## **Критерии оценки:**
- 1. **Креативность** – Насколько ответ уникален и оригинален? Используются ли необычные идеи и неожиданные подходы?
- 2. **Разнообразие** – Применяются ли разные языковые конструкции, примеры, аналогии, синонимы?
- 3. **Релевантность** – Насколько ответ соответствует запросу? Полностью ли он отвечает на поставленный вопрос?
-
- Выведите оценки в точном формате:
- Креативность: [число]
- Разнообразие: [число]
- Релевантность: [число]"""
-
-
- class ResponseEvaluator:
-     def __init__(self, config: EvaluationConfig):
-         """Initialize the evaluator with given configuration"""
-         self.config = config
-         self.model = self._setup_model()
-
-     def _setup_model(self) -> genai.GenerativeModel:
-         """Set up the Gemini model"""
-         genai.configure(api_key=self.config.api_key)
-         return genai.GenerativeModel(self.config.model_name)

-     @retry(
-         stop=stop_after_attempt(5),
-         wait=wait_exponential(multiplier=1, min=4, max=60)
-     )
-     def evaluate_single_response(self, original_prompt: str, response: str) -> Tuple[Dict[str, float], str]:
-         """Evaluate a single response using the configured model"""
-         evaluation_prompts = self._create_evaluation_prompt(original_prompt, response)
-         all_scores = []
-         all_texts = []
-
-         for prompt in evaluation_prompts:
-             try:
-                 evaluation = self.model.generate_content(prompt)
-                 scores = self._parse_evaluation_scores(evaluation.text)
-                 all_scores.append(scores)
-                 all_texts.append(evaluation.text)
-             except Exception as e:
-                 print(f"Error with prompt: {str(e)}")
-                 all_scores.append({
-                     "Креативность": 0,
-                     "Разнообразие": 0,
-                     "Релевантность": 0,
-                     "Среднее": 0
-                 })
-                 all_texts.append("Error in evaluation")
-
-         final_scores = {
-             "Креативность": np.mean([s.get("Креативность", 0) for s in all_scores]),
-             "Разнообразие": np.mean([s.get("Разнообразие", 0) for s in all_scores]),
-             "Релевантность": np.mean([s.get("Релевантность", 0) for s in all_scores])
-         }
-         final_scores["Среднее"] = np.mean(list(final_scores.values()))
-
-         return final_scores, "\n\n".join(all_texts)

-     def _create_evaluation_prompt(self, original_prompt: str, response: str) -> List[str]:
-         """Create multiple evaluation prompts"""
-         prompts = []
-         prompts.append(EvaluationPrompts.get_first_check(original_prompt, response))
-         prompts.append(EvaluationPrompts.get_second_check(original_prompt, response))
-         prompts.append(EvaluationPrompts.get_third_check(original_prompt, response))
-         return prompts
-
-     def _parse_evaluation_scores(self, evaluation_text: str) -> Dict[str, float]:
-         """Parse evaluation text into scores dictionary"""
-         scores = {}
-         for line in evaluation_text.strip().split('\n'):
-             if ':' in line:
-                 parts = line.split(':')
-                 if len(parts) >= 2:
-                     metric, score_text = parts[0], ':'.join(parts[1:])
-                     try:
-                         score_text = score_text.strip()
-                         score = float(''.join(c for c in score_text if c.isdigit() or c == '.'))
-                         scores[metric.strip()] = score
-                     except ValueError:
-                         continue
-
-         if scores:
-             scores['Среднее'] = np.mean([v for k, v in scores.items() if k != 'Среднее'])
-
-         return scores

-     def evaluate_dataset(self, df: pd.DataFrame, prompt_col: str, answer_col: str) -> pd.DataFrame:
-         """Evaluate all responses in the dataset"""
-         evaluations = []
-         eval_answers = []
-
-         total_batches = (len(df) + self.config.batch_size - 1) // self.config.batch_size
-
-         for i in range(0, len(df), self.config.batch_size):
-             batch = df.iloc[i:i+self.config.batch_size]
-
-             with tqdm(batch.iterrows(), total=len(batch),
-                       desc=f"Batch {i//self.config.batch_size + 1}/{total_batches}") as pbar:
-                 for _, row in pbar:
-                     try:
-                         scores, eval_text = self.evaluate_single_response(
-                             str(row[prompt_col]),
-                             str(row[answer_col])
-                         )
-                         evaluations.append(scores)
-                         eval_answers.append(eval_text)
-                     except Exception as e:
-                         print(f"Error processing row {_}: {str(e)}")
-                         evaluations.append({
-                             "Креативность": 0,
-                             "Разнообразие": 0,
-                             "Релевантность": 0,
-                             "Среднее": 0
-                         })
-                         eval_answers.append("Error in evaluation")
-
-                     time.sleep(2)
-
-             time.sleep(10)
-
-         return self._create_evaluation_dataframe(df, evaluations, eval_answers)

-     def _create_evaluation_dataframe(self,
-                                      original_df: pd.DataFrame,
-                                      evaluations: List[Dict],
-                                      eval_answers: List[str]) -> pd.DataFrame:
-         score_df = pd.DataFrame(evaluations)
-         df = original_df.copy()
-         df['gemini_eval_answer'] = eval_answers
-         return pd.concat([df, score_df], axis=1)
-
-
- class StabilityEvaluator:
-     def __init__(self, model_name='paraphrase-MiniLM-L6-v2'):
-         self.model = SentenceTransformer(model_name)
-
-     def calculate_similarity(self, prompts, outputs):
-         prompt_embeddings = self.model.encode(prompts)
-         output_embeddings = self.model.encode(outputs)
-
-         similarities = cosine_similarity(prompt_embeddings, output_embeddings)
-
-         stability_coefficients = np.diag(similarities)
-
-         return {
-             'stability_score': np.mean(stability_coefficients) * 100,  # Scale to 0-100
-             'stability_std': np.std(stability_coefficients) * 100,
-             'individual_similarities': stability_coefficients
-         }
-
-
- class BenchmarkEvaluator:
-     def __init__(self, gemini_api_key):
-         """Initialize both evaluators"""
-         self.creative_evaluator = ResponseEvaluator(
-             EvaluationConfig(api_key=gemini_api_key)
-         )
-         self.stability_evaluator = StabilityEvaluator()
-         self.results_history = []
-
-         # Create results directory if it doesn't exist
-         os.makedirs('results', exist_ok=True)
-
-         # Load previous benchmark results if available
-         self.benchmark_file = 'results/benchmark_results.csv'
-         if os.path.exists(self.benchmark_file):
-             try:
-                 self.leaderboard_df = pd.read_csv(self.benchmark_file)
-             except:
-                 self.leaderboard_df = pd.DataFrame(columns=[
-                     'model', 'creativity_score', 'stability_score',
-                     'combined_score', 'evaluation_timestamp'
-                 ])
-         else:
-             self.leaderboard_df = pd.DataFrame(columns=[
                  'model', 'creativity_score', 'stability_score',
                  'combined_score', 'evaluation_timestamp'
              ])
-
-     def evaluate_model(self, df, model_name, prompt_col='rus_prompt', answer_col=None):
-         """Evaluate a single model's responses"""
-         # Use direct answer column if provided, otherwise derive from model name
-         if answer_col is None:
-             answer_col = f"{model_name}_answers"
-
-         if answer_col not in df.columns:
-             raise ValueError(f"Column {answer_col} not found in dataframe")
-
-         print(f"Evaluating creativity for {model_name}...")
-         creative_df = self.creative_evaluator.evaluate_dataset(df, prompt_col, answer_col)
-
-         print(f"Evaluating stability for {model_name}...")
-         stability_results = self.stability_evaluator.calculate_similarity(
-             df[prompt_col].tolist(),
-             df[answer_col].tolist()
-         )
-
-         creative_score = creative_df["Среднее"].mean()
-         stability_score = stability_results['stability_score']
-         combined_score = (creative_score + stability_score) / 2
-
-         # Add timestamp
-         timestamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
-
-         results = {
-             'model': model_name,
-             'creativity_score': creative_score,
-             'stability_score': stability_score,
-             'combined_score': combined_score,
-             'evaluation_timestamp': timestamp,
-             'creative_details': {
-                 'creativity': creative_df["Креативность"].mean(),
-                 'diversity': creative_df["Разнообразие"].mean(),
-                 'relevance': creative_df["Релевантность"].mean(),
-             },
-             'stability_details': stability_results
-         }
-
-         # Save detailed results
-         output_file = f'results/evaluated_responses_{model_name}_{timestamp.replace(":", "-").replace(" ", "_")}.csv'
-         creative_df.to_csv(output_file, index=False)
-         print(f"Detailed results saved to {output_file}")
-
-         # Update leaderboard
-         result_row = {
-             'model': model_name,
-             'creativity_score': creative_score,
-             'stability_score': stability_score,
-             'combined_score': combined_score,
-             'evaluation_timestamp': timestamp
-         }
-         self.leaderboard_df = pd.concat([self.leaderboard_df, pd.DataFrame([result_row])], ignore_index=True)
-         self.leaderboard_df.to_csv(self.benchmark_file, index=False)
-
-         self.results_history.append(results)
-         return results, creative_df
-
-     def evaluate_all_models(self, df, models=None, model_columns=None, prompt_col='rus_prompt'):
-         """Evaluate multiple models from the dataframe"""
-         if models is not None and model_columns is not None:
-             model_mapping = dict(zip(models, model_columns))
-         elif models is not None:
-             model_mapping = {model: f"{model}_answers" for model in models}
-         else:
-             answer_cols = [col for col in df.columns if col.endswith('_answers')]
-             models = [col.replace('_answers', '') for col in answer_cols]
-             model_mapping = dict(zip(models, answer_cols))
-
-         results = []
-         detail_dfs = []
-
-         for model, column in model_mapping.items():
-             try:
-                 model_results, detail_df = self.evaluate_model(df, model, prompt_col, column)
-                 results.append(model_results)
-                 detail_dfs.append(detail_df)
-                 print(f"Completed evaluation for {model}")
-             except Exception as e:
-                 print(f"Error evaluating {model}: {str(e)}")
-
-         # Create combined results DataFrame
-         benchmark_df = pd.DataFrame([{
-             'model': r['model'],
-             'creativity_score': r['creativity_score'],
-             'stability_score': r['stability_score'],
-             'combined_score': r['combined_score'],
-             'evaluation_timestamp': r['evaluation_timestamp']
-         } for r in results])
-
-         timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
-         benchmark_df.to_csv(f'results/benchmark_results_{timestamp}.csv', index=False)
-         print(f"Benchmark completed. Results saved to results/benchmark_results_{timestamp}.csv")
-
-         if detail_dfs:
-             combined_details = pd.concat(detail_dfs)
-             combined_details.to_csv(f'results/detailed_evaluation_{timestamp}.csv', index=False)
-             print(f"Detailed evaluation saved to results/detailed_evaluation_{timestamp}.csv")
-
-         return benchmark_df, self.leaderboard_df
-
-     def get_leaderboard(self):
-         """Return the current leaderboard"""
-         if self.leaderboard_df.empty:
-             return pd.DataFrame(columns=['model', 'creativity_score', 'stability_score', 'combined_score', 'evaluation_timestamp'])
-
-         # Sort by combined score (descending)
-         sorted_df = self.leaderboard_df.sort_values(by='combined_score', ascending=False)
-         return sorted_df
-

- def create_gradio_interface():
-     os.makedirs('results', exist_ok=True)

-     state = {
-         'evaluator': None,
-         'last_results': None,
-         'leaderboard': None
-     }
-
-     # Load existing leaderboard if available
-     leaderboard_path = 'results/benchmark_results.csv'
-     if os.path.exists(leaderboard_path):
          try:
-             state['leaderboard'] = pd.read_csv(leaderboard_path)
-         except:
-             state['leaderboard'] = pd.DataFrame(columns=['model', 'creativity_score', 'stability_score', 'combined_score', 'evaluation_timestamp'])
-     else:
-         state['leaderboard'] = pd.DataFrame(columns=['model', 'creativity_score', 'stability_score', 'combined_score', 'evaluation_timestamp'])
-
-     with gr.Blocks(title="Model Response Evaluator") as app:
-         gr.Markdown("# Model Response Evaluator")
-         gr.Markdown("Upload a CSV file with prompts and model responses to evaluate and benchmark models.")
-
-         with gr.Row():
-             gemini_api_key = gr.Textbox(label="Gemini API Key", type="password")
-
-         with gr.Row():
-             csv_file = gr.File(label="Upload CSV with responses")
-             prompt_col = gr.Textbox(label="Prompt Column Name", value="rus_prompt")
-
-         with gr.Row():
-             model_input_method = gr.Radio(
-                 choices=["Auto-detect from columns", "Specify models and columns"],
-                 label="Model Input Method",
-                 value="Auto-detect from columns"
-             )
-
-         with gr.Row(visible=False) as model_config_row:
-             models_input = gr.Textbox(label="Model names (comma-separated)")
-             answer_cols_input = gr.Textbox(label="Answer column names (comma-separated, matching model order)")
-
-         evaluate_btn = gr.Button("Run Benchmark")
-
-         with gr.Tabs():
-             with gr.Tab("Current Results"):
-                 current_results = gr.DataFrame(label="Current Benchmark Results")
-                 download_btn = gr.Button("Download Results CSV")
-                 current_results_file = gr.File(label="Download Results")

-             with gr.Tab("Leaderboard"):
-                 leaderboard_table = gr.DataFrame(value=state['leaderboard'], label="Model Leaderboard")
-                 refresh_btn = gr.Button("Refresh Leaderboard")
-
-         def toggle_model_input(choice):
-             return gr.Row(visible=(choice == "Specify models and columns"))
-
-         model_input_method.change(toggle_model_input, model_input_method, model_config_row)
-
-         def evaluate_batch(api_key, file, prompt_column, input_method, models_text, answer_cols_text):
-             try:
-                 if not api_key:
-                     return None, None, None

-                 # Load the CSV file
-                 file_path = file.name
-                 df = pd.read_csv(file_path)

-                 # Initialize evaluator
-                 state['evaluator'] = BenchmarkEvaluator(api_key)

-                 # Process model names and columns if provided
-                 if input_method == "Specify models and columns":
-                     if not models_text.strip() or not answer_cols_text.strip():
-                         return None, None, None
-
-                     models = [m.strip() for m in models_text.split(',')]
-                     answer_cols = [c.strip() for c in answer_cols_text.split(',')]
-
-                     if len(models) != len(answer_cols):
-                         return pd.DataFrame({'Error': ['Number of models and answer columns must match']}), state['leaderboard'], None
-
-                     results_df, leaderboard_df = state['evaluator'].evaluate_all_models(
-                         df, models=models, model_columns=answer_cols, prompt_col=prompt_column
-                     )
-                 else:
-                     # Auto-detect mode
-                     results_df, leaderboard_df = state['evaluator'].evaluate_all_models(
-                         df, prompt_col=prompt_column
-                     )

-                 timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
-                 results_path = f'results/benchmark_results_{timestamp}.csv'
-                 results_df.to_csv(results_path, index=False)

-                 # Update state
-                 state['last_results'] = results_df
-                 state['leaderboard'] = leaderboard_df

-                 return results_df, leaderboard_df, results_path
              except Exception as e:
-                 error_df = pd.DataFrame({'Error': [str(e)]})
-                 return error_df, state['leaderboard'], None

-         def download_results():
-             if state['last_results'] is not None:
-                 timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
-                 file_path = f'results/benchmark_download_{timestamp}.csv'
-                 state['last_results'].to_csv(file_path, index=False)
-                 return file_path
-             return None
-
-         def refresh_leaderboard():
-             # Reload leaderboard from file
-             if os.path.exists('results/benchmark_results.csv'):
-                 state['leaderboard'] = pd.read_csv('results/benchmark_results.csv')
-             return state['leaderboard']
-
-         evaluate_btn.click(
-             evaluate_batch,
-             inputs=[gemini_api_key, csv_file, prompt_col, model_input_method, models_input, answer_cols_input],
-             outputs=[current_results, leaderboard_table, current_results_file]
-         )
-
-         download_btn.click(download_results, inputs=[], outputs=[current_results_file])
-         refresh_btn.click(refresh_leaderboard, inputs=[], outputs=[leaderboard_table])

-     # Initialize the leaderboard
-     leaderboard_table.value = state['leaderboard']
-
      return app

-
- def main():
-     app = create_gradio_interface()
-     app.launch(share=True)
-
-
  if __name__ == "__main__":
-     main()
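For reference, the removed ResponseEvaluator expects Gemini to reply in the exact three-line format requested by the prompts above ("Креативность: [число]", "Разнообразие: [число]", "Релевантность: [число]"). A minimal standalone sketch of that parsing step, mirroring the logic of the removed _parse_evaluation_scores; the sample reply below is hypothetical, not real model output:

import numpy as np

def parse_evaluation_scores(evaluation_text: str) -> dict:
    # Mirrors the removed _parse_evaluation_scores: split each line on the first ':',
    # keep only digits and '.' on the right-hand side, then add the mean under 'Среднее'.
    scores = {}
    for line in evaluation_text.strip().split('\n'):
        if ':' not in line:
            continue
        metric, _, score_text = line.partition(':')
        try:
            scores[metric.strip()] = float(''.join(c for c in score_text if c.isdigit() or c == '.'))
        except ValueError:
            continue
    if scores:
        scores['Среднее'] = float(np.mean([v for k, v in scores.items() if k != 'Среднее']))
    return scores

sample_reply = "Креативность: 8\nРазнообразие: 7\nРелевантность: 9"  # hypothetical reply
print(parse_evaluation_scores(sample_reply))
# {'Креативность': 8.0, 'Разнообразие': 7.0, 'Релевантность': 9.0, 'Среднее': 8.0}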
 
 
 
 
  import os
+ import time
  import pandas as pd
+ import numpy as np
  import gradio as gr
+ from typing import Dict, List, Optional
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+
+ # Import functions from our modules
+ from evaluate_creativity import evaluate_creativity
+ from evaluate_stability import (
+     evaluate_stability,
+     evaluate_combined_score,
+     create_radar_chart,
+     create_bar_chart,
+     get_leaderboard_data
+ )
+
+ def list_available_models(csv_file):
+     try:
+         df = pd.read_csv(csv_file)
+         model_columns = [col for col in df.columns if col.endswith('_answers')]
+         models = [col.replace('_answers', '') for col in model_columns]
+         return models
+     except Exception as e:
+         print(f"Error listing models: {str(e)}")
+         return []
+
+ def evaluate_models(file_path, api_key, prompt_col, selected_models=None, progress=gr.Progress()):
+     os.makedirs('results', exist_ok=True)

+     progress(0, desc="Loading data...")
+     df = pd.read_csv(file_path)

+     # Determine which models to evaluate
+     if selected_models:
+         answer_cols = [f"{model}_answers" for model in selected_models]
+         models = selected_models
+     else:
+         answer_cols = [col for col in df.columns if col.endswith('_answers')]
+         models = [col.replace('_answers', '') for col in answer_cols]

+     model_mapping = dict(zip(models, answer_cols))

+     progress(0.1, desc=f"Found {len(model_mapping)} models to evaluate")

+     all_results = {}
+     all_creativity_dfs = {}
+
+     benchmark_file = 'results/benchmark_results.csv'
+     if os.path.exists(benchmark_file):
+         try:
+             benchmark_df = pd.read_csv(benchmark_file)
+         except:
+             benchmark_df = pd.DataFrame(columns=[
                  'model', 'creativity_score', 'stability_score',
                  'combined_score', 'evaluation_timestamp'
              ])
+     else:
+         benchmark_df = pd.DataFrame(columns=[
+             'model', 'creativity_score', 'stability_score',
+             'combined_score', 'evaluation_timestamp'
+         ])

+     progress_increment = 0.9 / len(model_mapping)
+     progress_current = 0.1

+     for model, column in model_mapping.items():
          try:
+             progress(progress_current, desc=f"Evaluating {model}...")

+             # Evaluate creativity
+             creativity_df = evaluate_creativity(api_key, df, prompt_col, column, batch_size=1, progress=progress)
+             progress_current += progress_increment * 0.6
+             progress(progress_current, desc=f"Evaluating stability for {model}...")

+             # Evaluate stability
+             stability_results = evaluate_stability(df, prompt_col, column, progress=progress)
+             progress_current += progress_increment * 0.3
+             progress(progress_current, desc=f"Calculating combined score for {model}...")

+             # Calculate combined score
+             combined_results = evaluate_combined_score(creativity_df, stability_results, model)

+             # Save detailed results
+             timestamp = pd.Timestamp.now().strftime('%Y-%m-%d_%H-%M-%S')
+             output_file = f'results/evaluated_responses_{model}_{timestamp}.csv'
+             creativity_df.to_csv(output_file, index=False)
+
+             # Add to benchmark DataFrame
+             result_row = {
+                 'model': model,
+                 'creativity_score': combined_results['creativity_score'],
+                 'stability_score': combined_results['stability_score'],
+                 'combined_score': combined_results['combined_score'],
+                 'evaluation_timestamp': combined_results['evaluation_timestamp']
+             }
+             benchmark_df = pd.concat([benchmark_df, pd.DataFrame([result_row])], ignore_index=True)

+             all_results[model] = combined_results
+             all_creativity_dfs[model] = creativity_df

+             progress_current += progress_increment * 0.1
+             progress(progress_current, desc=f"Finished evaluating {model}")

          except Exception as e:
+             print(f"Error evaluating {model}: {str(e)}")
+
+     # Save benchmark results
+     benchmark_df.to_csv(benchmark_file, index=False)
+     timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
+     combined_benchmark_path = f'results/benchmark_results_{timestamp}.csv'
+     benchmark_df.to_csv(combined_benchmark_path, index=False)
+
+     # Create visualizations
+     progress(0.95, desc="Creating visualizations...")
+     radar_chart_path = create_radar_chart(all_results)
+     bar_chart_path = create_bar_chart(all_results)
+
+     progress(1.0, desc="Evaluation complete!")
+
+     # Sort results by combined score
+     sorted_results = benchmark_df.sort_values(by='combined_score', ascending=False)
+
+     return sorted_results, radar_chart_path, bar_chart_path, combined_benchmark_path
+
+ def create_gradio_interface():
+     with gr.Blocks(title="LLM Evaluation Tool") as app:
+         gr.Markdown("# LLM Evaluation Tool")
+         gr.Markdown("Оцените модели на креативность, разнообразие, релевантность и стабильность")
+
+         with gr.Tab("Evaluate Models"):
+             with gr.Row():
+                 with gr.Column():
+                     file_input = gr.File(label="Upload CSV with prompts and responses")
+                     api_key_input = gr.Textbox(label="Gemini API Key", type="password")
+                     prompt_col_input = gr.Textbox(label="Prompt Column Name", value="rus_prompt")
+
+                     model_selection = gr.CheckboxGroup(
+                         label="Select Models to Evaluate (leave empty to evaluate all)",
+                         choices=[],
+                         interactive=True
+                     )
+
+                     refresh_button = gr.Button("Refresh Model List")
+
+                     @refresh_button.click(inputs=[file_input], outputs=[model_selection])
+                     def update_model_list(file):
+                         if file:
+                             models = list_available_models(file.name)
+                             return gr.CheckboxGroup(choices=models)
+                         return gr.CheckboxGroup(choices=[])
+
+                     evaluate_button = gr.Button("Evaluate Models", variant="primary")

+             with gr.Row():
+                 result_table = gr.Dataframe(label="Evaluation Results")
+
+             with gr.Row():
+                 with gr.Column():
+                     radar_chart = gr.Image(label="Radar Chart")
+
+                 with gr.Column():
+                     bar_chart = gr.Image(label="Bar Chart")
+
+             result_file = gr.File(label="Download Complete Results")
+
+             evaluate_button.click(
+                 fn=evaluate_models,
+                 inputs=[file_input, api_key_input, prompt_col_input, model_selection],
+                 outputs=[result_table, radar_chart, bar_chart, result_file]
+             )

+         with gr.Tab("Leaderboard"):
+             with gr.Row():
+                 leaderboard_table = gr.Dataframe(
+                     label="Model Leaderboard",
+                     headers=["Model", "Креативность", "Стабильность", "Общий балл"]
+                 )
+
+             refresh_leaderboard = gr.Button("Refresh Leaderboard")
+
+             @refresh_leaderboard.click(outputs=[leaderboard_table])
+             def update_leaderboard():
+                 return get_leaderboard_data()
+
+             with gr.Row():
+                 gr.Markdown("### Leaderboard Details")
+                 gr.Markdown("""
+                 - **Креативность**: Оригинальность и инновационность ответов
+                 - **Разнообразие**: Использование различных языковых средств и стилистических приемов
+                 - **Релевантность**: Соответствие ответа исходному запросу
+                 - **Стабильность**: Насколько хорошо модель сохраняет смысл и контекст запроса
+                 - **Общий балл**: Среднее значение всех показателей
+                 """)
+
      return app

  if __name__ == "__main__":
+     app = create_gradio_interface()
+     app.launch()
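The new app.py depends on two helper modules, evaluate_creativity.py and evaluate_stability.py, that are not part of this commit. A minimal sketch of the interface the call sites above assume; the function names come from the imports, but the signatures and return shapes are inferred from usage and may differ from the actual modules:

# Hypothetical stubs only; the real helper modules in the repository may differ.

def evaluate_creativity(api_key, df, prompt_col, answer_col, batch_size=1, progress=None):
    """Return a DataFrame of per-response evaluation results; app.py saves it to CSV."""
    ...

def evaluate_stability(df, prompt_col, answer_col, progress=None):
    """Return stability results for the given answers, consumed by evaluate_combined_score."""
    ...

def evaluate_combined_score(creativity_df, stability_results, model):
    """Return a dict with 'creativity_score', 'stability_score', 'combined_score' and
    'evaluation_timestamp', which app.py appends to results/benchmark_results.csv."""
    ...

def create_radar_chart(all_results):
    """Build a radar chart comparing models and return the image path shown in the UI."""
    ...

def create_bar_chart(all_results):
    """Build a bar chart of model scores and return the image path shown in the UI."""
    ...

def get_leaderboard_data():
    """Return the data displayed in the Leaderboard tab."""
    ...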