MrSimple01 committed on
Commit fb27dda · verified · 1 Parent(s): db0eaac

Update app.py

Files changed (1)
  1. app.py +35 -148
app.py CHANGED
@@ -1,9 +1,7 @@
-import os
 import warnings
 import time
 from typing import Dict, Tuple, List
 from dataclasses import dataclass
-from pathlib import Path
 
 import numpy as np
 import pandas as pd
@@ -22,10 +20,6 @@ class EvaluationConfig:
     api_key: str
     model_name: str = "gemini-1.5-flash"
     batch_size: int = 5
-    retry_attempts: int = 5
-    min_wait: int = 4
-    max_wait: int = 60
-    score_scale: Tuple[int, int] = (0, 100)
 
 class EvaluationPrompts:
     @staticmethod
@@ -63,9 +57,7 @@ class EvaluationPrompts:
     Выведите оценки в точном формате:
     Креативность: [число]
     Разнообразие: [число]
-    Релевантность: [число]
-
-    Затем подробно объясните каждую оценку, используя примеры из ответа. Если какая-то оценка ниже 50, дайте конкретные рекомендации по улучшению."""
+    Релевантность: [число]"""
 
     @staticmethod
     def get_third_check(original_prompt: str, response: str) -> str:
@@ -226,22 +218,6 @@ class StabilityEvaluator:
             'individual_similarities': stability_coefficients
         }
 
-    def evaluate_dataset(self, df, prompt_col='rus_prompt'):
-        """Evaluate stability for multiple answer columns"""
-        results = {}
-
-        # Find columns ending with '_answers'
-        answer_columns = [col for col in df.columns if col.endswith('_answers')]
-
-        for column in answer_columns:
-            model_name = column.replace('_answers', '')
-            results[model_name] = self.calculate_similarity(
-                df[prompt_col].tolist(),
-                df[column].tolist()
-            )
-
-        return results
-
 
 class BenchmarkEvaluator:
     def __init__(self, gemini_api_key):
@@ -314,142 +290,53 @@ class BenchmarkEvaluator:
         return benchmark_df
 
 
-def evaluate_single_response(gemini_api_key, prompt, response, model_name="Test Model"):
-    """Evaluate a single response for the UI"""
-    # Create a temporary dataframe
-    df = pd.DataFrame({
-        'rus_prompt': [prompt],
-        f'{model_name}_answers': [response]
-    })
-
-    evaluator = BenchmarkEvaluator(gemini_api_key)
-
-    try:
-        result = evaluator.evaluate_model(df, model_name)
-
-        # Format the result for displaying in UI
-        output = {
-            'Creativity Score': f"{result['creative_details']['creativity']:.2f}",
-            'Diversity Score': f"{result['creative_details']['diversity']:.2f}",
-            'Relevance Score': f"{result['creative_details']['relevance']:.2f}",
-            'Average Creative Score': f"{result['creativity_score']:.2f}",
-            'Stability Score': f"{result['stability_score']:.2f}",
-            'Combined Score': f"{result['combined_score']:.2f}"
-        }
-
-        return output
-    except Exception as e:
-        return {
-            'Error': str(e)
-        }
-
-
-def evaluate_batch(api_key, file, prompt_column, models_text):
-    """Process batch evaluation from the UI"""
-    try:
-        # Load the CSV file
-        file_path = file.name
-        df = pd.read_csv(file_path)
-
-        # Process model names if provided
-        models = None
-        if models_text.strip():
-            models = [m.strip() for m in models_text.split(',')]
-
-        # Run the evaluation
-        evaluator = BenchmarkEvaluator(api_key)
-        results = evaluator.evaluate_all_models(df, models, prompt_column)
-
-        return results
-    except Exception as e:
-        return pd.DataFrame({'Error': [str(e)]})
-
-
 def create_gradio_interface():
-    """Create Gradio interface for evaluation app"""
     with gr.Blocks(title="Model Response Evaluator") as app:
         gr.Markdown("# Model Response Evaluator")
-        gr.Markdown("Evaluate model responses for creativity, diversity, relevance, and stability.")
+        gr.Markdown("Upload a CSV file with prompts and model responses to evaluate and benchmark models.")
 
-        with gr.Tab("Single Response Evaluation"):
-            with gr.Row():
-                gemini_api_key = gr.Textbox(label="Gemini API Key", type="password")
-
-            with gr.Row():
-                with gr.Column():
-                    prompt = gr.Textbox(label="Original Prompt", lines=3)
-                    response = gr.Textbox(label="Model Response", lines=6)
-                    model_name = gr.Textbox(label="Model Name", value="Test Model")
-
-                    evaluate_btn = gr.Button("Evaluate Response")
+        with gr.Row():
+            gemini_api_key = gr.Textbox(label="Gemini API Key", type="password")
 
-                with gr.Column():
-                    output = gr.JSON(label="Evaluation Results")
-
-            evaluate_btn.click(
-                evaluate_single_response,
-                inputs=[gemini_api_key, prompt, response, model_name],
-                outputs=output
-            )
+        with gr.Row():
+            csv_file = gr.File(label="Upload CSV with responses")
+            prompt_col = gr.Textbox(label="Prompt Column Name", value="rus_prompt")
+            models_input = gr.Textbox(label="Model names (comma-separated, leave blank for auto-detection)")
 
-        with gr.Tab("Batch Evaluation"):
-            with gr.Row():
-                gemini_api_key_batch = gr.Textbox(label="Gemini API Key", type="password")
-
-            with gr.Row():
-                csv_file = gr.File(label="Upload CSV with responses")
-                prompt_col = gr.Textbox(label="Prompt Column Name", value="rus_prompt")
-                models_input = gr.Textbox(label="Model names (comma-separated, leave blank for auto-detection)")
-
-            evaluate_batch_btn = gr.Button("Run Benchmark")
+        evaluate_btn = gr.Button("Run Benchmark")
+
+        with gr.Row():
             benchmark_output = gr.DataFrame(label="Benchmark Results")
-
-            evaluate_batch_btn.click(
-                evaluate_batch,
-                inputs=[gemini_api_key_batch, csv_file, prompt_col, models_input],
-                outputs=benchmark_output
-            )
 
-        # Add a new tab for configuration settings
-        with gr.Tab("Configuration"):
-            gr.Markdown("## Advanced Configuration")
-            gr.Markdown("Adjust evaluation parameters to customize the benchmarking process.")
-
-            with gr.Row():
-                batch_size = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Batch Size")
-                retry_attempts = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Retry Attempts")
-
-            with gr.Row():
-                min_wait = gr.Slider(minimum=1, maximum=30, value=4, step=1, label="Minimum Wait Time (seconds)")
-                max_wait = gr.Slider(minimum=10, maximum=300, value=60, step=10, label="Maximum Wait Time (seconds)")
-
-            with gr.Row():
-                gemini_model = gr.Dropdown(
-                    choices=["gemini-1.5-flash", "gemini-1.5-pro", "gemini-1.5-ultra"],
-                    value="gemini-1.5-flash",
-                    label="Gemini Model"
-                )
+        def evaluate_batch(api_key, file, prompt_column, models_text):
+            try:
+                # Load the CSV file
+                file_path = file.name
+                df = pd.read_csv(file_path)
 
-            gr.Markdown("Note: Changes to configuration settings will apply to new evaluations.")
-
-            def update_config(batch_size, retry_attempts, min_wait, max_wait, gemini_model):
-                # This function doesn't actually do anything in the demo but would update global config
-                return f"Configuration updated: batch_size={batch_size}, retry_attempts={retry_attempts}, min_wait={min_wait}, max_wait={max_wait}, model={gemini_model}"
-
-            update_config_btn = gr.Button("Update Configuration")
-            config_status = gr.Textbox(label="Status", interactive=False)
-
-            update_config_btn.click(
-                update_config,
-                inputs=[batch_size, retry_attempts, min_wait, max_wait, gemini_model],
-                outputs=config_status
-            )
-
+                # Process model names if provided
+                models = None
+                if models_text.strip():
+                    models = [m.strip() for m in models_text.split(',')]
+
+                # Run the evaluation
+                evaluator = BenchmarkEvaluator(api_key)
+                results = evaluator.evaluate_all_models(df, models, prompt_column)
+
+                return results
+            except Exception as e:
+                return pd.DataFrame({'Error': [str(e)]})
+
+        evaluate_btn.click(
+            evaluate_batch,
+            inputs=[gemini_api_key, csv_file, prompt_col, models_input],
+            outputs=benchmark_output
+        )
+
     return app
 
 
 def main():
-    """Main function to run the application"""
     app = create_gradio_interface()
     app.launch(share=True)
 
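For reference, a minimal sketch of the input the simplified "Run Benchmark" flow expects: a CSV with one prompt column (default `rus_prompt`) and one `<model>_answers` column per model, following the naming convention used elsewhere in app.py. The file name and model column names below are hypothetical.

```python
import pandas as pd

# Hypothetical CSV layout for the batch benchmark: one prompt column plus
# one "<model>_answers" column per model to evaluate.
df = pd.DataFrame({
    "rus_prompt": ["Напишите короткий рассказ о роботе."],  # prompt column (name configurable in the UI)
    "model_a_answers": ["Жил-был робот..."],                 # hypothetical model column
    "model_b_answers": ["Однажды робот решил..."],           # hypothetical model column
})
df.to_csv("responses.csv", index=False)

# The Gradio handler in the diff then loads this CSV and runs roughly:
#   evaluator = BenchmarkEvaluator(api_key)
#   results = evaluator.evaluate_all_models(df, models, prompt_column)
# with models=None when the model-names field is left blank (auto-detection, per the UI label).
```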