Update app.py
app.py CHANGED
@@ -1,9 +1,7 @@
-import os
 import warnings
 import time
 from typing import Dict, Tuple, List
 from dataclasses import dataclass
-from pathlib import Path
 
 import numpy as np
 import pandas as pd
@@ -22,10 +20,6 @@ class EvaluationConfig:
     api_key: str
     model_name: str = "gemini-1.5-flash"
     batch_size: int = 5
-    retry_attempts: int = 5
-    min_wait: int = 4
-    max_wait: int = 60
-    score_scale: Tuple[int, int] = (0, 100)
 
 class EvaluationPrompts:
     @staticmethod
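After this hunk, EvaluationConfig keeps only the API key, model name, and batch size; the retry/backoff and score-scale fields are gone. A minimal usage sketch, assuming the dataclass exactly as shown above (the placeholder key is illustrative):

from dataclasses import dataclass

@dataclass
class EvaluationConfig:
    api_key: str
    model_name: str = "gemini-1.5-flash"
    batch_size: int = 5

# Only the API key is required; retry behaviour is no longer configured here.
config = EvaluationConfig(api_key="YOUR_GEMINI_API_KEY")  # placeholder key
print(config.model_name, config.batch_size)  # -> gemini-1.5-flash 5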
@@ -63,9 +57,7 @@ class EvaluationPrompts:
    Выведите оценки в точном формате:
    Креативность: [число]
    Разнообразие: [число]
-    Релевантность: [число]
-
-    Затем подробно объясните каждую оценку, используя примеры из ответа. Если какая-то оценка ниже 50, дайте конкретные рекомендации по улучшению."""
+    Релевантность: [число]"""
 
     @staticmethod
     def get_third_check(original_prompt: str, response: str) -> str:
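The evaluation prompt now ends right after the three score lines instead of also asking for a free-form explanation. A hypothetical parsing sketch for that "Метка: [число]" format; this helper is not taken from app.py and is shown only to illustrate what the stricter output format allows:

import re

def parse_scores(reply: str) -> dict:
    # Maps the Russian labels requested by the prompt to English keys.
    labels = {"Креативность": "creativity", "Разнообразие": "diversity", "Релевантность": "relevance"}
    scores = {}
    for ru, en in labels.items():
        match = re.search(rf"{ru}\s*:\s*(\d+(?:\.\d+)?)", reply)
        if match:
            scores[en] = float(match.group(1))
    return scores

print(parse_scores("Креативность: 72\nРазнообразие: 65\nРелевантность: 90"))
# -> {'creativity': 72.0, 'diversity': 65.0, 'relevance': 90.0}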
@@ -226,22 +218,6 @@ class StabilityEvaluator:
             'individual_similarities': stability_coefficients
         }
 
-    def evaluate_dataset(self, df, prompt_col='rus_prompt'):
-        """Evaluate stability for multiple answer columns"""
-        results = {}
-
-        # Find columns ending with '_answers'
-        answer_columns = [col for col in df.columns if col.endswith('_answers')]
-
-        for column in answer_columns:
-            model_name = column.replace('_answers', '')
-            results[model_name] = self.calculate_similarity(
-                df[prompt_col].tolist(),
-                df[column].tolist()
-            )
-
-        return results
-
 
 class BenchmarkEvaluator:
     def __init__(self, gemini_api_key):
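StabilityEvaluator.evaluate_dataset is removed, so per-model stability now has to be driven from the caller. A sketch of the equivalent loop at the call site, assuming an evaluator object exposing the calculate_similarity method kept in app.py; the column conventions ('rus_prompt', '*_answers') mirror the deleted helper:

import pandas as pd

def stability_per_model(evaluator, df: pd.DataFrame, prompt_col: str = 'rus_prompt') -> dict:
    # Reproduces the removed evaluate_dataset: one stability result per '<model>_answers' column.
    results = {}
    answer_columns = [col for col in df.columns if col.endswith('_answers')]
    for column in answer_columns:
        model_name = column.replace('_answers', '')
        results[model_name] = evaluator.calculate_similarity(
            df[prompt_col].tolist(),
            df[column].tolist()
        )
    return results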
@@ -314,142 +290,53 @@ class BenchmarkEvaluator:
         return benchmark_df
 
 
-def evaluate_single_response(gemini_api_key, prompt, response, model_name="Test Model"):
-    """Evaluate a single response for the UI"""
-    # Create a temporary dataframe
-    df = pd.DataFrame({
-        'rus_prompt': [prompt],
-        f'{model_name}_answers': [response]
-    })
-
-    evaluator = BenchmarkEvaluator(gemini_api_key)
-
-    try:
-        result = evaluator.evaluate_model(df, model_name)
-
-        # Format the result for displaying in UI
-        output = {
-            'Creativity Score': f"{result['creative_details']['creativity']:.2f}",
-            'Diversity Score': f"{result['creative_details']['diversity']:.2f}",
-            'Relevance Score': f"{result['creative_details']['relevance']:.2f}",
-            'Average Creative Score': f"{result['creativity_score']:.2f}",
-            'Stability Score': f"{result['stability_score']:.2f}",
-            'Combined Score': f"{result['combined_score']:.2f}"
-        }
-
-        return output
-    except Exception as e:
-        return {
-            'Error': str(e)
-        }
-
-
-def evaluate_batch(api_key, file, prompt_column, models_text):
-    """Process batch evaluation from the UI"""
-    try:
-        # Load the CSV file
-        file_path = file.name
-        df = pd.read_csv(file_path)
-
-        # Process model names if provided
-        models = None
-        if models_text.strip():
-            models = [m.strip() for m in models_text.split(',')]
-
-        # Run the evaluation
-        evaluator = BenchmarkEvaluator(api_key)
-        results = evaluator.evaluate_all_models(df, models, prompt_column)
-
-        return results
-    except Exception as e:
-        return pd.DataFrame({'Error': [str(e)]})
-
-
 def create_gradio_interface():
-    """Create Gradio interface for evaluation app"""
     with gr.Blocks(title="Model Response Evaluator") as app:
         gr.Markdown("# Model Response Evaluator")
-        gr.Markdown("
+        gr.Markdown("Upload a CSV file with prompts and model responses to evaluate and benchmark models.")
 
-        with gr.
-
-        gemini_api_key = gr.Textbox(label="Gemini API Key", type="password")
+        with gr.Row():
+            gemini_api_key = gr.Textbox(label="Gemini API Key", type="password")
 
-        with gr.Row():
-            with gr.Column():
-                prompt = gr.Textbox(label="Original Prompt", lines=3)
-                response = gr.Textbox(label="Model Response", lines=6)
-                model_name = gr.Textbox(label="Model Name", value="Test Model")
-
-                evaluate_btn = gr.Button("Evaluate Response")
-
-            evaluate_single_response,
-            inputs=[gemini_api_key, prompt, response, model_name],
-            outputs=output
-        )
-
-        with gr.Row():
-            csv_file = gr.File(label="Upload CSV with responses")
-            prompt_col = gr.Textbox(label="Prompt Column Name", value="rus_prompt")
-            models_input = gr.Textbox(label="Model names (comma-separated, leave blank for auto-detection)")
-
-        evaluate_batch_btn = gr.Button("Run Benchmark")
+        with gr.Row():
+            csv_file = gr.File(label="Upload CSV with responses")
+            prompt_col = gr.Textbox(label="Prompt Column Name", value="rus_prompt")
+            models_input = gr.Textbox(label="Model names (comma-separated, leave blank for auto-detection)")
+
+        evaluate_btn = gr.Button("Run Benchmark")
+
+        with gr.Row():
             benchmark_output = gr.DataFrame(label="Benchmark Results")
 
-        evaluate_batch_btn.click(
-            evaluate_batch,
-            inputs=[gemini_api_key_batch, csv_file, prompt_col, models_input],
-            outputs=benchmark_output
-        )
+        def evaluate_batch(api_key, file, prompt_column, models_text):
+            try:
+                # Load the CSV file
+                file_path = file.name
+                df = pd.read_csv(file_path)
+
+                # Process model names if provided
+                models = None
+                if models_text.strip():
+                    models = [m.strip() for m in models_text.split(',')]
+
+                # Run the evaluation
+                evaluator = BenchmarkEvaluator(api_key)
+                results = evaluator.evaluate_all_models(df, models, prompt_column)
+
+                return results
+            except Exception as e:
+                return pd.DataFrame({'Error': [str(e)]})
 
-        with gr.Row():
-            batch_size = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Batch Size")
-            retry_attempts = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Retry Attempts")
-
-        with gr.Row():
-            min_wait = gr.Slider(minimum=1, maximum=30, value=4, step=1, label="Minimum Wait Time (seconds)")
-            max_wait = gr.Slider(minimum=10, maximum=300, value=60, step=10, label="Maximum Wait Time (seconds)")
-
-        with gr.Row():
-            gemini_model = gr.Dropdown(
-                choices=["gemini-1.5-flash", "gemini-1.5-pro", "gemini-1.5-ultra"],
-                value="gemini-1.5-flash",
-                label="Gemini Model"
-            )
+        evaluate_btn.click(
+            evaluate_batch,
+            inputs=[gemini_api_key, csv_file, prompt_col, models_input],
+            outputs=benchmark_output
+        )
 
     return app
 
 
 def main():
-    """Main function to run the application"""
     app = create_gradio_interface()
     app.launch(share=True)
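The interface is now batch-only: one API-key field, a CSV upload, and a "Run Benchmark" button wired to the nested evaluate_batch. A sketch of an input CSV this flow can read, based on the column conventions visible in the diff (a 'rus_prompt' column plus one '<model>_answers' column per model); the file name, model names, and row contents are illustrative:

import pandas as pd

# Build a tiny benchmark file: one prompt column plus one answers column per model.
df = pd.DataFrame({
    'rus_prompt': ['Придумайте необычное применение для скрепки.'],
    'model_a_answers': ['Скрепку можно использовать как мини-подставку для телефона.'],
    'model_b_answers': ['Из скрепок можно собрать миниатюрную скульптуру.'],
})
df.to_csv('benchmark_input.csv', index=False)

Running app.py then calls main(), which builds the interface and launches it with share=True.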