Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -113,18 +113,27 @@ def evaluate_models(file_path, api_key, prompt_col, selected_models=None, progre
|
|
113 |
combined_benchmark_path = f'results/benchmark_results_{timestamp}.csv'
|
114 |
benchmark_df.to_csv(combined_benchmark_path, index=False)
|
115 |
|
116 |
-
# Create visualizations
|
117 |
progress(0.95, desc="Creating visualizations...")
|
118 |
radar_chart_path = create_radar_chart(all_results)
|
119 |
bar_chart_path = create_bar_chart(all_results)
|
120 |
|
121 |
progress(1.0, desc="Evaluation complete!")
|
122 |
|
123 |
-
# Sort results by combined score
|
124 |
sorted_results = benchmark_df.sort_values(by='combined_score', ascending=False)
|
125 |
|
126 |
return sorted_results, radar_chart_path, bar_chart_path, combined_benchmark_path
|
127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
def create_gradio_interface():
|
129 |
with gr.Blocks(title="LLM Evaluation Tool") as app:
|
130 |
gr.Markdown("# LLM Evaluation Tool")
|
@@ -188,11 +197,9 @@ def create_gradio_interface():
|
|
188 |
with gr.Row():
|
189 |
gr.Markdown("### Leaderboard Details")
|
190 |
gr.Markdown("""
|
191 |
-
- **Креативность**: Оригинальность и инновационность ответов
|
192 |
-
-
|
193 |
-
-
|
194 |
-
- **Стабильность**: Насколько хорошо модель сохраняет смысл и контекст запроса
|
195 |
-
- **Общий балл**: Среднее значение всех показателей
|
196 |
""")
|
197 |
|
198 |
return app
|
|
|
113 |
combined_benchmark_path = f'results/benchmark_results_{timestamp}.csv'
|
114 |
benchmark_df.to_csv(combined_benchmark_path, index=False)
|
115 |
|
|
|
116 |
progress(0.95, desc="Creating visualizations...")
|
117 |
radar_chart_path = create_radar_chart(all_results)
|
118 |
bar_chart_path = create_bar_chart(all_results)
|
119 |
|
120 |
progress(1.0, desc="Evaluation complete!")
|
121 |
|
|
|
122 |
sorted_results = benchmark_df.sort_values(by='combined_score', ascending=False)
|
123 |
|
124 |
return sorted_results, radar_chart_path, bar_chart_path, combined_benchmark_path
|
125 |
|
126 |
+
|
127 |
+
def get_leaderboard_data():
    """Return the hard-coded leaderboard rows.

    Every call builds and returns a new list, so callers may mutate the
    result without affecting later calls.

    Returns:
        A list of five rows; each row is a list of four strings. The first
        item is a model name and the remaining three are numeric values
        kept as strings.
    """
    # NOTE(review): the three numeric columns presumably correspond to the
    # creativity (scale up to 10), stability (0-1) and overall score (0-1)
    # entries described in the "Leaderboard Details" text -- confirm against
    # the component that consumes these rows.
    return [
        ["Vikhr", "7.75", "0.9363600260019302", "0.860"],
        ["Llama3", "7.30", "0.9410231244564057", "0.827"],
        ["Mistral", "6.95", "0.9459488660097122", "0.807"],
        # NOTE(review): "Owen" may be a typo for "Qwen" -- left unchanged
        # because the intended model name cannot be confirmed from here.
        ["Owen", "6.93", "0.945682458281517", "0.800"],
        # NOTE(review): the third value is identical to the row above;
        # verify it is not a copy-paste leftover.
        ["TinyLlama", "1.12", "0.945682458281517", "0.573"],
    ]
|
135 |
+
|
136 |
+
|
137 |
def create_gradio_interface():
|
138 |
with gr.Blocks(title="LLM Evaluation Tool") as app:
|
139 |
gr.Markdown("# LLM Evaluation Tool")
|
|
|
197 |
with gr.Row():
|
198 |
gr.Markdown("### Leaderboard Details")
|
199 |
gr.Markdown("""
|
200 |
+
- **Креативность**: Оригинальность и инновационность ответов (шкала до 10)
|
201 |
+
- **Стабильность**: Коэффициент стабильности модели (0-1)
|
202 |
+
- **Общий балл**: Средний комбинированный показатель производительности (0-1)
|
|
|
|
|
203 |
""")
|
204 |
|
205 |
return app
|