PeterKruger committed
Commit 5c21f77 · verified · 1 parent: e0e94ac

Update app.py

Files changed (1)
  1. app.py +2 -3
app.py CHANGED
@@ -477,7 +477,6 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     results = {
         "model_name": [],
         "topic": [],
-        "difficulty": [],
         "question_prompt": [],
         "question": [],
         "answer": [],
@@ -759,7 +758,6 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
             results["model_name"].append(model_id)
             results["topic"].append(topic)
-            results["difficulty"].append(difficulty)
             results["question_prompt"].append(question_prompt)
             results["question"].append(question)
             results["answer"].append(answer)
@@ -851,7 +849,7 @@ def check_model_availability(models, token):
     return availability_results
 
 # Streamlit UI
-st.title("LLM Benchmark")
+st.title("AutoBench 1.0")
 
 # Setup sidebar for configuration
 st.sidebar.header("Configuration")
@@ -950,6 +948,7 @@ if st.sidebar.button("Start Benchmark"):
     try:
         # Update status
         status_text.text("Benchmark running...")
+        status_text.text("This will take a few minutes depending on the number of models and iterations chosen. If you want to follow progress in the question-answer-ranking process per each iteration, check the container log (above, next to the \"running\" button)")
 
         # Run benchmark and get results
         results, cumulative_avg_rank, total_successful = run_benchmark(