Update app.py
app.py CHANGED
@@ -852,8 +852,8 @@ def check_model_availability(models, token):
     return availability_results
 
 # Streamlit UI
-st.title("AutoBench 1.0")
-st.write("A Many-Model-As-Judge system that will generate a customizable LLM benchmark.")
+st.title("AutoBench 1.0 Demo")
+st.write("A Many-Model-As-Judge system that will generate a customizable LLM benchmark. AutoBench 1.0 Demo is just a simple trial version for educational purposes. Please refer to the AutoBench 1.0 repository for any advanced use. \n\nChoose the models you want to evaluate (at least 2). The models will rank each other against the selected topics. But first, check that the models are available (this will depend on your Hugging Face account; Premium is strongly recommended to avoid unresponsive models). Consult the README file for troubleshooting.")
 
 # Setup sidebar for configuration
 st.sidebar.header("Configuration")
@@ -867,7 +867,6 @@ st.sidebar.subheader("Models")
 available_models = [
     "meta-llama/Llama-3.3-70B-Instruct",
     "meta-llama/Llama-3.1-70B-Instruct",
-    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
     "mistralai/Mistral-7B-Instruct-v0.2",
     "mistralai/Mixtral-8x7B-Instruct-v0.1",
     "Qwen/Qwen2.5-72B-Instruct",
@@ -953,7 +952,7 @@ if st.sidebar.button("Start Benchmark"):
     # Run the benchmark
     try:
         # Update status
-        status_text.text("Benchmark running...\n\nThis will take a few minutes depending on the number of models and iterations chosen. If you want to follow progress in the question-answer-ranking process per each iteration, check the container log (above, next to the \"running\" button)")
+        status_text.text("Benchmark running...\n\nThis will take a few minutes depending on the number of models and iterations chosen. If you want to follow the progress of the question-answer-ranking process for each iteration, check the container log (above, next to the \"running\" button).")
 
         # Run benchmark and get results
         results, cumulative_avg_rank, total_successful = run_benchmark(
@@ -964,7 +963,7 @@ if st.sidebar.button("Start Benchmark"):
 
         # Update progress to complete
        progress_bar.progress(100)
-        status_text.text(f"Benchmark completed! {total_successful} successful iterations")
+        status_text.text(f"Benchmark completed! {total_successful} successful iterations.")
 
         # Display results
         if total_successful > 0:
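
For context, the changed lines in the last two hunks rely on Streamlit's placeholder/progress pattern. The sketch below only illustrates that pattern and is not the app's actual code: status_text and progress_bar are assumed to be an st.empty() placeholder and an st.progress() widget created earlier in app.py, and run_benchmark is stubbed out here because its real implementation is not part of this diff.

import streamlit as st

def run_benchmark(models, iterations):
    # Hypothetical stub; the real run_benchmark in app.py performs the
    # question-answer-ranking loop and returns the same three values.
    return [], 0.0, iterations

status_text = st.empty()       # placeholder whose content can be rewritten in place
progress_bar = st.progress(0)  # progress bar initialized at 0%

if st.sidebar.button("Start Benchmark"):
    try:
        status_text.text("Benchmark running...")  # overwrite the placeholder with a status message
        results, cumulative_avg_rank, total_successful = run_benchmark(
            ["meta-llama/Llama-3.3-70B-Instruct", "Qwen/Qwen2.5-72B-Instruct"], 2
        )
        progress_bar.progress(100)  # fill the bar once the run finishes
        status_text.text(f"Benchmark completed! {total_successful} successful iterations.")
    except Exception as e:
        status_text.text(f"Benchmark failed: {e}")  # surface the failure in the same placeholder

Using st.empty() is what lets the two status_text.text() calls replace each other in place instead of stacking a second message below the first.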