PeterKruger committed (verified)
Commit f0f4459 · Parent(s): 8731f61

Update app.py

Files changed (1):
  1. app.py +4 -5
app.py CHANGED
@@ -852,8 +852,8 @@ def check_model_availability(models, token):
     return availability_results
 
 # Streamlit UI
-st.title("AutoBench 1.0 Lite")
-st.write("A Many-Model-As-Judge system that will generate a customizable LLM benchmark (Lite is just a simple trial version).\nChose the models you want to evaluate (at least 2) and they will rank each other against the selected topics. But, first, check if models are available (this will depend on your Hugging face account. Premium is strongly recomended to avoid unresponsive models)")
+st.title("AutoBench 1.0 Demo")
+st.write("A Many-Model-As-Judge system that will generate a customizable LLM benchmark. AutoBench 1.0 Demo is just a simple trial version for educational purposes. Please refer to the AutoBench 1.0 repository for any advanced use. \n\nChose the models you want to evaluate (at least 2). The models will rank each other against the selected topics. But, first, check if models are available (this will depend on your Hugging face account. Premium is strongly recommended to avoid unresponsive models). Consult the README file for troubleshooting.")
 
 # Setup sidebar for configuration
 st.sidebar.header("Configuration")
@@ -867,7 +867,6 @@ st.sidebar.subheader("Models")
 available_models = [
     "meta-llama/Llama-3.3-70B-Instruct",
     "meta-llama/Llama-3.1-70B-Instruct",
-    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
     "mistralai/Mistral-7B-Instruct-v0.2",
     "mistralai/Mixtral-8x7B-Instruct-v0.1",
     "Qwen/Qwen2.5-72B-Instruct",
@@ -953,7 +952,7 @@ if st.sidebar.button("Start Benchmark"):
     # Run the benchmark
     try:
         # Update status
-        status_text.text("Benchmark running...\n\nThis will take a few minutes depending on the number of models and iterations chosen. If you want to follow progress in the question-answer-ranking process per each iteration, check the container log (above, next to the \"running\" button)")
+        status_text.text("Benchmark running...\n\nThis will take a few minutes depending on the number of models and iterations chosen. If you want to follow progress in the question-answer-ranking process per each iteration, check the container log (above, next to the \"running\" button).")
 
         # Run benchmark and get results
         results, cumulative_avg_rank, total_successful = run_benchmark(
@@ -964,7 +963,7 @@ if st.sidebar.button("Start Benchmark"):
 
         # Update progress to complete
         progress_bar.progress(100)
-        status_text.text(f"Benchmark completed! {total_successful} successful iterations")
+        status_text.text(f"Benchmark completed! {total_successful} successful iterations.")
 
         # Display results
         if total_successful > 0:
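For readers unfamiliar with the Streamlit calls touched by this commit (the status text and progress bar updates), the lines below are a minimal sketch of that status/progress pattern. The placeholder setup and the dummy work loop are illustrative assumptions for this sketch and are not taken from app.py.

# Minimal sketch of the Streamlit status/progress pattern used in the changed lines.
# The st.empty() placeholder and the simulated loop are assumptions for illustration.
import time
import streamlit as st

st.title("AutoBench 1.0 Demo")

if st.sidebar.button("Start Benchmark"):
    progress_bar = st.progress(0)   # progress bar starting at 0%
    status_text = st.empty()        # placeholder that can be overwritten with .text()

    status_text.text("Benchmark running...")
    for step in range(10):          # stand-in for the real benchmark iterations
        time.sleep(0.1)
        progress_bar.progress((step + 1) * 10)

    progress_bar.progress(100)
    status_text.text("Benchmark completed! 10 successful iterations.")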