Update app.py
app.py CHANGED
@@ -64,7 +64,7 @@ def retry_api_request(max_retries=3, wait_time=10):
 
 # --- Single model request function for Hugging Face ---
 @retry_api_request()
-def make_hf_request(model_name, messages, temperature, max_tokens):
+def make_hf_request(model_name, messages, temperature, max_tokens, token=None):
     """
     Send request to a Hugging Face model using InferenceClient
 
@@ -73,11 +73,12 @@ def make_hf_request(model_name, messages, temperature, max_tokens):
         messages: Messages in the format [{"role": "user", "content": "..."}]
         temperature: Temperature parameter for generation
         max_tokens: Maximum tokens to generate
+        token: Hugging Face API token
 
     Returns:
         Generated text or None if request fails
     """
-    client = InferenceClient(model=model_name)
+    client = InferenceClient(model=model_name, token=token)
 
     # Convert messages to a prompt string
     prompt = ""
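The two hunks above only show the new signature and the client construction; the generation call itself sits outside the diff. Purely as a hedged sketch of how the token now reaches the Hugging Face client (assuming the body flattens the chat messages into a prompt and calls InferenceClient.text_generation, which this diff does not confirm):

from huggingface_hub import InferenceClient

def make_hf_request_sketch(model_name, messages, temperature, max_tokens, token=None):
    # token=None keeps the anonymous / cached-credential behaviour; a real token authenticates the call.
    client = InferenceClient(model=model_name, token=token)

    # Flatten chat messages into a single prompt string, as the "# Convert messages..." comment suggests.
    prompt = ""
    for message in messages:
        prompt += f"{message['role']}: {message['content']}\n"

    try:
        # Assumed generation call; the hunk only shows the client being created.
        return client.text_generation(prompt, temperature=temperature, max_new_tokens=max_tokens)
    except Exception:
        return None  # Matches the docstring: None when the request fails (retries come from @retry_api_request).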
@@ -304,7 +305,6 @@ def rank_answer_prompt(question, answer, topic):
 3: Good answer - clear, relevant to the topic, well-formulated, with correct statements. For creative writing, this includes demonstrating good originality, imagination, and adherence to the prompt, including the 3000-character limit.
 4: Very good answer - very clear, very relevant to the topic, expertly formulated, with highly correct statements. For creative writing, shows strong originality, a compelling narrative or poetic voice, and excellent adherence to the prompt, including the 3000-character limit.
 5: Exceptionally good answer - only appliable to exceptional answers that match all the criteria of the previous "4: Very good answer", but also bring additional unique insights, perfectly sound original arguments, or other exceptional unexpected contributions to the topic. For creative writing, this indicates a truly outstanding piece of writing with exceptional creativity, emotional resonance, and masterful execution, while adhering to the 3000-character limit.
-
 Consider these criteria in your ranking:
 - Clarity: Is the answer easy to understand? Is it ambiguous or confusing?
 - Relevance: Is the answer relevant to the specified topic?
@@ -323,7 +323,6 @@ Consider these criteria in your ranking:
 
     prompt += f"""
 Just return a single number (the rank from 1 to 5), do not add any other text.
-
 Question: {question}
 Answer: {answer}
 Rank:"""
@@ -345,7 +344,6 @@ def rank_question_prompt(question, topic, difficulty):
 3: Good question - clear, relevant to the topic, generally appropriate for the difficulty level, and reasonably well-formulated. For creative writing, the prompt is clear, provides a reasonable starting point for creative work, and sets a clear 3000-character limit.
 4: Very good question - clear, highly relevant to the topic, appropriate for the difficulty level, and well-formulated. For creative writing, the prompt is engaging, sparks imagination, and offers a good balance of direction and freedom, with a clear 3000-character limit.
 5: Excellent question - exceptionally clear, insightful, highly relevant to the topic, perfectly matched to the difficulty level, and expertly formulated. For creative writing, the prompt is exceptionally creative, thought-provoking, and likely to inspire high-quality writing, with a clear 3000-character limit.
-
 Consider these criteria in your ranking:
 - Clarity: Is the question easy to understand? Is it ambiguous or confusing?
 - Relevance: Is the question relevant to the specified topic ({topic})?
@@ -361,7 +359,6 @@ Consider these criteria in your ranking:
 """
     prompt += f"""
 Just return a single number (the rank from 1 to 5), do not add any other text.
-
 Question: {question}
 Rank:"""
     return prompt
@@ -385,14 +382,14 @@ def parse_rank_string(rank_str, ranking_model_id):
     return None
 
 # --- Helper Function for Parallel Ranking ---
-def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, timeout=60):
+def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, token=None, timeout=60):
     start_time = time.time()
     rank = None # Initialize rank to None, indicating potential failure
 
     rank_prompt = rank_answer_prompt(question, answer, topic)
 
     try:
-        response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5)
+        response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5, token=token)
         if response:
             try:
                 rank_str = response.strip()
@@ -416,14 +413,14 @@ def get_question_rank_from_model(ranking_model_id, question, topic, difficulty, consecutive_failures
     return ranking_model_id, rank
 
 # --- Helper Function for Parallel Ranking of questions ---
-def get_question_rank_from_model(ranking_model_id, question, topic, difficulty, consecutive_failures, failure_threshold, unresponsive_models, model_config, timeout=60):
+def get_question_rank_from_model(ranking_model_id, question, topic, difficulty, consecutive_failures, failure_threshold, unresponsive_models, model_config, token=None, timeout=60):
     start_time = time.time()
     rank = None # Initialize rank to None, indicating potential failure
 
     rank_prompt = rank_question_prompt(question, topic, difficulty) # Use question rank prompt
 
     try:
-        response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5)
+        response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5, token=token)
         if response:
             try:
                 rank_str = response.strip()
@@ -447,7 +444,7 @@ def get_question_rank_from_model(ranking_model_id, question, topic, difficulty, consecutive_failures
     return ranking_model_id, rank
 
 # --- Helper Function for Parallel Answering ---
-def get_answer_from_model(model_id, question, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, timeout=60):
+def get_answer_from_model(model_id, question, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, token=None, timeout=60):
     start_time = time.time() # Start timer
     answer_prompt = answer_question_prompt(question)
     answer = "Error answering" # Default answer
@@ -459,7 +456,7 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
         max_tok = long_max_tokens
 
     try:
-        response = make_hf_request(model_config[model_id]["name"], [{"role": "user", "content": answer_prompt}], temp, max_tok)
+        response = make_hf_request(model_config[model_id]["name"], [{"role": "user", "content": answer_prompt}], temp, max_tok, token=token)
         if response:
             answer = response.strip()
     except Exception as e:
@@ -475,7 +472,7 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
     return answer, duration # Return answer and duration
 
 # --- Core Logic ---
-def run_benchmark(hf_models, topics, difficulties, t, model_config):
+def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     results = {
         "model_name": [],
         "topic": [],
@@ -563,7 +560,8 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
         response = make_hf_request(model_config[question_generator_model_id]["name"],
                                    [{"role": "user", "content": question_prompt}],
                                    question_temp,
-                                   question_max_tokens)
+                                   question_max_tokens,
+                                   token=token)
 
         if response:
             question = response.strip()
@@ -603,6 +601,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
                 failure_threshold,
                 unresponsive_models,
                 model_config,
+                token,
                 timeout=60
             )
             question_ranking_futures.append(future)
@@ -667,6 +666,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
                 unresponsive_models,
                 model_config,
                 topic,
+                token,
                 timeout=60
             )
             answer_futures.append(future)
@@ -726,6 +726,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
                 unresponsive_models,
                 model_config,
                 topic,
+                token,
                 timeout=60
             )
             ranking_futures.append(future)
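In the three hunks above, token is inserted positionally just before timeout=60, which lines up with the updated helper signatures (..., model_config, topic, token=None, timeout=60). The surrounding executor.submit calls are not part of the diff, so the sketch below is a hypothetical reconstruction of one submit site (the executor, the loop, and max_workers are assumptions):

from concurrent.futures import ThreadPoolExecutor

# Hypothetical reconstruction of the submit site inside run_benchmark; only the
# trailing arguments (token, timeout=60) are visible in the hunks above.
with ThreadPoolExecutor(max_workers=8) as executor:      # max_workers is an assumption
    answer_futures = []
    for model_id in hf_models:                            # hf_models comes from run_benchmark's arguments
        future = executor.submit(
            get_answer_from_model,
            model_id,
            question,
            consecutive_failures,
            failure_threshold,
            unresponsive_models,
            model_config,
            topic,
            token,        # new positional argument, forwarded down to make_hf_request
            timeout=60,   # keyword argument delivered to get_answer_from_model via executor.submit
        )
        answer_futures.append(future)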
@@ -870,12 +871,6 @@ if st.sidebar.button("Start Benchmark"):
     if 'results_df' not in st.session_state:
         st.session_state.results_df = pd.DataFrame()
 
-    # Modify make_hf_request to use the token
-    def make_hf_request_with_token(model_name, messages, temperature, max_tokens):
-        client = InferenceClient(model=model_name, token=hf_token)
-        # Rest of the function is the same...
-        # Return response
-
     # Run the benchmark
     try:
         # Update status
@@ -885,7 +880,7 @@ if st.sidebar.button("Start Benchmark"):
         results, cumulative_avg_rank, total_successful = run_benchmark(
             selected_models, selected_topics,
             ["a very simple", "a simple", "a", "a difficult", "a very difficult"],
-            num_iterations, model_config
+            num_iterations, model_config, hf_token
         )
 
         # Update progress to complete
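The last two hunks delete the unfinished make_hf_request_with_token stub and instead hand hf_token straight to run_benchmark, which threads it down to every make_hf_request call. Where hf_token comes from is not shown in this diff; a typical Streamlit pattern, given only as an illustration, would be:

import streamlit as st

# Hypothetical source of hf_token; this sidebar widget is an assumption, not part of this commit.
hf_token = st.sidebar.text_input("Hugging Face API token", type="password") or None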
|