Commit · e157bd5
1 Parent(s): dcbdce4
update scripts

Browse files
- app.py +1 -1
- src/backend/evaluate_model.py +27 -11
- src/backend/model_operations.py +211 -86
- src/backend/util.py +56 -22
- src/display/about.py +33 -40
- src/envs.py +22 -1
- src/populate.py +8 -4
app.py CHANGED

@@ -457,7 +457,7 @@ with demo:
     def background_init_and_process():
         global original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
         original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
-        process_pending_evals()
+        #process_pending_evals()
 
     scheduler = BackgroundScheduler()
     scheduler.add_job(background_init_and_process, 'date', run_date=datetime.datetime.now())  # run immediately
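For context, this hunk relies on APScheduler's one-shot 'date' trigger to run the heavy initialization in a background thread as soon as the Space starts (with `process_pending_evals()` now commented out, only the init work runs). A minimal, self-contained sketch of that scheduling pattern, with the real `init_space`-style work replaced by a placeholder function:

import time
import datetime
from apscheduler.schedulers.background import BackgroundScheduler

def background_init_and_process():
    # Placeholder for the real work (loading dataframes, refreshing the eval queues, ...)
    print("initializing leaderboard data in the background")

scheduler = BackgroundScheduler()
# A 'date' job with run_date=now fires exactly once, right after the scheduler starts
scheduler.add_job(background_init_and_process, 'date', run_date=datetime.datetime.now())
scheduler.start()

time.sleep(1)  # keep the script alive long enough for the background job to fire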
src/backend/evaluate_model.py CHANGED

@@ -95,23 +95,39 @@ class Evaluator:
             '''Start evaluating the model's results'''
             self.humanlike = self.eval_model.evaluate_humanlike(self.generated_summaries_df, envs.HUMAN_DATA, f"./generation_results/{self.model}.csv")
 
-
+            all_results = self.humanlike
+            # Prepare individual experiment scores and CIs
+            experiment_results = {}
+            for exp, data in all_results['per_experiment'].items():
+                experiment_results[f'{exp}'] = data['average_js_divergence']
+                experiment_results[f'{exp}_ci'] = data['confidence_interval']
+
+            # Write results into results using util.format_results
+            results = util.format_results(
+                model_name=self.model,
+                revision=self.revision,
+                precision=self.precision,
+                overall_js=all_results['overall']['average_js_divergence'],
+                overall_ci=all_results['overall']['confidence_interval'],
+                **experiment_results  # Unpack the experiment results
+            )
 
             '''Original metrics'''
+
             # self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(
             #     self.generated_summaries_df)
             # factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
             # hallucination_rate = self.eval_model.hallucination_rate
-            factual_consistency_rate = 0
-            answer_rate = 0
-            avg_summary_len = 0
-
-            results = util.format_results(model_name=self.model, revision=self.revision,
-                                          precision=self.precision,
-                                          factual_consistency_rate=factual_consistency_rate,
-                                          hallucination_rate=self.humanlike,
-                                          answer_rate=answer_rate,
-                                          avg_summary_len=avg_summary_len)
+            # factual_consistency_rate = 0
+            # answer_rate = 0
+            # avg_summary_len = 0
+            #
+            # results = util.format_results(model_name=self.model, revision=self.revision,
+            #                               precision=self.precision,
+            #                               factual_consistency_rate=factual_consistency_rate,
+            #                               hallucination_rate=self.humanlike,
+            #                               answer_rate=answer_rate,
+            #                               avg_summary_len=avg_summary_len)
             return results
         except FileNotFoundError:
             logging.error(f"File not found: {envs.DATASET_PATH}")
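The new code assumes `evaluate_humanlike` returns a nested dict with an 'overall' entry and a 'per_experiment' mapping, and flattens the per-experiment scores into keyword arguments for `util.format_results`. A hedged sketch of that flattening step; the numeric values and experiment names below are illustrative only:

# Assumed shape of the object returned by evaluate_humanlike (illustrative values)
all_results = {
    'overall': {'average_js_divergence': 0.72, 'confidence_interval': (0.69, 0.75)},
    'per_experiment': {
        'E1': {'average_js_divergence': 0.80, 'confidence_interval': (0.76, 0.84)},
        'E2': {'average_js_divergence': 0.65, 'confidence_interval': (0.60, 0.70)},
    },
}

# Flatten per-experiment scores into E1, E1_ci, E2, E2_ci, ... keyword arguments
experiment_results = {}
for exp, data in all_results['per_experiment'].items():
    experiment_results[exp] = data['average_js_divergence']
    experiment_results[f'{exp}_ci'] = data['confidence_interval']

# format_results(..., overall_js=..., overall_ci=..., **experiment_results) then receives
# E1=0.80, E1_ci=(0.76, 0.84), E2=0.65, E2_ci=(0.60, 0.70)
print(experiment_results)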
src/backend/model_operations.py CHANGED

@@ -28,6 +28,7 @@ import src.envs as envs
 # # import pandas as pd
 # import scipy
 from scipy.spatial.distance import jensenshannon
+from scipy.stats import bootstrap
 import numpy as np
 import spacy_transformers
 

@@ -238,7 +239,6 @@ class SummaryGenerator:
 
         def extract_responses(text, trigger_words=None):
             if trigger_words is None:
-                # If no specific list of trigger words is provided, use the defaults
                 trigger_words = ["sure", "okay", "yes"]
 
             try:

@@ -248,7 +248,7 @@ class SummaryGenerator:
 
                 sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence for
                              sentence in sentences]
-                if any(sentences[0].lower().startswith(word) for word in trigger_words):
+                if any(sentences[0].lower().startswith(word) for word in trigger_words) and len(sentences)>2:
                     _response1 = sentences[1].strip() if len(sentences) > 1 else None
                     _response2 = sentences[2].strip() if len(sentences) > 2 else None
                 else:

@@ -279,10 +279,8 @@ class SummaryGenerator:
                     Experiment_ID.append(ID)
                     Questions_ID.append(q_column[j])
                     User_prompt.append(_user_prompt)
-
                     Response.append(_response2)
-
-                    Factor_2.append(V2_column[j])
+                    Factor_2.append(_response)
                     Stimuli_1.append(Stimuli_2_column[j])
                     Item_ID.append(Item_column[j])
                     Condition.append(Condition_column[j])

@@ -292,10 +290,7 @@ class SummaryGenerator:
                     Questions_ID.append(str(q_column[j]) + '1')
                     User_prompt.append(_user_prompt)
                     Response.append(_response1)
-
-
-
-                    Factor_2.append(V2_column[j])
+                    Factor_2.append(_response)
                     Stimuli_1.append(Stimuli_1_column[j])
                     Item_ID.append(Item_column[j])
                     Condition.append(Condition_column[j])

@@ -343,7 +338,7 @@ class SummaryGenerator:
         together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm']
         for together_ai_api_model in together_ai_api_models:
             if together_ai_api_model in self.model_id.lower():
-                using_together_api = True
+                #using_together_api = True
                 break
         # print('which kind of LLM applies', together_ai_api_model, using_together_api)
         # print(self.model_id.lower()) #meta-llama/llama-2-7b-chat-hf

@@ -358,7 +353,7 @@ class SummaryGenerator:
             payload = {
                 "model": self.model_id,
                 # "max_tokens": 4096,
-                'max_new_tokens':
+                'max_new_tokens': 100,
                 # "temperature": 0.0,
                 # 'repetition_penalty': 1.1 if 'mixtral' in self.model_id.lower() else 1
             }

@@ -408,7 +403,7 @@ class SummaryGenerator:
             )
 
             generation_args = {
-                "max_new_tokens":
+                "max_new_tokens": 100,
                 "return_full_text": False,
                 #"temperature": 0.0,
                 "do_sample": False,

@@ -422,7 +417,7 @@ class SummaryGenerator:
             print(prompt)
             input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
             with torch.no_grad():
-                outputs = self.local_model.generate(**input_ids, max_new_tokens=
+                outputs = self.local_model.generate(**input_ids, max_new_tokens=100, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
             result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
             result = result.replace(prompt[0], '')
             print(result)

@@ -430,45 +425,83 @@ class SummaryGenerator:
 
 
         elif self.local_model is None:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    if result is None:
-                        time.sleep(1)  # Optional: Add a small delay before retrying
-
-                return result
-
-            except Exception as e:
-                print(f"Error with TOKEN: {envs.TOKEN}, trying with TOKEN1")
+            import random
+            def get_random_token():
+                i = random.randint(1, 20)
+                token = getattr(envs, f"TOKEN{i}")
+                return token, i
+
+            tokens_tried = set()
+
+            while len(tokens_tried) < 10:
+                token, i = get_random_token()
+
+                if token in tokens_tried:
+                    continue
+
+                tokens_tried.add(token)
+                print(f"Trying with token: TOKEN{i}")
+
                 try:
-
+                    from huggingface_hub import InferenceClient
+                    client = InferenceClient(self.model_id, api_key=token, headers={"X-use-cache": "false"})
                     messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
                     result = None
+
                     while result is None:
-                        outputs = client.chat_completion(messages, max_tokens=
+                        outputs = client.chat_completion(messages, max_tokens=100)
                         result = outputs['choices'][0]['message']['content']
 
                         if result is None:
                             time.sleep(1)  # Optional: Add a small delay before retrying
 
                     return result
-
-
-
+
+                except Exception as e:
+                    print(f"Error with token: {token}, trying another token...")
+                    continue
+
+            raise Exception("All tokens failed.")
+            # print(self.model_id)
+            # print(self.api_base)
+            # mistralai/Mistral-7B-Instruct-v0.1
+            # https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1
+            # Using HF API or download checkpoints
+            # try: # try use HuggingFace API
+            #     from huggingface_hub import InferenceClient
+            #     print("token_for_request:",envs.TOKEN)
+            #     print(self.model_id)
+            #     client = InferenceClient(self.model_id,api_key=envs.TOKEN,headers={"X-use-cache": "false"})
+            #     messages = [{"role": "system", "content": system_prompt},{"role": "user", "content": user_prompt}]
+            #     # outputs = client.chat_completion(messages, max_tokens=100)
+            #     result = None
+            #     while result is None:
+            #         outputs = client.chat_completion(messages, max_tokens=100)
+            #         result = outputs['choices'][0]['message']['content']
+            #
+            #     if result is None:
+            #         time.sleep(1)  # Optional: Add a small delay before retrying
+            #
+            #     return result
+            #
+            # except Exception as e:
+            #     print(f"Error with TOKEN: {envs.TOKEN}, trying with TOKEN1")
+            #     try:
+            #         client = InferenceClient(self.model_id, api_key=envs.TOKEN1, headers={"X-use-cache": "false"})
+            #         messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
+            #         result = None
+            #         while result is None:
+            #             outputs = client.chat_completion(messages, max_tokens=100)
+            #             result = outputs['choices'][0]['message']['content']
+            #
+            #         if result is None:
+            #             time.sleep(1)  # Optional: Add a small delay before retrying
+            #
+            #         return result
+            #     except Exception as ee:
+            #         print(f"Error with TOKEN1: {envs.TOKEN1}")
+            #         raise ee
+
 
         # except: # fail to call api. run it locally.
         #     self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)

@@ -505,7 +538,7 @@ class SummaryGenerator:
                 "temperature": 0,
                 "top_p": 0.95,  # cannot change
                 "top_k": 0,
-                "max_output_tokens":
+                "max_output_tokens": 100,
                 # "response_mime_type": "application/json",
             }
             safety_settings = [

@@ -545,7 +578,7 @@ class SummaryGenerator:
                 messages=[{"role": "system", "content": system_prompt},
                           {"role": "user", "content": user_prompt}],
                 # temperature=0.0,
-                max_tokens=
+                max_tokens=100,
                 api_key = os.getenv('OpenAI_key')
             )
             result = response['choices'][0]['message']['content']

@@ -643,8 +676,10 @@ class EvaluationModel:
             sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence
                          for sentence in sentences]
             rs = [sentence.strip() for sentence in sentences if sentence.strip()]
-
+            rs = '\n'.join(rs)
+            rs = rs.replace("[", '').replace("]", '')
             '''Exp1'''
+            # period and comma will affect the result
             if summaries_df["Experiment"][i] == "E1":
                 print("E1", rs)
                 rs = rs.replace('"','')

@@ -658,7 +693,7 @@ class EvaluationModel:
 
 
             '''Exp2'''
-
+            # not the first pronoun
             elif summaries_df["Experiment"][i] == "E2":
                 # rs = summaries_df["Response"][i].strip()
                 rs = rs.split(' ')

@@ -677,6 +712,7 @@ class EvaluationModel:
                     output.append("Other")
 
             '''Exp3'''
+            #
            elif summaries_df["Experiment"][i] == "E3":
                 # rs = summaries_df["Response"][i].strip()
                 print("E3", rs)

@@ -906,18 +942,28 @@ class EvaluationModel:
         for i in range(len(summaries_df["Experiment"])):
             # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
             # print()
+            # data cleaning
             if pd.isna(summaries_df["Response"][i]):
                 output.append("Other")
                 continue
             rs = summaries_df["Response"][i].strip().lower()
+            sentences = rs.split('\n')
+            sentences = [sentence.split(':', 1)[-1].strip() if ':' in sentence else sentence
+                         for sentence in sentences]
+            rs = [sentence.strip() for sentence in sentences if sentence.strip()]
+            rs = '\n'.join(rs)
+            rs = rs.replace('[', '').replace(']','').replace('.','')
             '''Exp1'''
+            # the period and comma will affect the result
             if summaries_df["Experiment"][i] == "E1":
                 print("E1", rs)
-                rs = rs.replace('"','')
-
-
+                rs = rs.replace('"', '')  # Remove any unnecessary quotation marks
+                rs_cleaned = rs.replace(',', '')  # Remove periods and commas
+
+                # Use 'contains' instead of 'equals' for keyword matching to avoid issues caused by punctuation
+                if "round" in rs_cleaned:
                     output.append("Round")
-                elif
+                elif "spiky" in rs_cleaned:
                     output.append("Spiky")
                 else:
                     output.append("Other")

@@ -926,7 +972,6 @@ class EvaluationModel:
             '''Exp2'''
 
             elif summaries_df["Experiment"][i] == "E2":
-                # rs = summaries_df["Response"][i].strip()
                 rs = rs.split(' ')
                 print("E2", rs)
                 male, female = 0, 0

@@ -946,7 +991,7 @@ class EvaluationModel:
             elif summaries_df["Experiment"][i] == "E3":
                 # rs = summaries_df["Response"][i].strip()
                 print("E3", rs)
-                rs = rs.replace('"', '')
+                rs = rs.replace('"', '').lower().replace(".","")
                 pair = summaries_df["Factor 2"][i]
                 word1, word2 = pair.split('_')
 

@@ -975,7 +1020,8 @@ class EvaluationModel:
                     print(f"Unexpected error: {e}")
                     output.append("Other")
                     continue
-
+                meaning_word = meaning_word.replace('.', '')
+                meaning_word = meaning_word.replace(';', '')
                 target = summaries_df["Factor 2"][i].strip().lower()
                 pair = target + "_" + meaning_word
                 print("E4:", pair)

@@ -1053,7 +1099,6 @@ class EvaluationModel:
                 doc = nlp1(sentence)
                 subject = "None"
                 obj = "None"
-                # Traverse the dependency relations to find the subject and object
                 for token in doc:
                     if token.dep_ == "nsubj":
                         subject = token.text

@@ -1078,14 +1123,24 @@ class EvaluationModel:
 
             '''Exp7'''
             elif summaries_df["Experiment"][i] == "E7":
-
-                rs = rs.replace(".", "").replace(",", "")
-                print("E7",rs)
-
-
-
-
-
+                # Remove periods and commas, then convert to lowercase
+                rs = rs.replace(".", "").replace(",", "").lower()
+                print("E7", rs)
+
+                # Split the response into words
+                words = rs.split(' ')
+                found = False
+
+                for word in words:
+                    if word == "no":
+                        output.append("0")
+                        found = True
+                        break
+                    elif word == "yes":
+                        output.append("1")
+                        found = True
+                        break
+                if not found:
                     output.append("Other")
 
             '''Exp8'''

@@ -1136,14 +1191,17 @@ class EvaluationModel:
 
             '''Exp10'''
             elif summaries_df["Experiment"][i] == "E10":
-                #
-                rs = rs.replace(".", "")
-
+                # Remove periods from the response
+                rs = rs.replace(".", "").lower()  # Convert to lowercase to ensure case-insensitivity
+                print("E10", rs)
+
+                # Check if the response contains "yes"
+                if "yes" in rs:
                     output.append("1")
                 else:
                     output.append("0")
             else:
-                print("can
+                print("can’t find the Exp:", summaries_df["Experiment"][i])
                 output.append("NA")
         # print(output)
         # exit()

@@ -1207,6 +1265,7 @@ class EvaluationModel:
         human_df = pd.concat([human_df, human_e5], ignore_index=True)
         llm_df = pd.concat([llm_df, llm_e5], ignore_index=True)
 
+
         ### Calculate Average JS Divergence ###
 
         # Extract the relevant columns for JS divergence calculation

@@ -1216,14 +1275,14 @@ class EvaluationModel:
         # Get unique Question_IDs present in both datasets
         common_question_ids = set(human_responses['Question_ID']).intersection(set(llm_responses['Question_ID']))
 
-        # Initialize a
-
-        js_divergence ={}
+        # Initialize a dictionary to store JS divergence for each experiment
+        js_divergence = {}
 
         # Calculate JS divergence for each common Question_ID
         for q_id in common_question_ids:
             # Get response distributions for the current Question_ID in both datasets
-            human_dist = human_responses[human_responses['Question_ID'] == q_id]['Coding'].value_counts(
+            human_dist = human_responses[human_responses['Question_ID'] == q_id]['Coding'].value_counts(
+                normalize=True)
             llm_dist = llm_responses[llm_responses['Question_ID'] == q_id]['Coding'].value_counts(normalize=True)
 
             # Reindex the distributions to have the same index, filling missing values with 0

@@ -1231,28 +1290,94 @@ class EvaluationModel:
             human_dist = human_dist.reindex(all_responses, fill_value=0)
             llm_dist = llm_dist.reindex(all_responses, fill_value=0)
 
-            # Calculate JS divergence
+            # Calculate JS divergence
             js_div = jensenshannon(human_dist, llm_dist, base=2)
             experiment_id = q_id.split('_')[1]
+
             if experiment_id not in js_divergence:
                 js_divergence[experiment_id] = []
             js_divergence[experiment_id].append(js_div)
 
-
-
-
-
-
-
-
-
-
-
-        # JS overall
-        avg_js_divergence = 1 - np.nanmean(js_divergence_list)
-        print("avg_js_divergence:", avg_js_divergence)
-
+        # Calculate the average JS divergence per experiment and the confidence interval
+        results = {}
+        for exp, divs in js_divergence.items():
+            avg_js_divergence = 1 - np.nanmean(divs)
+            ci_lower, ci_upper = bootstrap((divs,), np.nanmean, confidence_level=0.95,
+                                           n_resamples=1000).confidence_interval
+            results[exp] = {
+                'average_js_divergence': avg_js_divergence,
+                'confidence_interval': (1 - ci_upper, 1 - ci_lower)  # Adjust for 1 - score
+            }
 
+        # Calculate the overall average JS divergence and confidence interval
+        overall_js_divergence = 1 - np.nanmean([js for divs in js_divergence.values() for js in divs])
+        flattened_js_divergence = np.concatenate([np.array(divs) for divs in js_divergence.values()])
+
+        # Compute the overall confidence interval
+        overall_ci_lower, overall_ci_upper = bootstrap(
+            (flattened_js_divergence,),
+            np.nanmean,
+            confidence_level=0.95,
+            n_resamples=1000
+        ).confidence_interval
+
+        # Combine all results into one dictionary
+        all_results = {
+            'overall': {
+                'average_js_divergence': overall_js_divergence,
+                'confidence_interval': (1 - overall_ci_upper, 1 - overall_ci_lower)
+            },
+            'per_experiment': results
+        }
+
+        return all_results
+
+        # ### Calculate Average JS Divergence ###
+        #
+        # # Extract the relevant columns for JS divergence calculation
+        # human_responses = human_df[['Question_ID', 'Coding']]
+        # llm_responses = llm_df[['Question_ID', 'Coding']]
+        #
+        # # Get unique Question_IDs present in both datasets
+        # common_question_ids = set(human_responses['Question_ID']).intersection(set(llm_responses['Question_ID']))
+        #
+        # # Initialize a list to store JS divergence for each Question_ID
+        # js_divergence_list = []
+        # js_divergence ={}
+        #
+        # # Calculate JS divergence for each common Question_ID
+        # for q_id in common_question_ids:
+        #     # Get response distributions for the current Question_ID in both datasets
+        #     human_dist = human_responses[human_responses['Question_ID'] == q_id]['Coding'].value_counts(normalize=True)
+        #     llm_dist = llm_responses[llm_responses['Question_ID'] == q_id]['Coding'].value_counts(normalize=True)
+        #
+        #     # Reindex the distributions to have the same index, filling missing values with 0
+        #     all_responses = set(human_dist.index).union(set(llm_dist.index))
+        #     human_dist = human_dist.reindex(all_responses, fill_value=0)
+        #     llm_dist = llm_dist.reindex(all_responses, fill_value=0)
+        #
+        #     # Calculate JS divergence and add to the list
+        #     js_div = jensenshannon(human_dist, llm_dist, base=2)
+        #     experiment_id = q_id.split('_')[1]
+        #     if experiment_id not in js_divergence:
+        #         js_divergence[experiment_id] = []
+        #     js_divergence[experiment_id].append(js_div)
+        #
+        #     js_divergence_list.append(js_div)
+        #     #js_divergence[q_id] = js_div
+        #
+        #
+        #
+        # # Calculate the average JS divergence
+        # # JS per experiment
+        # avg_js_divergence_per_experiment = {exp: 1- np.nanmean(divs) for exp, divs in js_divergence.items()}
+        # print(avg_js_divergence_per_experiment)
+        #
+        # # JS overall
+        # avg_js_divergence = 1 - np.nanmean(js_divergence_list)
+        # print("avg_js_divergence:", avg_js_divergence)
+        #
+        # return avg_js_divergence
 
 
     def evaluate_humanlike(self, summaries_df: object, human_data_path: object, result_save_path: object) -> object:

@@ -1271,7 +1396,7 @@ class EvaluationModel:
         # print(f'Save human coding results to {save_path}')
         # fpath = Path(save_path)
         # fpath.parent.mkdir(parents=True, exist_ok=True)
-        # self.data.to_csv(fpath)
+        # self.data.to_csv(fpath)
 
 
         '''coding llm data'''
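The core of the new scoring path is 1 minus the mean Jensen–Shannon divergence between human and model response distributions, with a bootstrap confidence interval. A standalone sketch of that computation with toy distributions (the per-question grouping and Question_ID parsing of the real code are omitted; note that SciPy's `jensenshannon` returns the JS distance, i.e. the square root of the divergence, bounded by 1 for base=2):

import numpy as np
from scipy.spatial.distance import jensenshannon
from scipy.stats import bootstrap

# Toy response distributions for one question, normalized over the same categories
human_dist = np.array([0.6, 0.3, 0.1])
llm_dist = np.array([0.5, 0.4, 0.1])

# JS distance between the two distributions (0 = identical, 1 = maximally different)
js_div = jensenshannon(human_dist, llm_dist, base=2)

# Pretend we collected one divergence per question of an experiment
divs = np.array([js_div, 0.12, 0.20, 0.08, 0.15])
score = 1 - np.nanmean(divs)  # higher = more humanlike

# Bootstrap CI on the mean divergence, then mapped through 1 - x (which flips the bounds)
ci = bootstrap((divs,), np.nanmean, confidence_level=0.95, n_resamples=1000).confidence_interval
score_ci = (1 - ci.high, 1 - ci.low)
print(score, score_ci)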
src/backend/util.py CHANGED

@@ -35,9 +35,49 @@ def create_pairs(df):
     return pairs
 
 
-def format_results(model_name: str, revision: str, precision: str,
-                   factual_consistency_rate: float, hallucination_rate: float,
-                   answer_rate: float, avg_summary_len: float) -> dict:
+# def format_results(model_name: str, revision: str, precision: str,
+#                    factual_consistency_rate: float, hallucination_rate: float,
+#                    answer_rate: float, avg_summary_len: float) -> dict:
+#     """
+#     Formats the evaluation results into a structured dictionary.
+#
+#     Args:
+#         model_name (str): The name of the evaluated model.
+#         revision (str): The revision hash of the model.
+#         precision (str): The precision with which the evaluation was run.
+#         factual_consistency_rate (float): The factual consistency rate.
+#         hallucination_rate (float): The hallucination rate.
+#         answer_rate (float): The answer rate.
+#         avg_summary_len (float): The average summary length.
+#
+#     Returns:
+#         dict: A dictionary containing the structured evaluation results.
+#     """
+#     results = {
+#         "config": {
+#             "model_dtype": precision,  # Precision with which you ran the evaluation
+#             "model_name": model_name,  # Name of the model
+#             "model_sha": revision  # Hash of the model
+#         },
+#         "results": {
+#             "hallucination_rate": {
+#                 "hallucination_rate": round(hallucination_rate,3)
+#             },
+#             "factual_consistency_rate": {
+#                 "factual_consistency_rate": round(factual_consistency_rate,1)
+#             },
+#             "answer_rate": {
+#                 "answer_rate": round(answer_rate*100,1)
+#             },
+#             "average_summary_length": {
+#                 "average_summary_length": round(avg_summary_len,1)
+#             },
+#         }
+#     }
+#
+#     return results
+
+def format_results(model_name: str, revision: str, precision: str, overall_js: float, overall_ci: tuple, **experiment_scores) -> dict:
     """
     Formats the evaluation results into a structured dictionary.
 

@@ -45,34 +85,28 @@ def format_results(model_name: str, revision: str, precision: str,
         model_name (str): The name of the evaluated model.
         revision (str): The revision hash of the model.
         precision (str): The precision with which the evaluation was run.
-        factual_consistency_rate (float): The factual consistency rate.
-        hallucination_rate (float): The hallucination rate.
-        answer_rate (float): The answer rate.
-        avg_summary_len (float): The average summary length.
+        overall_js (float): The overall average JS divergence.
+        overall_ci (tuple): The confidence interval for the overall JS divergence.
+        experiment_scores: Experiment-specific scores and confidence intervals (E1, E1_ci, E2, E2_ci, ...).
 
     Returns:
         dict: A dictionary containing the structured evaluation results.
     """
+    # Initialize the base structure
     results = {
         "config": {
-            "model_dtype": precision,
-            "model_name": model_name,
-            "model_sha": revision
+            "model_dtype": precision,  # Precision with which you ran the evaluation
+            "model_name": model_name,  # Name of the model
+            "model_sha": revision  # Hash of the model
        },
        "results": {
-            "hallucination_rate": {
-                "hallucination_rate": round(hallucination_rate,3)
-            },
-            "factual_consistency_rate": {
-                "factual_consistency_rate": round(factual_consistency_rate,1)
-            },
-            "answer_rate": {
-                "answer_rate": round(answer_rate*100,1)
-            },
-            "average_summary_length": {
-                "average_summary_length": round(avg_summary_len,1)
-            },
+            "overall_js_divergence": overall_js,  # Overall JS divergence
+            "overall_confidence_interval": overall_ci,  # Confidence interval for the overall JS divergence
        }
    }
 
+    # Add experiment-specific results to the dictionary
+    for exp_name, score in experiment_scores.items():
+        results["results"][exp_name] = score  # Add each experiment score and its CI
+
    return results
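With the new signature, the keys of the returned "results" block are driven entirely by the keyword arguments, so a leaderboard entry ends up with overall_js_divergence, overall_confidence_interval, plus one key per experiment score and CI. A small usage sketch with placeholder model name and values (the function body is a simplified copy for illustration, not the exact file contents):

import json

def format_results(model_name, revision, precision, overall_js, overall_ci, **experiment_scores):
    # Same structure as the new src/backend/util.py implementation, trimmed for illustration
    results = {
        "config": {"model_dtype": precision, "model_name": model_name, "model_sha": revision},
        "results": {
            "overall_js_divergence": overall_js,
            "overall_confidence_interval": overall_ci,
        },
    }
    for exp_name, score in experiment_scores.items():
        results["results"][exp_name] = score
    return results

entry = format_results("org/some-model", "main", "float16",
                       overall_js=0.71, overall_ci=(0.68, 0.74),
                       E1=0.80, E1_ci=(0.76, 0.84))
print(json.dumps(entry, indent=2))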
src/display/about.py CHANGED

@@ -10,8 +10,29 @@ class Task:
 
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-
-
+    Overall = Task("overall_js", "overall_js", "Overall Humanlike %")
+    Overall_ci = Task("overall_ci", "overall_ci", "Overall Humanlike %")
+    E1 = Task("E1", "E1", "E1 Humanlike %")
+    E1_ci = Task("E1", "E1_ci", "E1 CI")
+    E2 = Task("E2", "E2", "E2 Humanlike %")
+    E2_ci = Task("E2", "E2_ci", "E2 CI")
+    E3 = Task("E3", "E3", "E3 Humanlike %")
+    E3_ci = Task("E3", "E3_ci", "E3 CI")
+    E4 = Task("E4", "E4", "E4 Humanlike %")
+    E4_ci = Task("E4", "E4_ci", "E4 CI")
+    E5 = Task("E5", "E5", "E5 Humanlike %")
+    E5_ci = Task("E5", "E5_ci", "E5 CI")
+    E6 = Task("E6", "E6", "E6 Humanlike %")
+    E6_ci = Task("E6", "E6_ci", "E6 CI")
+    E7 = Task("E7", "E7", "E7 Humanlike %")
+    E7_ci = Task("E7", "E7_ci", "E7 CI")
+    E8 = Task("E8", "E8", "E8 Humanlike %")
+    E8_ci = Task("E8", "E8_ci", "E8 CI")
+    E9 = Task("E9", "E9", "E9 Humanlike %")
+    E9_ci = Task("E9", "E9_ci", "E9 CI")
+    E10 = Task("E10", "E10", "E10 Humanlike %")
+    E10_ci = Task("E10", "E10_ci", "E10 CI")
+
     # factual_consistency_rate = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate (%)")
     # answer_rate = Task("answer_rate", "answer_rate", "Answer Rate (%)")
     # average_summary_length = Task("average_summary_length",

@@ -23,10 +44,7 @@ TITLE = """<h1 align="center" id="space-title">Humanlike Evaluation Model (HEM)
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This leaderboard (by [
-The leaderboard utilizes [HHEM](https://huggingface.co/vectara/hallucination_evaluation_model), an open source hallucination detection model.<br>
-An improved version (HHEM v2) is integrated into the [Vectara platform](https://console.vectara.com/signup/?utm_source=huggingface&utm_medium=space&utm_term=integration&utm_content=console&utm_campaign=huggingface-space-integration-console).
-
+This leaderboard (by [Xufeng Duan](https://xufengduan.github.io/)) evaluates the similarities between human and model responses in language use <br>
 """
 
 # Which evaluations are you running? how can people reproduce what you have?

@@ -105,49 +123,24 @@ The results are structured in JSON as follows:
 }
 }
 ```
-For additional queries or model submissions, please contact
+For additional queries or model submissions, please contact xufeng.duan@link.cuhk.edu.hk.
 """
 
 EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
 
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
 @dataset{HughesBae2023,
-    author = {
-    title = {
-    year = {
-    month = {
-    publisher = {
+    author = {Xufeng Duan, Bei Xiao, Xuemei Tang, Zhenguang Cai},
+    title = {Humanlike Leaderboard},
+    year = {2024},
+    month = {8},
+    publisher = {},
     doi = {},
-    url = {https://
-    abstract = {A leaderboard comparing LLM performance at
-    keywords = {nlp, llm,
+    url = {https://huggingface.co/spaces/Simondon/HumanLikeness},
+    abstract = {A leaderboard comparing LLM performance at humanlikeness in language use.},
+    keywords = {nlp, llm, psycholinguistics, nli, machine learning},
     license = {Apache-2.0},
 }"""
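In the standard leaderboard template this file builds on, Task is a small dataclass whose three fields match the comment above the enum entries (key in the results JSON, metric key, display column name); the exact field names below are an assumption based on that template, not confirmed by this diff. A hedged sketch of how the new entries map result keys to leaderboard columns:

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # assumed: task_key in the results JSON
    metric: str      # assumed: metric_key in the results JSON
    col_name: str    # assumed: column name shown in the leaderboard

class Tasks(Enum):
    Overall = Task("overall_js", "overall_js", "Overall Humanlike %")
    E1 = Task("E1", "E1", "E1 Humanlike %")
    E1_ci = Task("E1", "E1_ci", "E1 CI")

# Display columns are then typically derived by iterating over the enum
print([t.value.col_name for t in Tasks])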
src/envs.py CHANGED

@@ -5,8 +5,29 @@ from huggingface_hub import HfApi
 
 # replace this with our token
 # TOKEN = os.environ.get("HF_TOKEN", None)
-TOKEN = os.getenv("
+TOKEN = os.getenv("H4_TOKEN1")
 TOKEN1 = os.getenv("H4_TOKEN1")
+TOKEN2 = os.getenv("H4_TOKEN2")
+TOKEN3 = os.getenv("H4_TOKEN3")
+TOKEN4 = os.getenv("H4_TOKEN4")
+TOKEN5 = os.getenv("H4_TOKEN5")
+TOKEN6 = os.getenv("H4_TOKEN6")
+TOKEN7 = os.getenv("H4_TOKEN7")
+TOKEN8 = os.getenv("H4_TOKEN8")
+TOKEN9 = os.getenv("H4_TOKEN9")
+TOKEN10 = os.getenv("H4_TOKEN10")
+TOKEN11 = os.getenv("H4_TOKEN11")
+TOKEN12 = os.getenv("H4_TOKEN12")
+TOKEN13 = os.getenv("H4_TOKEN13")
+TOKEN14 = os.getenv("H4_TOKEN14")
+TOKEN15 = os.getenv("H4_TOKEN15")
+TOKEN16 = os.getenv("H4_TOKEN16")
+TOKEN17 = os.getenv("H4_TOKEN17")
+TOKEN18 = os.getenv("H4_TOKEN18")
+TOKEN19 = os.getenv("H4_TOKEN19")
+TOKEN20 = os.getenv("H4_TOKEN20")
+
+
 # print("H4_token:", TOKEN)
 
 
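These twenty module-level variables are what the new get_random_token helper in model_operations.py resolves by name via getattr. A minimal sketch of that lookup pattern, using a stand-in object for src.envs so the snippet runs even when the H4_TOKEN* environment variables are unset:

import os
import random

# Stand-in for src.envs: attributes TOKEN1 ... TOKEN20 read from H4_TOKEN1 ... H4_TOKEN20
class _Envs:
    pass

envs = _Envs()
for i in range(1, 21):
    setattr(envs, f"TOKEN{i}", os.getenv(f"H4_TOKEN{i}"))

def get_random_token():
    # Pick one of the 20 configured tokens at random, as the generation code does
    i = random.randint(1, 20)
    return getattr(envs, f"TOKEN{i}"), i

token, idx = get_random_token()
print(f"using TOKEN{idx}: {'set' if token else 'missing'}")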
src/populate.py CHANGED

@@ -18,11 +18,15 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols
     df = pd.DataFrame.from_records(all_data_json)
     print("all results:",df)
     # exit()
-
-
+    try:
+        df = df.sort_values(by=[utils.AutoEvalColumn.hallucination_rate.name], ascending=True)
+        df = df[cols].round(decimals=2)
+        # filter out if any of the benchmarks have not been produced
+        df = df[formatting.has_no_nan_values(df, benchmark_cols)]
+    except:
+        pass
+
 
-    # filter out if any of the benchmarks have not been produced
-    df = df[formatting.has_no_nan_values(df, benchmark_cols)]
     return df
 
 