HumanLikeness

Sleeping

App Files Files Community

XufengDuan committed on Aug 23, 2024

Commit

ec7c10d

1 Parent(s): c150b24

update scripts

Browse files

Files changed (2) hide show

app.py +188 -245
src/backend/model_operations.py +36 -10

app.py CHANGED Viewed

@@ -51,41 +51,7 @@ original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_d
 leaderboard_df = original_df.copy()
 def process_pending_evals():
-    # if len(pending_eval_queue_df) == 0:
-    #     print("No pending evaluations found.")
-    #     return
-    #
-    # for _, eval_request in pending_eval_queue_df.iterrows():
-    #     import re
-    #     model_link = eval_request['model']
-    #     match = re.search(r'>([^<]+)<', model_link)
-    #     if match:
-    #         eval_request['model'] = match.group(1)  # 赋值给 eval_request['model']
-    #     else:
-    #         eval_request['model'] = model_link  # 如果无法匹配，保留原始字符串
-    #
-    #     print(f"Evaluating model: {eval_request['model']}")
-    #
-    #     # 调用评估函数
-    #     run_eval_suite.run_evaluation(
-    #         eval_request=eval_request,
-    #         local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
-    #         results_repo=envs.RESULTS_REPO,
-    #         batch_size=1,
-    #         device=envs.DEVICE,
-    #         no_cache=True,
-    #         need_check=False,  # 根据需要设定是否需要检查
-    #         write_results=False  # 根据需要设定是否写入结果
-    #     )
-    #     print(f"Finished evaluation for model: {eval_request['model']}")
-    #     # Update the status to FINISHED
-    #     manage_requests.set_eval_request(
-    #         api=envs.API,
-    #         eval_request=eval_request,
-    #         new_status="FINISHED",
-    #         hf_repo=envs.QUEUE_REPO,
-    #         local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
-    #     )
     current_pending_status = [PENDING_STATUS]
     print('_________________')
     manage_requests.check_completed_evals(
@@ -246,103 +212,88 @@ def filter_models(
     return filtered_df
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(about.TITLE)
-    gr.Markdown(about.INTRODUCTION_TEXT, elem_classes="markdown-text")
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                            show_label=False,
-                            elem_id="search-bar",
-                        )
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[
-                                c.name
-                                for c in utils.fields(utils.AutoEvalColumn)
-                                if not c.hidden and not c.never_hidden and not c.dummy
-                            ],
-                            value=[
-                                c.name
-                                for c in utils.fields(utils.AutoEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
                             interactive=True,
                         )
-                    with gr.Row():
-                        deleted_models_visibility = gr.Checkbox(
-                            value=False, label="Show gated/private/deleted models", interactive=True
                         )
-                with gr.Column(min_width=320):
-                    #with gr.Box(elem_id="box-filter"):
-                    # filter_columns_type = gr.CheckboxGroup(
-                    #     label="Model types",
-                    #     choices=[t.to_str() for t in utils.ModelType],
-                    #     value=[t.to_str() for t in utils.ModelType],
-                    #     interactive=True,
-                    #     elem_id="filter-columns-type",
-                    # )
-                    filter_columns_precision = gr.CheckboxGroup(
-                        label="Precision",
-                        choices=[i.value.name for i in utils.Precision],
-                        value=[i.value.name for i in utils.Precision],
-                        interactive=True,
-                        elem_id="filter-columns-precision",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(utils.NUMERIC_INTERVALS.keys()),
-                        value=list(utils.NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )
-            leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[
-                    [c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden]
-                    + shown_columns.value
-                    + [utils.AutoEvalColumn.dummy.name]
-                ].sort_values(by="Overall Humanlike %", ascending=False),
-                headers=[c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=utils.TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-                column_widths=["33%", "33%"]
-            )
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=original_df[utils.COLS],
-                headers=utils.COLS,
-                datatype=utils.TYPES,
-                visible=False,
-            )
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    #filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
-                    deleted_models_visibility,
-                    search_bar,
-                ],
-                leaderboard_table,
-            )
-            # for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
-            for selector in [shown_columns, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
-                selector.change(
                     update_table,
                     [
                         hidden_leaderboard_table_for_search,
@@ -354,133 +305,125 @@ with demo:
                         search_bar,
                     ],
                     leaderboard_table,
-                    queue=True,
                 )
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(about.EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=utils.EVAL_COLS,
-                                datatype=utils.EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=utils.EVAL_COLS,
-                                datatype=utils.EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=utils.EVAL_COLS,
-                                datatype=utils.EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in utils.ModelType if t != utils.ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in utils.Precision if i != utils.Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
                     )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in utils.WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                submit.add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=about.CITATION_BUTTON_TEXT,
-                label=about.CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-# 在初始化完成后调用
-# original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
-# process_pending_evals()
-# try:
-#     print(envs.EVAL_REQUESTS_PATH)
-#     snapshot_download(
-#         repo_id=envs.QUEUE_REPO, local_dir=envs.EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-#     )
-# except Exception:
-#     restart_space()
-# try:
-#     print(envs.EVAL_RESULTS_PATH)
-#     snapshot_download(
-#         repo_id=envs.RESULTS_REPO, local_dir=envs.EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-#     )
-# except Exception:
-#     restart_space()
-# raw_data, original_df = populate.get_leaderboard_df(envs.RESULTS_REPO, envs.QUEUE_REPO, utils.COLS, utils.BENCHMARK_COLS)
 (
     finished_eval_queue_df,

 leaderboard_df = original_df.copy()
 def process_pending_evals():
     current_pending_status = [PENDING_STATUS]
     print('_________________')
     manage_requests.check_completed_evals(
     return filtered_df
+try:
+    demo = gr.Blocks(css=custom_css)
+    with demo:
+        gr.HTML(about.TITLE)
+        gr.Markdown(about.INTRODUCTION_TEXT, elem_classes="markdown-text")
+        with gr.Tabs(elem_classes="tab-buttons") as tabs:
+            with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+                with gr.Row():
+                    with gr.Column():
+                        with gr.Row():
+                            search_bar = gr.Textbox(
+                                placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                                show_label=False,
+                                elem_id="search-bar",
+                            )
+                        with gr.Row():
+                            shown_columns = gr.CheckboxGroup(
+                                choices=[
+                                    c.name
+                                    for c in utils.fields(utils.AutoEvalColumn)
+                                    if not c.hidden and not c.never_hidden and not c.dummy
+                                ],
+                                value=[
+                                    c.name
+                                    for c in utils.fields(utils.AutoEvalColumn)
+                                    if c.displayed_by_default and not c.hidden and not c.never_hidden
+                                ],
+                                label="Select columns to show",
+                                elem_id="column-select",
+                                interactive=True,
+                            )
+                        with gr.Row():
+                            deleted_models_visibility = gr.Checkbox(
+                                value=False, label="Show gated/private/deleted models", interactive=True
+                            )
+                    with gr.Column(min_width=320):
+                        #with gr.Box(elem_id="box-filter"):
+                        # filter_columns_type = gr.CheckboxGroup(
+                        #     label="Model types",
+                        #     choices=[t.to_str() for t in utils.ModelType],
+                        #     value=[t.to_str() for t in utils.ModelType],
+                        #     interactive=True,
+                        #     elem_id="filter-columns-type",
+                        # )
+                        filter_columns_precision = gr.CheckboxGroup(
+                            label="Precision",
+                            choices=[i.value.name for i in utils.Precision],
+                            value=[i.value.name for i in utils.Precision],
                             interactive=True,
+                            elem_id="filter-columns-precision",
                         )
+                        filter_columns_size = gr.CheckboxGroup(
+                            label="Model sizes (in billions of parameters)",
+                            choices=list(utils.NUMERIC_INTERVALS.keys()),
+                            value=list(utils.NUMERIC_INTERVALS.keys()),
+                            interactive=True,
+                            elem_id="filter-columns-size",
                         )
+                leaderboard_table = gr.components.Dataframe(
+                    value=leaderboard_df[
+                        [c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden]
+                        + shown_columns.value
+                        + [utils.AutoEvalColumn.dummy.name]
+                    ].sort_values(by="Overall Humanlike %", ascending=False),
+                    headers=[c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                    datatype=utils.TYPES,
+                    elem_id="leaderboard-table",
+                    interactive=False,
+                    visible=True,
+                    column_widths=["33%", "33%"]
+                )
+                # Dummy leaderboard for handling the case when the user uses backspace key
+                hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                    value=original_df[utils.COLS],
+                    headers=utils.COLS,
+                    datatype=utils.TYPES,
+                    visible=False,
+                )
+                search_bar.submit(
                     update_table,
                     [
                         hidden_leaderboard_table_for_search,
                         search_bar,
                     ],
                     leaderboard_table,
                 )
+                # for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
+                for selector in [shown_columns, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
+                    selector.change(
+                        update_table,
+                        [
+                            hidden_leaderboard_table_for_search,
+                            shown_columns,
+                            #filter_columns_type,
+                            filter_columns_precision,
+                            filter_columns_size,
+                            deleted_models_visibility,
+                            search_bar,
+                        ],
+                        leaderboard_table,
+                        queue=True,
                     )
+            with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+                gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+            with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+                with gr.Column():
+                    with gr.Row():
+                        gr.Markdown(about.EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+                    with gr.Column():
+                        with gr.Accordion(
+                            f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                            open=False,
+                        ):
+                            with gr.Row():
+                                finished_eval_table = gr.components.Dataframe(
+                                    value=finished_eval_queue_df,
+                                    headers=utils.EVAL_COLS,
+                                    datatype=utils.EVAL_TYPES,
+                                    row_count=5,
+                                )
+                        with gr.Accordion(
+                            f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                            open=False,
+                        ):
+                            with gr.Row():
+                                running_eval_table = gr.components.Dataframe(
+                                    value=running_eval_queue_df,
+                                    headers=utils.EVAL_COLS,
+                                    datatype=utils.EVAL_TYPES,
+                                    row_count=5,
+                                )
+                        with gr.Accordion(
+                            f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                            open=False,
+                        ):
+                            with gr.Row():
+                                pending_eval_table = gr.components.Dataframe(
+                                    value=pending_eval_queue_df,
+                                    headers=utils.EVAL_COLS,
+                                    datatype=utils.EVAL_TYPES,
+                                    row_count=5,
+                                )
+                with gr.Row():
+                    gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+                with gr.Row():
+                    with gr.Column():
+                        model_name_textbox = gr.Textbox(label="Model name")
+                        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                        model_type = gr.Dropdown(
+                            choices=[t.to_str(" : ") for t in utils.ModelType if t != utils.ModelType.Unknown],
+                            label="Model type",
+                            multiselect=False,
+                            value=None,
+                            interactive=True,
+                        )
+                    with gr.Column():
+                        precision = gr.Dropdown(
+                            choices=[i.value.name for i in utils.Precision if i != utils.Precision.Unknown],
+                            label="Precision",
+                            multiselect=False,
+                            value="float16",
+                            interactive=True,
+                        )
+                        weight_type = gr.Dropdown(
+                            choices=[i.value.name for i in utils.WeightType],
+                            label="Weights type",
+                            multiselect=False,
+                            value="Original",
+                            interactive=True,
+                        )
+                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+                submit_button = gr.Button("Submit Eval")
+                submission_result = gr.Markdown()
+                submit_button.click(
+                    submit.add_new_eval,
+                    [
+                        model_name_textbox,
+                        base_model_name_textbox,
+                        revision_name_textbox,
+                        precision,
+                        weight_type,
+                        model_type,
+                    ],
+                    submission_result,
+                )
+        with gr.Row():
+            with gr.Accordion("📙 Citation", open=False):
+                citation_button = gr.Textbox(
+                    value=about.CITATION_BUTTON_TEXT,
+                    label=about.CITATION_BUTTON_LABEL,
+                    lines=20,
+                    elem_id="citation-button",
+                    show_copy_button=True,
+                )
+except Exception as e:
+    print(e)
 (
     finished_eval_queue_df,

src/backend/model_operations.py CHANGED Viewed

@@ -35,7 +35,7 @@ import spacy_transformers
 import subprocess
 # Run the command to download the spaCy model
-subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=True)
 # subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
 # subprocess.run(["pip", "install", "spacy-transformers"], check=True)
 # subprocess.run(["pip", "install", "curated-transformers"], check=True)
@@ -45,7 +45,7 @@ subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=Tr
 try:
     nlp1 = spacy.load("en_core_web_lg")
 except OSError:
-    print("无法加载模型，继续执行其他处理。")
 # litellm.set_verbose=False
 litellm.set_verbose=True
@@ -537,6 +537,7 @@ class EvaluationModel:
         female_keyword = ["she", "her", "herself"]
         #print(len(responses_df["Experiment"]))
         for i in range(len(responses_df["Experiment"])):
             print(i, "/", len(responses_df["Experiment"]))
             # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
             # print()
@@ -592,7 +593,6 @@ class EvaluationModel:
                     output.append("Other")
                 else:
                     words = rs.split()  # split the response into words
-                    output = []
                     if any(word == word1 for word in words) and any(word == word2 for word in words):
                         output.append("Other")
                     else:
@@ -607,12 +607,41 @@ class EvaluationModel:
                             else:
                                 output.append("Long")
                         else:
-                            output.append("Other")
                 '''Exp4'''
             elif responses_df["Experiment"][i] == "E4":
-                filtered_lines = [r.split(':', 1)[-1].strip() if ':' in r else r for r in rs.split("\n")]
                 filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
                 rs = "\n".join(filtered_lines)
@@ -803,11 +832,8 @@ class EvaluationModel:
                 output.append("NA")
             # print(output)
         # exit()
-        '''human'''
-        # self.data = pd.DataFrame(list(zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"],  responses_df["Response"], responses_df["Factor 2"], responses_df["Stimuli 1"], responses_df["Coding"], output)),
-        #                                     columns=["Experiment", "Question_ID", "Item",  "Response", "Factor 2", "Simulate 1","Original_Coding","Coding"])
-        '''LLM'''
-        # print(len(output))
         self.data = pd.DataFrame(list(
             zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"], responses_df["Response"],
                 responses_df["Factor 2"], responses_df["Stimuli 1"], output)),

 import subprocess
 # Run the command to download the spaCy model
+# subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=True)
 # subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
 # subprocess.run(["pip", "install", "spacy-transformers"], check=True)
 # subprocess.run(["pip", "install", "curated-transformers"], check=True)
 try:
     nlp1 = spacy.load("en_core_web_lg")
 except OSError:
+    print("Can not load spacy model")
 # litellm.set_verbose=False
 litellm.set_verbose=True
         female_keyword = ["she", "her", "herself"]
         #print(len(responses_df["Experiment"]))
         for i in range(len(responses_df["Experiment"])):
             print(i, "/", len(responses_df["Experiment"]))
             # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
             # print()
                     output.append("Other")
                 else:
                     words = rs.split()  # split the response into words
                     if any(word == word1 for word in words) and any(word == word2 for word in words):
                         output.append("Other")
                     else:
                             else:
                                 output.append("Long")
                         else:
+                            if len(words) > 1:
+                                # join the words back together with a single space
+                                word = " ".join(words)
+                                if word.lower() == word1.lower():
+                                    if len(word1) > len(word2):
+                                        output.append("Long")
+                                    else:
+                                        output.append("Short")
+                                elif word.lower() == word2.lower():
+                                    if len(word1) > len(word2):
+                                        output.append("Short")
+                                    else:
+                                        output.append("Long")
+                                else:
+                                    output.append("Other")
+                            else:
+                                output.append("Other")
                 '''Exp4'''
             elif responses_df["Experiment"][i] == "E4":
+                lines = rs.split("\n")
+                filtered_lines = []
+                if len(lines) > 1:
+                    for r in lines[1:]:
+                        if ':' in r:
+                            filtered_lines.append(r.split(':', 1)[-1].strip())
+                        else:
+                            filtered_lines.append(r)
+                    filtered_lines.insert(0, lines[0])
+                else:
+                    filtered_lines = lines
+                print(filtered_lines)
                 filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
                 rs = "\n".join(filtered_lines)
                 output.append("NA")
             # print(output)
         # exit()
+                '''LLM'''
+        print(len(output))
         self.data = pd.DataFrame(list(
             zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"], responses_df["Response"],
                 responses_df["Factor 2"], responses_df["Stimuli 1"], output)),