AudioBench-Leaderboard-Extend

Running

App Files Files Community

zhuohan-7 committed on Nov 21, 2024

Commit

4c1d731

verified ·

1 Parent(s): 6d54304

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

app/content.py +1 -1
app/draw_diagram.py +120 -150
app/pages.py +45 -107
app/show_examples.py +23 -58

app/content.py CHANGED Viewed

@@ -62,7 +62,7 @@ cnasr_datasets = {
 }
 metrics = {
-    'wer': 'Word Error Rate (WER), a common metric for ASR evaluation. (The lower the better)',
     'llama3_70b_judge_binary': 'Binary evaluation using the LLAMA3-70B model, for tasks requiring a binary outcome. (0-100 based on score 0-1)',
     'llama3_70b_judge': 'General evaluation using the LLAMA3-70B model, typically scoring based on subjective judgments. (0-100 based on score 0-5)',
     'meteor': 'METEOR, a metric used for evaluating text generation, often used in translation or summarization tasks. (Sensitive to output length)',

 }
 metrics = {
+    'wer': 'Word Error Rate (WER), a common metric for ASR evaluation. (The lower, the better)',
     'llama3_70b_judge_binary': 'Binary evaluation using the LLAMA3-70B model, for tasks requiring a binary outcome. (0-100 based on score 0-1)',
     'llama3_70b_judge': 'General evaluation using the LLAMA3-70B model, typically scoring based on subjective judgments. (0-100 based on score 0-5)',
     'meteor': 'METEOR, a metric used for evaluating text generation, often used in translation or summarization tasks. (Sensitive to output length)',

app/draw_diagram.py CHANGED Viewed

@@ -7,12 +7,20 @@ from streamlit.components.v1 import html
 from app.show_examples import *
 import pandas as pd
 # huggingface_image = Image.open('style/huggingface.jpg')
 # other info
-#path = "./AudioBench-Leaderboard/additional_info/Leaderboard-Rename.xlsx"
-path = "./additional_info/Leaderboard-Rename.xlsx"
-info_df = pd.read_excel(path)
 # def nav_to(value):
 #     try:
@@ -26,11 +34,6 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
     folder = f"./results/{metrics}/"
-    display_names = {
-        'SU': 'Speech Understanding',
-        'ASU': 'Audio Scene Understanding',
-        'VU': 'Voice Understanding'
-    }
     data_path = f'{folder}/{category_name.lower()}.csv'
     chart_data = pd.read_csv(data_path).round(3)
@@ -50,8 +53,9 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
                 """, unsafe_allow_html=True)
     # remap model names
-    display_model_names = {key.strip() :val.strip() for key, val in zip(info_df['AudioBench'], info_df['Proper Display Name'])}
-    chart_data['model_show'] = chart_data['Model'].map(display_model_names)
     models = st.multiselect("Please choose the model",
                             sorted(chart_data['model_show'].tolist()),
@@ -61,86 +65,17 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
     chart_data = chart_data[chart_data['model_show'].isin(models)]
     chart_data = chart_data.sort_values(by=[new_dataset_name], ascending=cus_sort).dropna(axis=0)
-    if len(chart_data) == 0:
-        return
-    # Get Values
-    data_values = chart_data.iloc[:, 1]
-    # Calculate Q1 and Q3
-    q1 = data_values.quantile(0.25)
-    q3 = data_values.quantile(0.75)
-    # Calculate IQR
-    iqr = q3 - q1
-    # Define lower and upper bounds (1.5*IQR is a common threshold)
-    lower_bound = q1 - 1.5 * iqr
-    upper_bound = q3 + 1.5 * iqr
-    # Filter data within the bounds
-    filtered_data = data_values[(data_values >= lower_bound) & (data_values <= upper_bound)]
-    # Calculate min and max values after outlier handling
-    min_value = round(filtered_data.min() - 0.1 * filtered_data.min(), 3)
-    max_value = round(filtered_data.max() + 0.1 * filtered_data.max(), 3)
-    options = {
-        #"title": {"text": f"{display_names[folder_name.upper()]}"},
-        "title": {"text": f"{dataset_name}"},
-        "tooltip": {
-            "trigger": "axis",
-            "axisPointer": {"type": "cross", "label": {"backgroundColor": "#6a7985"}},
-            "triggerOn": 'mousemove',
-        },
-        "legend": {"data": ['Overall Accuracy']},
-        "toolbox": {"feature": {"saveAsImage": {}}},
-        "grid": {"left": "3%", "right": "4%", "bottom": "3%", "containLabel": True},
-        "xAxis": [
-            {
-                "type": "category",
-                "boundaryGap": True,
-                "triggerEvent": True,
-                "data":  chart_data['model_show'].tolist(),
-            }
-        ],
-        "yAxis": [{"type": "value",
-                    "min": min_value,
-                    "max": max_value,
-                    "boundaryGap": True
-                    # "splitNumber": 10
-                    }],
-        "series": [{
-                "name": f"{dataset_name}",
-                "type": "bar",
-                "data": chart_data[f'{new_dataset_name}'].tolist(),
-            }],
-    }
-    events = {
-        "click": "function(params) { return params.value }"
-    }
-    value = st_echarts(options=options, events=events, height="500px")
-    # if value != None:
-    #     # print(value)
-    #     nav_to(value)
-    # if value != None:
-    #     highlight_table_line(value)
     '''
-    Show table
     '''
-    # st.divider()
     with st.container():
-        # st.write("")
-        st.markdown('##### Results')
-        # custom_css = """
-        #             """
-        # st.markdown(custom_css, unsafe_allow_html=True)
         model_link = {key.strip(): val for key, val in zip(info_df['Proper Display Name'], info_df['Link'])}
@@ -148,6 +83,9 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
         chart_data_table = chart_data[['model_show', chart_data.columns[1], chart_data.columns[3]]]
         cur_dataset_name = chart_data_table.columns[1]
         if cur_dataset_name in [
@@ -162,7 +100,6 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
                             'tedlium3_long_form_test',
                             'imda_part1_asr_test',
                             'imda_part2_asr_test',
                             'aishell_asr_zh_test',
                             ]:
@@ -187,10 +124,6 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
                     chart_data_table.columns[1]: {'alignment': 'left'},
                     "model_link": st.column_config.LinkColumn(
                         "Model Link",
-                        # # # help="",
-                        # validate=r"^https://(.*?)$",
-                        # # max_chars=100,
-                        # display_text=r"\[(.*?)\]"
                     ),
                 },
                 hide_index=True,
@@ -198,68 +131,105 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
             )
-        # s = ''
-        # for model in models:
-        #     try:
-        #         # <td align="center"><input type="checkbox" name="select"></td>
-        #         s += f"""<tr>
-        #             <td><a href={model_link[model]}>{model}</a></td>
-        #             <td>{chart_data[chart_data['Model'] == model][new_dataset_name].tolist()[0]}</td>
-        #         </tr>"""
-        #     except:
-        #         # print(f"{model} is not in {dataset_name}")
-        #         continue
-        # # select all function
-        # select_all_function = """<script>
-        #     function toggle(source) {
-        #         var checkboxes = document.querySelectorAll('input[type="checkbox"]');
-        #         for (var i = 0; i < checkboxes.length; i++) {
-        #             if (checkboxes[i] != source)
-        #                 checkboxes[i].checked = source.checked;
-        #         }
-        #     }
-        # </script>"""
-        # st.markdown(f"""
-        #             <div class="select_all">{select_all_function}</div>
-        #             """, unsafe_allow_html=True)
-        # info_body_details = f"""
-        #     <table style="width:80%">
-        #         <thead>
-        #             <tr style="text-align: center;">
-        #                 <th style="width:45%">MODEL</th>
-        #                 <th style="width:45%">{dataset_name}</th>
-        #             </tr>
-        #             {s}
-        #         </thead>
-        #     </table>
-        # """
-        # #<th style="width:10%"><input type="checkbox" onclick="toggle(this);"></th>
-        # # html_code = custom_css + select_all_function + info_body_details
-        # # html(html_code, height = 300)
-        # st.markdown(f"""
-        #             <div class="my-data-table">{info_body_details}</div>
-        #             """, unsafe_allow_html=True)
-    # st.dataframe(chart_data,
-    #             #  column_config = {
-    #             #      "Link": st.column_config.LinkColumn(
-    #             #          display_text= st.image(huggingface_image)
-    #             #      ),
-    #             #  },
-    #                 hide_index = True,
-    #                 use_container_width=True)
     '''
-    show samples
     '''
-    if dataset_name in ['Earnings21-Test', 'Earnings22-Test', 'Tedlium3-Test', 'Tedlium3-Long-form-Test']:
-        pass
-    else:
-        show_examples(category_name, dataset_name, chart_data['Model'].tolist(), display_model_names)

 from app.show_examples import *
 import pandas as pd
+from model_information import get_dataframe
 # huggingface_image = Image.open('style/huggingface.jpg')
 # other info
+# path = "./AudioBench-Leaderboard/additional_info/Leaderboard-Rename.xlsx"
+# path = "./additional_info/Leaderboard-Rename.xlsx"
+# info_df = pd.read_excel(path)
+info_df = get_dataframe()
 # def nav_to(value):
 #     try:
     folder = f"./results/{metrics}/"
     data_path = f'{folder}/{category_name.lower()}.csv'
     chart_data = pd.read_csv(data_path).round(3)
                 """, unsafe_allow_html=True)
     # remap model names
+    display_model_names = {key.strip() :val.strip() for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
+    chart_data['model_show'] = chart_data['Model'].map(lambda x: display_model_names.get(x, x))
     models = st.multiselect("Please choose the model",
                             sorted(chart_data['model_show'].tolist()),
     chart_data = chart_data[chart_data['model_show'].isin(models)]
     chart_data = chart_data.sort_values(by=[new_dataset_name], ascending=cus_sort).dropna(axis=0)
+    if len(chart_data) == 0: return
+    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
     '''
+    Show Table
     '''
     with st.container():
+        st.markdown('##### TABLE')
         model_link = {key.strip(): val for key, val in zip(info_df['Proper Display Name'], info_df['Link'])}
         chart_data_table = chart_data[['model_show', chart_data.columns[1], chart_data.columns[3]]]
+        # Format numeric columns to 3 decimal places
+        chart_data_table[chart_data_table.columns[1]] = chart_data_table[chart_data_table.columns[1]].apply(lambda x: f"{x:.3f}" if isinstance(x, (int, float)) else x)
         cur_dataset_name = chart_data_table.columns[1]
         if cur_dataset_name in [
                             'tedlium3_long_form_test',
                             'imda_part1_asr_test',
                             'imda_part2_asr_test',
                             'aishell_asr_zh_test',
                             ]:
                     chart_data_table.columns[1]: {'alignment': 'left'},
                     "model_link": st.column_config.LinkColumn(
                         "Model Link",
                     ),
                 },
                 hide_index=True,
             )
+    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+    '''
+    Show Chart
+    '''
+    # Initialize a session state variable for toggling the chart visibility
+    if "show_chart" not in st.session_state:
+        st.session_state.show_chart = False
+    # Create a button to toggle visibility
+    if st.button("Show Chart"):
+        st.session_state.show_chart = not st.session_state.show_chart
+    if st.session_state.show_chart:
+        with st.container():
+            st.markdown('##### CHART')
+            # Get Values
+            data_values = chart_data.iloc[:, 1]
+            # Calculate Q1 and Q3
+            q1 = data_values.quantile(0.25)
+            q3 = data_values.quantile(0.75)
+            # Calculate IQR
+            iqr = q3 - q1
+            # Define lower and upper bounds (1.5*IQR is a common threshold)
+            lower_bound = q1 - 1.5 * iqr
+            upper_bound = q3 + 1.5 * iqr
+            # Filter data within the bounds
+            filtered_data = data_values[(data_values >= lower_bound) & (data_values <= upper_bound)]
+            # Calculate min and max values after outlier handling
+            min_value = round(filtered_data.min() - 0.1 * filtered_data.min(), 3)
+            max_value = round(filtered_data.max() + 0.1 * filtered_data.max(), 3)
+            options = {
+                # "title": {"text": f"{dataset_name}"},
+                "tooltip": {
+                    "trigger": "axis",
+                    "axisPointer": {"type": "cross", "label": {"backgroundColor": "#6a7985"}},
+                    "triggerOn": 'mousemove',
+                },
+                "legend": {"data": ['Overall Accuracy']},
+                "toolbox": {"feature": {"saveAsImage": {}}},
+                "grid": {"left": "3%", "right": "4%", "bottom": "3%", "containLabel": True},
+                "xAxis": [
+                    {
+                        "type": "category",
+                        "boundaryGap": True,
+                        "triggerEvent": True,
+                        "data":  chart_data['model_show'].tolist(),
+                    }
+                ],
+                "yAxis": [{"type": "value",
+                            "min": min_value,
+                            "max": max_value,
+                            "boundaryGap": True
+                            # "splitNumber": 10
+                            }],
+                "series": [{
+                        "name": f"{dataset_name}",
+                        "type": "bar",
+                        "data": chart_data[f'{new_dataset_name}'].tolist(),
+                    }],
+            }
+            events = {
+                "click": "function(params) { return params.value }"
+            }
+            value = st_echarts(options=options, events=events, height="500px")
+    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
     '''
+    Show Examples
     '''
+    # Initialize a session state variable for toggling the examples visibility
+    if "show_examples" not in st.session_state:
+        st.session_state.show_examples = False
+    # Create a button to toggle visibility
+    if st.button("Show Examples"):
+        st.session_state.show_examples = not st.session_state.show_examples
+    if st.session_state.show_examples:
+        # if dataset_name in ['Earnings21-Test', 'Earnings22-Test', 'Tedlium3-Test', 'Tedlium3-Long-form-Test']:
+        if dataset_name in []:
+            pass
+        else:
+            show_examples(category_name, dataset_name, chart_data['Model'].tolist(), display_model_names)

app/pages.py CHANGED Viewed

@@ -9,8 +9,8 @@ def dataset_contents(dataset, metrics):
                 .my-dataset-info {
                 # background-color: #F9EBEA;
                 # padding: 10px;
-                color: #626567;
-                font-style: italic;
                 font-size: 8px;
                 height: auto;
                 }
@@ -18,10 +18,10 @@ def dataset_contents(dataset, metrics):
                 """
     st.markdown(custom_css, unsafe_allow_html=True)
     st.markdown(f"""<div class="my-dataset-info">
-                    <p><b>Dataset Information</b>: {dataset}</p>
                     </div>""", unsafe_allow_html=True)
     st.markdown(f"""<div class="my-dataset-info">
-                    <p><b>Metric Information</b>: {metrics}</p>
                     </div>""", unsafe_allow_html=True)
@@ -38,12 +38,16 @@ def dashboard():
     audio_url = "https://arxiv.org/abs/2406.16020"
     st.divider()
-    st.markdown("#### [AudioBench](%s)" % audio_url)
-    st.markdown("##### :dizzy: A comprehensive evaluation benchmark designed for general instruction-following audiolanguage models")
     st.markdown('''
                 ''')
     with st.container():
@@ -51,7 +55,8 @@ def dashboard():
         with center_co:
             st.image("./style/audio_overview.png",
                      caption="Overview of the datasets in AudioBench.",
-                     use_column_width = True)
         st.markdown('''
@@ -60,21 +65,9 @@ def dashboard():
         st.markdown("###### :dart: Our Benchmark includes: ")
         cols = st.columns(10)
-        cols[1].metric(label="Tasks", value="8") #delta="Tasks", delta_color="off"
-        cols[2].metric(label="Datasets", value="26")
-        cols[3].metric(label="Test Models", value="5")
-        # st.markdown("###### :dart: Supported Models and Datasets: ")
-        # sup = pd.DataFrame(
-        #         {"Dataset": "LibriSpeech-Clean",
-        #          "Category": st.selectbox('category', ['Speech Understanding']),
-        #          "Task": st.selectbox('task', ['Automatic Speech Recognition']),
-        #          "Metrics": st.selectbox('metrics', ['WER']),
-        #          "Status":True}
-        # )
-        # st.data_editor(sup, num_rows="dynamic")
     st.divider()
@@ -92,7 +85,7 @@ def dashboard():
                     ''')
 def asr():
-    st.title("Automatic Speech Recognition")
     filters_levelone = ['LibriSpeech-Test-Clean',
                         'LibriSpeech-Test-Other',
@@ -103,41 +96,23 @@ def asr():
                         'Earnings22-Test',
                         'Tedlium3-Test',
                         'Tedlium3-Long-form-Test',
-                        'IMDA-Part1-ASR-Test',
-                        'IMDA-Part2-ASR-Test']
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
-        filter_1 = st.selectbox('Select Dataset', filters_levelone)
-    # with middle:
-    #     if filter_1 == filters_levelone[0]:
-    #         sort_leveltwo = ['LibriSpeech-Test-Clean', 'LibriSpeech-Test-Other', 'Common-Voice-15-En-Test', 'Peoples-Speech-Test',
-    #                         'GigaSpeech-Test', 'Tedlium3-Test','Tedlium3-Long-form-Test', 'Earning-21-Test', 'Earning-22-Test']
-    #     elif filter_1 == filters_levelone[1]:
-    #         sort_leveltwo = ['CN-College-Listen-Test', 'SLUE-P2-SQA5-Test', 'DREAM-TTS-Test', 'Public-SG-SpeechQA-Test']
-    #     elif filter_1 == filters_levelone[2]:
-    #         sort_leveltwo = ['OpenHermes-Audio-Test', 'ALPACA-Audio-Test']
-    #     sort = st.selectbox("Sort Dataset", sort_leveltwo)
-    # with right:
-    #     sorted = st.selectbox('by', ['Ascending', 'Descending'])
     if filter_1:
         dataset_contents(asr_datsets[filter_1], metrics['wer'])
         draw('su', 'ASR', filter_1, 'wer', cus_sort=True)
-    # else:
-    #     draw('su', 'ASR', 'LibriSpeech-Test-Clean', 'wer')
-    ## examples
 def sqa():
-    st.title("Speech Question Answering")
     binary = ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']
@@ -150,7 +125,7 @@ def sqa():
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
-        filter_1 = st.selectbox('Select Dataset', filters_levelone)
     if filter_1:
         if filter_1 in binary:
@@ -160,11 +135,9 @@ def sqa():
         else:
             dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
             draw('su', 'SQA', filter_1, 'llama3_70b_judge')
-    # else:
-    #     draw('su', 'SQA', 'CN-College-Listen-Test', 'llama3_70b_judge_binary')
 def si():
-    st.title("Speech Question Answering")
     filters_levelone = ['OpenHermes-Audio-Test',
                         'ALPACA-Audio-Test']
@@ -172,16 +145,14 @@ def si():
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
-        filter_1 = st.selectbox('Select Dataset', filters_levelone)
     if filter_1:
         dataset_contents(si_datasets[filter_1], metrics['llama3_70b_judge'])
         draw('su', 'SI', filter_1, 'llama3_70b_judge')
-    # else:
-    #     draw('su', 'SI', 'OpenHermes-Audio-Test', 'llama3_70b_judge')
 def ac():
-    st.title("Audio Captioning")
     filters_levelone = ['WavCaps-Test',
                         'AudioCaps-Test']
@@ -190,29 +161,17 @@ def ac():
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
-        filter_1 = st.selectbox('Select Dataset', filters_levelone)
     with middle:
-        metric = st.selectbox('Select Metric', filters_leveltwo)
-    # with middle:
-    #     if filter_1 == filters_levelone[0]:
-    #         sort_leveltwo = ['Clotho-AQA-Test', 'WavCaps-QA-Test', 'AudioCaps-QA-Test']
-    #     elif filter_1 == filters_levelone[1]:
-    #         sort_leveltwo = ['WavCaps-Test', 'AudioCaps-Test']
-    #     sort = st.selectbox("Sort Dataset", sort_leveltwo)
-    # with right:
-    #     sorted = st.selectbox('by', ['Ascending', 'Descending'])
     if filter_1 or metric:
         dataset_contents(ac_datasets[filter_1], metrics[metric.lower().replace('-', '_')])
         draw('asu', 'AC',filter_1, metric.lower().replace('-', '_'))
-    # else:
-    #     draw('asu', 'AC', 'WavCaps-Test', 'llama3_70b_judge')
 def asqa():
-    st.title("Audio Scene Question Answering")
     filters_levelone = ['Clotho-AQA-Test',
                         'WavCaps-QA-Test',
@@ -221,57 +180,39 @@ def asqa():
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
-        filter_1 = st.selectbox('Select Dataset', filters_levelone)
     if filter_1:
         dataset_contents(asqa_datasets[filter_1], metrics['llama3_70b_judge'])
         draw('asu', 'AQA',filter_1, 'llama3_70b_judge')
-    # else:
-    #     draw('asu', 'AQA', 'Clotho-AQA-Test', 'llama3_70b_judge')
 def er():
-    st.title("Emotion Recognition")
     filters_levelone = ['IEMOCAP-Emotion-Test',
                         'MELD-Sentiment-Test',
                         'MELD-Emotion-Test']
-    # sort_leveltwo = []
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
-        filter_1 = st.selectbox('Select Dataset', filters_levelone)
-    # with middle:
-    #     if filter_1 == filters_levelone[0]:
-    #         sort_leveltwo = ['IEMOCAP-Emotion-Test', 'MELD-Sentiment-Test', 'MELD-Emotion-Test']
-    #     elif filter_1 == filters_levelone[1]:
-    #         sort_leveltwo = ['VoxCeleb1-Accent-Test']
-    #     elif filter_1 == filters_levelone[2]:
-    #         sort_leveltwo = ['VoxCeleb1-Gender-Test', 'IEMOCAP-Gender-Test']
-    #     sort = st.selectbox("Sort Dataset", sort_leveltwo)
-    # with right:
-    #     sorted = st.selectbox('by', ['Ascending', 'Descending'])
     if filter_1:
         dataset_contents(er_datasets[filter_1], metrics['llama3_70b_judge_binary'])
         draw('vu', 'ER', filter_1, 'llama3_70b_judge_binary')
-    # else:
-    #     draw('vu', 'ER', 'IEMOCAP-Emotion-Test', 'llama3_70b_judge_binary')
 def ar():
-    st.title("Accent Recognition")
     filters_levelone = ['VoxCeleb-Accent-Test']
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
-        filter_1 = st.selectbox('Select Dataset', filters_levelone)
     if filter_1:
@@ -280,7 +221,7 @@ def ar():
 def gr():
-    st.title("Gender Recognition")
     filters_levelone = ['VoxCeleb-Gender-Test',
                         'IEMOCAP-Gender-Test']
@@ -288,16 +229,15 @@ def gr():
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
-        filter_1 = st.selectbox('Select Dataset', filters_levelone)
     if filter_1:
         dataset_contents(gr_datasets[filter_1], metrics['llama3_70b_judge_binary'])
         draw('vu', 'GR', filter_1, 'llama3_70b_judge_binary')
-    # else:
-    #     draw('vu', 'GR', 'VoxCeleb1-Gender-Test', 'llama3_70b_judge_binary')
 def spt():
-    st.title("Speech Translation")
     filters_levelone = ['Covost2-EN-ID-test',
                         'Covost2-EN-ZH-test',
@@ -309,7 +249,7 @@ def spt():
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
-        filter_1 = st.selectbox('Select Dataset', filters_levelone)
     if filter_1:
         dataset_contents(spt_datasets[filter_1], metrics['bleu'])
@@ -318,17 +258,15 @@ def spt():
     #     draw('su', 'ST', 'Covost2-EN-ID-test', 'bleu')
 def cnasr():
-    st.title("Chinese Automatic Speech Recognition")
     filters_levelone = ['Aishell-ASR-ZH-Test']
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
-        filter_1 = st.selectbox('Select Dataset', filters_levelone)
     if filter_1:
         dataset_contents(cnasr_datasets[filter_1], metrics['wer'])
         draw('su', 'CNASR', filter_1, 'wer')
-    # else:
-    #     draw('su', 'CNASR', 'Aishell-ASR-ZH-Test', 'wer')

                 .my-dataset-info {
                 # background-color: #F9EBEA;
                 # padding: 10px;
+                color: #050505;
+                font-style: normal;
                 font-size: 8px;
                 height: auto;
                 }
                 """
     st.markdown(custom_css, unsafe_allow_html=True)
     st.markdown(f"""<div class="my-dataset-info">
+                    <p><b>About this dataset</b>: {dataset}</p>
                     </div>""", unsafe_allow_html=True)
     st.markdown(f"""<div class="my-dataset-info">
+                    <p><b>About this metric</b>: {metrics}</p>
                     </div>""", unsafe_allow_html=True)
     audio_url = "https://arxiv.org/abs/2406.16020"
+    st.markdown("#### News")
+    st.markdown("Dec, 2024: Update layout and support comparison between models with similar model sizes.")
     st.divider()
+    st.markdown("#### What is [AudioBench](%s)?" % audio_url)
+    st.markdown("##### :dizzy: A comprehensive evaluation benchmark designed for general instruction-following audiolanguage models.")
+    st.markdown("##### :dizzy: A evaluation benchmark that we consistently put effort in updating and maintaining.")
     st.markdown('''
                 ''')
     with st.container():
         with center_co:
             st.image("./style/audio_overview.png",
                      caption="Overview of the datasets in AudioBench.",
+                     use_container_width = True
+                     )
         st.markdown('''
         st.markdown("###### :dart: Our Benchmark includes: ")
         cols = st.columns(10)
+        cols[1].metric(label="Tasks", value=">8") #delta="Tasks", delta_color="off"
+        cols[2].metric(label="Datasets", value=">30")
+        cols[3].metric(label="Evaluated Models", value=">5")
     st.divider()
                     ''')
 def asr():
+    st.title("Task: Automatic Speech Recognition")
     filters_levelone = ['LibriSpeech-Test-Clean',
                         'LibriSpeech-Test-Other',
                         'Earnings22-Test',
                         'Tedlium3-Test',
                         'Tedlium3-Long-form-Test',
+                        #'IMDA-Part1-ASR-Test',
+                        #'IMDA-Part2-ASR-Test'
+                        ]
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
+        filter_1 = st.selectbox('Dataset', filters_levelone)
     if filter_1:
         dataset_contents(asr_datsets[filter_1], metrics['wer'])
         draw('su', 'ASR', filter_1, 'wer', cus_sort=True)
 def sqa():
+    st.title("Task: Speech Question Answering")
     binary = ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
+        filter_1 = st.selectbox('Dataset', filters_levelone)
     if filter_1:
         if filter_1 in binary:
         else:
             dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
             draw('su', 'SQA', filter_1, 'llama3_70b_judge')
 def si():
+    st.title("Task: Speech Instruction")
     filters_levelone = ['OpenHermes-Audio-Test',
                         'ALPACA-Audio-Test']
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
+        filter_1 = st.selectbox('Dataset', filters_levelone)
     if filter_1:
         dataset_contents(si_datasets[filter_1], metrics['llama3_70b_judge'])
         draw('su', 'SI', filter_1, 'llama3_70b_judge')
 def ac():
+    st.title("Task: Audio Captioning")
     filters_levelone = ['WavCaps-Test',
                         'AudioCaps-Test']
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
+        filter_1 = st.selectbox('Dataset', filters_levelone)
     with middle:
+        metric = st.selectbox('Metric', filters_leveltwo)
     if filter_1 or metric:
         dataset_contents(ac_datasets[filter_1], metrics[metric.lower().replace('-', '_')])
         draw('asu', 'AC',filter_1, metric.lower().replace('-', '_'))
 def asqa():
+    st.title("Task: Audio Scene Question Answering")
     filters_levelone = ['Clotho-AQA-Test',
                         'WavCaps-QA-Test',
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
+        filter_1 = st.selectbox('Dataset', filters_levelone)
     if filter_1:
         dataset_contents(asqa_datasets[filter_1], metrics['llama3_70b_judge'])
         draw('asu', 'AQA',filter_1, 'llama3_70b_judge')
 def er():
+    st.title("Task: Emotion Recognition")
     filters_levelone = ['IEMOCAP-Emotion-Test',
                         'MELD-Sentiment-Test',
                         'MELD-Emotion-Test']
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
+        filter_1 = st.selectbox('Dataset', filters_levelone)
     if filter_1:
         dataset_contents(er_datasets[filter_1], metrics['llama3_70b_judge_binary'])
         draw('vu', 'ER', filter_1, 'llama3_70b_judge_binary')
 def ar():
+    st.title("Task: Accent Recognition")
     filters_levelone = ['VoxCeleb-Accent-Test']
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
+        filter_1 = st.selectbox('Dataset', filters_levelone)
     if filter_1:
 def gr():
+    st.title("Task: Gender Recognition")
     filters_levelone = ['VoxCeleb-Gender-Test',
                         'IEMOCAP-Gender-Test']
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
+        filter_1 = st.selectbox('Dataset', filters_levelone)
     if filter_1:
         dataset_contents(gr_datasets[filter_1], metrics['llama3_70b_judge_binary'])
         draw('vu', 'GR', filter_1, 'llama3_70b_judge_binary')
 def spt():
+    st.title("Task: Speech Translation")
     filters_levelone = ['Covost2-EN-ID-test',
                         'Covost2-EN-ZH-test',
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
+        filter_1 = st.selectbox('Dataset', filters_levelone)
     if filter_1:
         dataset_contents(spt_datasets[filter_1], metrics['bleu'])
     #     draw('su', 'ST', 'Covost2-EN-ID-test', 'bleu')
 def cnasr():
+    st.title("Task: Automatic Speech Recognition (Chinese)")
     filters_levelone = ['Aishell-ASR-ZH-Test']
     left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
+        filter_1 = st.selectbox('Dataset', filters_levelone)
     if filter_1:
         dataset_contents(cnasr_datasets[filter_1], metrics['wer'])
         draw('su', 'CNASR', filter_1, 'wer')

app/show_examples.py CHANGED Viewed

@@ -2,6 +2,9 @@ import streamlit as st
 import datasets
 import numpy as np
 def show_examples(category_name, dataset_name, model_lists, display_model_names):
     st.divider()
     sample_folder = f"./examples/{category_name}/{dataset_name}"
@@ -16,57 +19,6 @@ def show_examples(category_name, dataset_name, model_lists, display_model_names)
             # with col1:
             st.audio(f'{sample_folder}/sample_{index}.wav', format="audio/wav")
-            # with col2:
-            #     with st.container():
-            #         custom_css = """
-            #                     <style>
-            #                     .my-container-question {
-            #                     background-color: #F5EEF8;
-            #                     padding: 10px;
-            #                     border-radius: 10px;
-            #                     height: auto;
-            #                     }
-            #                     </style>
-            #                     """
-            #         st.markdown(custom_css, unsafe_allow_html=True)
-            #         if dataset_name in ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']:
-            #             choices = dataset[index]['other_attributes']['choices']
-            #             if isinstance(choices, str):
-            #                 choices_text = choices
-            #             elif isinstance(choices, list):
-            #                 choices_text = ' '.join(i for i in choices)
-            #             question_text = f"""<div class="my-container-question">
-            #                                 <p>QUESTION: {dataset[index]['instruction']['text']}</p>
-            #                                 <p>CHOICES: {choices_text}</p>
-            #                                 </div>
-            #                                 """
-            #         else:
-            #             question_text = f"""<div class="my-container-question">
-            #                             <p>QUESTION: {dataset[index]['instruction']['text']}</p>
-            #                             </div>"""
-            #         st.markdown(question_text, unsafe_allow_html=True)
-                # with st.container():
-                #     custom_css = """
-                #                 <style>
-                #                 .my-container-answer {
-                #                 background-color: #F9EBEA;
-                #                 padding: 10px;
-                #                 border-radius: 10px;
-                #                 height: auto;
-                #                 }
-                #                 </style>
-                #                 """
-                #     st.markdown(custom_css, unsafe_allow_html=True)
-                #     st.markdown(f"""<div class="my-container-answer">
-                #                 <p>CORRECT ANSWER: {dataset[index]['answer']['text']}</p>
-                #                 </div>""", unsafe_allow_html=True)
             if dataset_name in ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']:
                 choices = dataset[index]['other_attributes']['choices']
@@ -78,6 +30,8 @@ def show_examples(category_name, dataset_name, model_lists, display_model_names)
                 question_text = f"""{dataset[index]['instruction']['text']} {choices_text}"""
             else:
                 question_text = f"""{dataset[index]['instruction']['text']}"""
             # st.divider()
             with st.container():
@@ -99,33 +53,44 @@ def show_examples(category_name, dataset_name, model_lists, display_model_names)
                 s = f"""<tr>
                        <td><b>REFERENCE</td>
-                       <td><b>{question_text.replace('(A)', '<br>(A)').replace('(B)', '<br>(B)').replace('(C)', '<br>(C)')}
                        </td>
-                       <td><b>{dataset[index]['answer']['text']}
                        </td>
                 </tr>
                 """
                 if dataset_name in ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']:
                     for model in model_lists:
                         try:
                             s += f"""<tr>
                                 <td>{display_model_names[model]}</td>
                                 <td>
                                     {dataset[index][model]['text'].replace('Choices:', '<br>Choices:').replace('(A)', '<br>(A)').replace('(B)', '<br>(B)').replace('(C)', '<br>(C)')
                                      }
                                     </td>
-                                <td>{dataset[index][model]['model_prediction']}</td>
                             </tr>"""
                         except:
                             print(f"{model} is not in {dataset_name}")
                             continue
                 else:
                     for model in model_lists:
                         try:
                             s += f"""<tr>
                                 <td>{display_model_names[model]}</td>
-                                <td>{dataset[index][model]['text']}</td>
-                                <td>{dataset[index][model]['model_prediction']}</td>
                             </tr>"""
                         except:
                             print(f"{model} is not in {dataset_name}")
@@ -136,8 +101,8 @@ def show_examples(category_name, dataset_name, model_lists, display_model_names)
                 <thead>
                     <tr style="text-align: center;">
                         <th style="width:20%">MODEL</th>
-                        <th style="width:40%">QUESTION</th>
-                        <th style="width:40%">MODEL PREDICTION</th>
                     </tr>
                 {s}
                 </thead>

 import datasets
 import numpy as np
+import html
 def show_examples(category_name, dataset_name, model_lists, display_model_names):
     st.divider()
     sample_folder = f"./examples/{category_name}/{dataset_name}"
             # with col1:
             st.audio(f'{sample_folder}/sample_{index}.wav', format="audio/wav")
             if dataset_name in ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']:
                 choices = dataset[index]['other_attributes']['choices']
                 question_text = f"""{dataset[index]['instruction']['text']} {choices_text}"""
             else:
                 question_text = f"""{dataset[index]['instruction']['text']}"""
+            question_text = html.escape(question_text)
             # st.divider()
             with st.container():
                 s = f"""<tr>
                        <td><b>REFERENCE</td>
+                       <td><b>{html.escape(question_text.replace('(A)', '<br>(A)').replace('(B)', '<br>(B)').replace('(C)', '<br>(C)'))}
                        </td>
+                       <td><b>{html.escape(dataset[index]['answer']['text'])}
                        </td>
                 </tr>
                 """
                 if dataset_name in ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']:
                     for model in model_lists:
                         try:
+                            model_prediction = dataset[index][model]['model_prediction']
+                            model_prediction = model_prediction.replace('<','').replace('>','').replace('\n','(newline)').replace('*','')
                             s += f"""<tr>
                                 <td>{display_model_names[model]}</td>
                                 <td>
                                     {dataset[index][model]['text'].replace('Choices:', '<br>Choices:').replace('(A)', '<br>(A)').replace('(B)', '<br>(B)').replace('(C)', '<br>(C)')
                                      }
                                     </td>
+                                <td>{html.escape(model_prediction)}</td>
                             </tr>"""
                         except:
                             print(f"{model} is not in {dataset_name}")
                             continue
                 else:
                     for model in model_lists:
+                        print(dataset[index][model]['model_prediction'])
                         try:
+                            model_prediction = dataset[index][model]['model_prediction']
+                            model_prediction = model_prediction.replace('<','').replace('>','').replace('\n','(newline)').replace('*','')
                             s += f"""<tr>
                                 <td>{display_model_names[model]}</td>
+                                <td>{html.escape(dataset[index][model]['text'])}</td>
+                                <td>{html.escape(model_prediction)}</td>
                             </tr>"""
                         except:
                             print(f"{model} is not in {dataset_name}")
                 <thead>
                     <tr style="text-align: center;">
                         <th style="width:20%">MODEL</th>
+                        <th style="width:30%">QUESTION</th>
+                        <th style="width:50%">MODEL PREDICTION</th>
                     </tr>
                 {s}
                 </thead>