zhuohan-7 committed on
Commit
f7a3f90
·
1 Parent(s): 0b7eb99

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. app/content.py +19 -18
  2. app/draw_diagram.py +3 -33
  3. app/pages.py +50 -18
  4. app/show_examples.py +73 -0
app/content.py CHANGED
@@ -61,17 +61,17 @@ displayname2datasetname = {
61
  'Parliament-Short': 'parliament_short_test',
62
  'UKUS-News-Short' : 'ukusnews_short_test',
63
  'Mediacorp-Short' : 'mediacorp_short_test',
64
- 'YTB-ASR-Batch1(English with Singapore Content)': 'ytb_asr_batch1',
65
- 'YTB-ASR-Batch2(English with Strong Emotion)': 'ytb_asr_batch2',
66
- 'YTB-ASR-Batch3-Malay(Malay and Malay-Eng-CodeSwitch)': 'ytb_asr_batch3_ms',
67
- 'YTB-ASR-Batch3-Malay-Malay-Prompte(Malay and Malay-Eng-CodeSwitch)': 'ytb_asr_batch3_ms_ms_prompt',
68
 
69
- 'SEAME-Dev-Man' : 'seame_dev_man',
70
- 'SEAME-Dev-Sge' : 'seame_dev_sge',
71
 
72
- 'YTB-SQA-Batch1(English with Singapore Content)': 'ytb_sqa_batch1',
73
- 'YTB-SDS-Batch1(English with Singapore Content)': 'ytb_sds_batch1',
74
- 'YTB-PQA-Batch1(English with Singapore Content)': 'ytb_pqa_batch1',
75
 
76
  }
77
 
@@ -141,16 +141,17 @@ dataset_diaplay_information = {
141
  'Parliament-Short': 'Under Development',
142
  'UKUS-News-Short' : 'Under Development',
143
  'Mediacorp-Short' : 'Under Development',
144
- 'YTB-ASR-Batch1(English with Singapore Content)' : 'YouTube ASR Dataset, English with Singapore Content',
145
- 'YTB-ASR-Batch2(English with Strong Emotion)' : 'YouTube ASR Dataset, English with strong emotions',
146
- 'YTB-ASR-Batch3-Malay(Malay and Malay-Eng-CodeSwitch)': 'YouTube ASR Dataset, Malay and Malay-English CondeSwitch',
147
- 'YTB-ASR-Batch3-Malay-Malay-Prompte(Malay and Malay-Eng-CodeSwitch)': 'YouTube ASR Dataset, Malay and Malay-English CondeSwitch. Use Malay prompts',
148
- 'SEAME-Dev-Man' : 'Under Development',
149
- 'SEAME-Dev-Sge' : 'Under Development',
 
150
 
151
- 'YTB-SQA-Batch1(English with Singapore Content)': 'Under Development',
152
- 'YTB-SDS-Batch1(English with Singapore Content)': 'Under Development',
153
- 'YTB-PQA-Batch1(English with Singapore Content)': 'Under Development',
154
 
155
 
156
  }
 
61
  'Parliament-Short': 'parliament_short_test',
62
  'UKUS-News-Short' : 'ukusnews_short_test',
63
  'Mediacorp-Short' : 'mediacorp_short_test',
64
+ 'YouTube ASR: English with Singapore Content': 'ytb_asr_batch1',
65
+ 'YouTube ASR: English with Strong Emotion': 'ytb_asr_batch2',
66
+ 'YouTube ASR: Malay with English Prompt': 'ytb_asr_batch3_ms',
67
+ 'YouTube ASR: Malay with Malay Prompt': 'ytb_asr_batch3_ms_ms_prompt',
68
 
69
+ 'SEAME-Dev-Mandarin' : 'seame_dev_man',
70
+ 'SEAME-Dev-Singlish' : 'seame_dev_sge',
71
 
72
+ 'YouTube SQA: English with Singapore Content': 'ytb_sqa_batch1',
73
+ 'YouTube SDS: English with Singapore Content': 'ytb_sds_batch1',
74
+ 'YouTube PQA: English with Singapore Content': 'ytb_pqa_batch1',
75
 
76
  }
77
 
 
141
  'Parliament-Short': 'Under Development',
142
  'UKUS-News-Short' : 'Under Development',
143
  'Mediacorp-Short' : 'Under Development',
144
+ 'YouTube ASR: English Singapore Content' : '''\nYouTube Evaluation Dataset for ASR Task: This dataset include English and Singlish with Singapore Content.''',
145
+ 'YouTube ASR: English with Strong Emotion' : '\nYouTube Evaluation Dataset for ASR Task. English with strong emotions',
146
+ 'YouTube ASR: Malay English Prompt': 'YouTube ASR Dataset, Malay and Malay-English CondeSwitch',
147
+ 'YouTube ASR: Malay with Malay Prompt': 'YouTube ASR Dataset, Malay and Malay-English CondeSwitch. Use Malay prompts',
148
+
149
+ 'SEAME-Dev-Mandarin' : 'Under Development',
150
+ 'SEAME-Dev-Singlish' : 'Under Development',
151
 
152
+ 'YouTube SQA: English with Singapore Content': 'Under Development',
153
+ 'YouTube SDS: English with Singapore Content': 'Under Development',
154
+ 'YouTube PQA: English with Singapore Content': 'Under Development',
155
 
156
 
157
  }
app/draw_diagram.py CHANGED
@@ -25,7 +25,6 @@ def draw(folder_name, category_name, displayname, metrics, cus_sort=True):
25
  # Rename to proper display name
26
  chart_data = chart_data.rename(columns=datasetname2diaplayname)
27
 
28
-
29
  st.markdown("""
30
  <style>
31
  .stMultiSelect [data-baseweb=select] span {
@@ -52,9 +51,8 @@ def draw(folder_name, category_name, displayname, metrics, cus_sort=True):
52
  chart_data = chart_data.sort_values(by=[displayname], ascending=cus_sort).dropna(axis=0)
53
 
54
  if len(chart_data) == 0: return
55
-
56
-
57
-
58
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
59
  '''
60
  Show Table
@@ -222,32 +220,4 @@ def draw(folder_name, category_name, displayname, metrics, cus_sort=True):
222
  }
223
 
224
  value = st_echarts(options=options, events=events, height="500px")
225
-
226
-
227
-
228
-
229
- # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
230
-
231
- '''
232
- Show Examples
233
- '''
234
-
235
-
236
- # Initialize a session state variable for toggling the chart visibility
237
- if "show_examples" not in st.session_state:
238
- st.session_state.show_examples = False
239
-
240
- # Create a button to toggle visibility
241
- if st.button("Show Examples"):
242
- st.session_state.show_examples = not st.session_state.show_examples
243
-
244
- if st.session_state.show_examples:
245
-
246
- st.markdown('To be implemented')
247
-
248
- # # if dataset_name in ['Earnings21-Test', 'Earnings22-Test', 'Tedlium3-Test', 'Tedlium3-Long-form-Test']:
249
- # if dataset_name in []:
250
- # pass
251
- # else:
252
- # show_examples(category_name, dataset_name, chart_data['Model'].tolist(), display_model_names)
253
-
 
25
  # Rename to proper display name
26
  chart_data = chart_data.rename(columns=datasetname2diaplayname)
27
 
 
28
  st.markdown("""
29
  <style>
30
  .stMultiSelect [data-baseweb=select] span {
 
51
  chart_data = chart_data.sort_values(by=[displayname], ascending=cus_sort).dropna(axis=0)
52
 
53
  if len(chart_data) == 0: return
54
+
55
+
 
56
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
57
  '''
58
  Show Table
 
220
  }
221
 
222
  value = st_echarts(options=options, events=events, height="500px")
223
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/pages.py CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
2
  from app.draw_diagram import *
3
  from app.content import *
4
  from app.summarization import *
 
5
 
6
  def dataset_contents(dataset, metrics):
7
 
@@ -539,16 +540,17 @@ def under_development():
539
  'UKUS-News-Short',
540
  'Mediacorp-Short',
541
 
542
- 'YTB-ASR-Batch1(English with Singapore Content)',
543
- 'YTB-ASR-Batch2(English with Strong Emotion)',
544
- 'YTB-ASR-Batch3-Malay(Malay and Malay-Eng-CodeSwitch)',
545
- 'YTB-ASR-Batch3-Malay-Malay-Prompte(Malay and Malay-Eng-CodeSwitch)',
546
- 'SEAME-Dev-Man',
547
- 'SEAME-Dev-Sge',
 
548
 
549
- 'YTB-SQA-Batch1(English with Singapore Content)',
550
- 'YTB-SDS-Batch1(English with Singapore Content)',
551
- 'YTB-PQA-Batch1(English with Singapore Content)',
552
 
553
  ]
554
 
@@ -561,6 +563,34 @@ def under_development():
561
 
562
  dataset_contents(dataset_diaplay_information[filter_1], 'under_development')
563
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
  if filter_1 in [
565
  'CNA',
566
  'IDPC',
@@ -571,20 +601,22 @@ def under_development():
571
  'Parliament-Short',
572
  'UKUS-News-Short',
573
  'Mediacorp-Short',
574
- 'YTB-ASR-Batch1(English with Singapore Content)',
575
- 'YTB-ASR-Batch2(English with Strong Emotion)',
576
- 'YTB-ASR-Batch3-Malay(Malay and Malay-Eng-CodeSwitch)',
577
- 'YTB-ASR-Batch3-Malay-Malay-Prompte(Malay and Malay-Eng-CodeSwitch)',
578
- 'SEAME-Dev-Man',
579
- 'SEAME-Dev-Sge',
 
 
580
  ]:
581
 
582
  draw('vu', 'under_development_wer', filter_1, 'wer')
583
 
584
  elif filter_1 in [
585
- 'YTB-SQA-Batch1(English with Singapore Content)',
586
- 'YTB-SDS-Batch1(English with Singapore Content)',
587
- 'YTB-PQA-Batch1(English with Singapore Content)',
588
  ]:
589
  draw('vu', 'under_development_llama3_70b_judge', filter_1, 'llama3_70b_judge')
590
 
 
2
  from app.draw_diagram import *
3
  from app.content import *
4
  from app.summarization import *
5
+ from app.show_examples import *
6
 
7
  def dataset_contents(dataset, metrics):
8
 
 
540
  'UKUS-News-Short',
541
  'Mediacorp-Short',
542
 
543
+ 'YouTube ASR: English Singapore Content',
544
+ 'YouTube ASR: English with Strong Emotion',
545
+ 'YouTube ASR: Malay English Prompt',
546
+ 'YouTube ASR: Malay with Malay Prompt',
547
+
548
+ 'SEAME-Dev-Mandarin',
549
+ 'SEAME-Dev-Singlish',
550
 
551
+ 'YouTube SQA: English with Singapore Content',
552
+ 'YouTube SDS: English with Singapore Content',
553
+ 'YouTube PQA: English with Singapore Content',
554
 
555
  ]
556
 
 
563
 
564
  dataset_contents(dataset_diaplay_information[filter_1], 'under_development')
565
 
566
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
567
+
568
+ '''
569
+ Show Dataset Examples
570
+ '''
571
+
572
+ # Initialize a session state variable for toggling the chart visibility
573
+ if "show_dataset_examples" not in st.session_state:
574
+ st.session_state.show_dataset_examples = False
575
+
576
+ # Create a button to toggle visibility
577
+ if st.button("Show Dataset Examples"):
578
+ st.session_state.show_dataset_examples = not st.session_state.show_dataset_examples
579
+
580
+ if st.session_state.show_dataset_examples:
581
+
582
+ # st.markdown('To be implemented')
583
+
584
+ # # if dataset_name in ['Earnings21-Test', 'Earnings22-Test', 'Tedlium3-Test', 'Tedlium3-Long-form-Test']:
585
+ if filter_1 in []:
586
+ pass
587
+ else:
588
+ try:
589
+ show_dataset_examples(filter_1)
590
+ except:
591
+ st.markdown('To be implemented')
592
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
593
+
594
  if filter_1 in [
595
  'CNA',
596
  'IDPC',
 
601
  'Parliament-Short',
602
  'UKUS-News-Short',
603
  'Mediacorp-Short',
604
+
605
+ 'YouTube ASR: English Singapore Content',
606
+ 'YouTube ASR: English with Strong Emotion',
607
+ 'YouTube ASR: Malay English Prompt',
608
+ 'YouTube ASR: Malay with Malay Prompt',
609
+
610
+ 'SEAME-Dev-Mandarin',
611
+ 'SEAME-Dev-Singlish',
612
  ]:
613
 
614
  draw('vu', 'under_development_wer', filter_1, 'wer')
615
 
616
  elif filter_1 in [
617
+ 'YouTube SQA: English with Singapore Content',
618
+ 'YouTube SDS: English with Singapore Content',
619
+ 'YouTube PQA: English with Singapore Content',
620
  ]:
621
  draw('vu', 'under_development_llama3_70b_judge', filter_1, 'llama3_70b_judge')
622
 
app/show_examples.py CHANGED
@@ -4,6 +4,79 @@ import numpy as np
4
 
5
  import html
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  def show_examples(category_name, dataset_name, model_lists, display_model_names):
9
  st.divider()
 
4
 
5
  import html
6
 
7
+ from app.content import displayname2datasetname
8
+
9
def show_dataset_examples(display_name):
    """Render the stored examples of one dataset: audio, prompt and answer.

    The display name is translated to a dataset name through
    ``displayname2datasetname``; the examples are then loaded from
    ``./examples/<dataset_name>`` with ``datasets.load_from_disk`` and each
    one is drawn as an audio player followed by a two-column
    PROMPT / ANSWER table.

    Args:
        display_name: Key of ``displayname2datasetname``.

    Raises:
        KeyError: If ``display_name`` is not a key of
            ``displayname2datasetname``.
            NOTE(review): callers pass UI labels here; confirm those labels
            are spelled exactly like the mapping's keys.
    """
    st.divider()
    dataset_name = displayname2datasetname[display_name]
    sample_folder = f"./examples/{dataset_name}"

    # load dataset
    dataset = datasets.load_from_disk(sample_folder)

    # The stylesheet is the same for every example, so it is emitted once
    # instead of once per loop iteration.
    table_css = """<style>
    .my-container-table, p.my-container-text {
        background-color: #fcf8dc;
        padding: 10px;
        border-radius: 5px;
        font-size: 13px;
        /* height: 50px; */
        word-wrap: break-word
    }
    </style>"""
    st.markdown(table_css, unsafe_allow_html=True)

    for index, sample in enumerate(dataset):
        with st.container():
            st.markdown(f'##### Example-{index+1}')
            # The column pair is created but never populated; the call is
            # kept so the emitted page elements stay the same as before.
            st.columns([0.3, 0.7], vertical_alignment="center")

            st.audio(f'{sample_folder}/sample_{index}.wav', format="audio/wav")

            question_text = f"{sample['instruction']['text']}"
            if dataset_name in ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']:
                choices = sample['other_attributes']['choices']
                if isinstance(choices, list):
                    choices_text = ' '.join(str(choice) for choice in choices)
                else:
                    # Covers plain strings and, defensively, any other type
                    # (previously an unbound-variable error).
                    choices_text = str(choices)
                question_text = f"{question_text} {choices_text}"

            # Escape exactly once, THEN insert the line breaks: escaping
            # after inserting <br> would turn the tags into visible text,
            # and escaping twice would show "&amp;" instead of "&".
            prompt_html = html.escape(question_text)
            for option_marker in ('(A)', '(B)', '(C)'):
                prompt_html = prompt_html.replace(option_marker, f'<br>{option_marker}')

            answer_html = html.escape(sample['answer']['text'])

            # Built without indentation or blank lines so the Markdown
            # renderer treats the whole table as a single HTML block.
            table_html = (
                '<div class="my-container-table">'
                '<table style="table-layout: fixed; width:100%">'
                '<thead>'
                '<tr style="text-align: center;">'
                '<th style="width:50%">PROMPT</th>'
                '<th style="width:50%">ANSWER</th>'
                '</tr>'
                '</thead>'
                '<tbody>'
                '<tr>'
                f'<td><b>{prompt_html}</b></td>'
                f'<td><b>{answer_html}</b></td>'
                '</tr>'
                '</tbody>'
                '</table>'
                '</div>'
            )

            with st.container():
                st.markdown(table_html, unsafe_allow_html=True)

            st.text("")

        st.divider()
79
+
80
 
81
  def show_examples(category_name, dataset_name, model_lists, display_model_names):
82
  st.divider()