File size: 3,197 Bytes
f3ecff1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dd0b5c
f3ecff1
1dd0b5c
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# NOTE: all emoji below are written as ASCII escape sequences. The literals had
# previously been corrupted into multi-character mojibake (UTF-8 bytes decoded
# with a legacy code page), which broke the single-character membership tests
# against emo_set / event_set. Escapes keep the tables immune to re-encoding.

# Emotion tag -> emoji that format_str_v2 appends to the end of a segment.
emo_dict = {
    "<|HAPPY|>": "\U0001F60A",  # smiling face with smiling eyes
    "<|SAD|>": "\U0001F614",  # pensive face
    "<|ANGRY|>": "\U0001F621",  # pouting face
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "\U0001F630",  # anxious face with sweat
    "<|DISGUSTED|>": "\U0001F922",  # nauseated face
    "<|SURPRISED|>": "\U0001F62E",  # face with open mouth
}

# Audio-event tag -> emoji that format_str_v2 prepends to a segment.
# NOTE(review): "<|Cough|>" maps to the sneezing face here but to the
# medical-mask face in emoji_dict (and event_set contains the mask face).
# Left unchanged because callers may rely on the current output; confirm which
# one is intended.
event_dict = {
    "<|BGM|>": "\U0001F3BC",  # musical score
    "<|Speech|>": "",
    "<|Applause|>": "\U0001F44F",  # clapping hands
    "<|Laughter|>": "\U0001F600",  # grinning face
    "<|Cry|>": "\U0001F62D",  # loudly crying face
    "<|Sneeze|>": "\U0001F927",  # sneezing face
    "<|Breath|>": "",
    "<|Cough|>": "\U0001F927",  # sneezing face (see NOTE above)
}

# Language tags; every one is normalised to the generic "<|lang|>" separator.
lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

# Every special token that format_str_v2 counts and strips from the text.
# Insertion order matters: the compound "<|nospeech|><|Event_UNK|>" key must
# come before its component tokens so it is matched as a whole first.
emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "\u2753",  # question mark ornament
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "\U0001F60A",  # smiling face with smiling eyes
    "<|SAD|>": "\U0001F614",  # pensive face
    "<|ANGRY|>": "\U0001F621",  # pouting face
    "<|NEUTRAL|>": "",
    "<|BGM|>": "\U0001F3BC",  # musical score
    "<|Speech|>": "",
    "<|Applause|>": "\U0001F44F",  # clapping hands
    "<|Laughter|>": "\U0001F600",  # grinning face
    "<|FEARFUL|>": "\U0001F630",  # anxious face with sweat
    "<|DISGUSTED|>": "\U0001F922",  # nauseated face
    "<|SURPRISED|>": "\U0001F62E",  # face with open mouth
    "<|Cry|>": "\U0001F62D",  # loudly crying face
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "\U0001F927",  # sneezing face
    "<|Breath|>": "",
    "<|Cough|>": "\U0001F637",  # face with medical mask
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

# Single-character emotion emoji; checked against the LAST character of a segment.
emo_set = {
    "\U0001F60A",  # smiling face with smiling eyes
    "\U0001F614",  # pensive face
    "\U0001F621",  # pouting face
    "\U0001F630",  # anxious face with sweat
    "\U0001F922",  # nauseated face
    "\U0001F62E",  # face with open mouth
}

# Single-character event emoji; checked against the FIRST character of a segment.
event_set = {
    "\U0001F3BC",  # musical score
    "\U0001F44F",  # clapping hands
    "\U0001F600",  # grinning face
    "\U0001F62D",  # loudly crying face
    "\U0001F927",  # sneezing face
    "\U0001F637",  # face with medical mask
}


def format_str_v2(s):
    """Replace the special tokens in one segment with event/emotion emoji.

    Every key of ``emoji_dict`` is counted and then removed from ``s``. The
    emoji of each event tag that occurred at least once is placed in front of
    the remaining text, and the emoji of the most frequent emotion tag is
    placed after it. Spaces directly next to any emoji are removed.

    Args:
        s: Text of a single segment, possibly containing ``<|...|>`` tokens.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    # Count and strip sequentially: each count is taken on the text left over
    # after the previous tokens have already been removed.
    occurrences = {}
    for token in emoji_dict:
        occurrences[token] = s.count(token)
        s = s.replace(token, "")

    # max() returns the first maximal item, so listing NEUTRAL first makes it
    # win ties; among the others the earliest key in emo_dict wins.
    dominant = max(["<|NEUTRAL|>", *emo_dict], key=lambda tag: occurrences[tag])

    # Each detected event is pushed onto the front, so later keys of
    # event_dict end up leftmost.
    detected = [event_dict[tag] for tag in event_dict if occurrences[tag] > 0]
    s = "".join(reversed(detected)) + s + emo_dict[dominant]

    for mark in emo_set.union(event_set):
        s = s.replace(" " + mark, mark).replace(mark + " ", mark)
    return s.strip()

def rich_transcription_postprocess(s):
    """Turn a raw tagged transcript into plain text decorated with emoji.

    The string is split on language tags (every key of ``lang_dict``), each
    piece is cleaned with ``format_str_v2``, and the pieces are concatenated.
    While concatenating, an event emoji that repeats the previous piece's
    leading event emoji is dropped, and a trailing emotion emoji is removed
    from the accumulated text when the next piece ends with the same one.

    Args:
        s: Raw transcript containing ``<|...|>`` special tokens.

    Returns:
        The merged transcript with surrounding whitespace stripped.
    """

    def get_emo(text):
        # Slice instead of indexing so an empty string gives None, not IndexError.
        tail = text[-1:]
        return tail if tail in emo_set else None

    def get_event(text):
        # Slice instead of indexing so an empty string gives None, not IndexError.
        head = text[:1]
        return head if head in event_set else None

    # U+2753 question mark ornament (the previous literal was mis-encoded).
    # Must run before the language loop, which would otherwise consume the
    # "<|nospeech|>" half of this compound token.
    s = s.replace("<|nospeech|><|Event_UNK|>", "\u2753")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")

    segments = [format_str_v2(part).strip(" ") for part in s.split("<|lang|>")]
    new_s = " " + segments[0]
    # new_s starts with a space here, so this is None unless a space were an event.
    cur_ent_event = get_event(new_s)
    for segment in segments[1:]:
        if not segment:
            continue
        event = get_event(segment)
        if event is not None and event == cur_ent_event:
            # Same event as the previous segment: drop the duplicate emoji.
            # This can leave the segment empty, which the helpers now tolerate.
            segment = segment[1:]
        cur_ent_event = get_event(segment)
        emo = get_emo(segment)
        if emo is not None and emo == get_emo(new_s):
            # Same emotion as the text so far: keep only the later emoji.
            new_s = new_s[:-1]
        new_s += segment.strip()
    # NOTE(review): presumably removes a spurious "The." emitted by the model;
    # confirm against the recogniser's known artefacts.
    new_s = new_s.replace("The.", " ")
    return new_s.strip()

def rich_print_asr_res(asr_res, will_print=True, remove_punc=False):
    """Post-process a sequence of raw transcripts and join them into one string.

    Args:
        asr_res: Iterable of raw transcript strings; each one is passed
            through ``rich_transcription_postprocess``.
        will_print: When true, the joined result is printed to stdout.
        remove_punc: When true, the fullwidth comma (U+FF0C) and the
            ideographic full stop (U+3002) are removed from the result.

    Returns:
        The processed transcripts concatenated without a separator.
    """
    res = "".join(rich_transcription_postprocess(item) for item in asr_res)

    if remove_punc:
        # Escapes are used because the previous literals were mis-encoded and
        # therefore never matched the real punctuation characters.
        res = res.replace("\uff0c", "").replace("\u3002", "")

    if will_print:
        print(res)

    return res