AXERA-TECH
/

SenseVoice

Automatic Speech Recognition

Model card Files Files and versions

SenseVoice / print_utils.py

inoryQwQ's picture

update codes

1dd0b5c 4 months ago

history blame contribute delete

3.2 kB

	# Emotion tag -> emoji that format_str_v2 appends to the end of a segment.
	# The neutral tag maps to "" so a neutral segment gets no trailing emoji.
	# Key order matters: format_str_v2 keeps the first tag whose count is
	# strictly greater than the current best, so earlier keys win ties.
	# NOTE(review): the backslash before each "|" is reproduced exactly as found;
	# it looks like Markdown-escaping residue -- confirm the intended tag text.
	emo_dict = {
	    "<\|HAPPY\|>": "😊",
	    "<\|SAD\|>": "😔",
	    "<\|ANGRY\|>": "😡",
	    "<\|NEUTRAL\|>": "",
	    "<\|FEARFUL\|>": "😰",
	    "<\|DISGUSTED\|>": "🤢",
	    "<\|SURPRISED\|>": "😮",
	}

	# Audio-event tag -> emoji that format_str_v2 prepends to a segment when the
	# tag occurs at least once. Tags mapped to "" contribute no emoji.
	# Cough maps to the same emoji that emoji_dict uses for it and that
	# event_set lists; it previously reused the sneeze emoji, which left the
	# cough emoji in event_set unreachable.
	# NOTE(review): the backslash before each "|" is reproduced exactly as found;
	# it looks like Markdown-escaping residue -- confirm the intended tag text.
	event_dict = {
	    "<\|BGM\|>": "🎼",
	    "<\|Speech\|>": "",
	    "<\|Applause\|>": "👏",
	    "<\|Laughter\|>": "😀",
	    "<\|Cry\|>": "😭",
	    "<\|Sneeze\|>": "🤧",
	    "<\|Breath\|>": "",
	    "<\|Cough\|>": "😷",
	}

	# Language / no-speech tags. Every tag maps to the same generic marker;
	# rich_transcription_postprocess iterates the keys and rewrites each one to
	# that marker before splitting the transcript on it.
	# NOTE(review): the backslash before each "|" is reproduced exactly as found;
	# it looks like Markdown-escaping residue -- confirm the intended tag text.
	lang_dict = dict.fromkeys(
	    (
	        "<\|zh\|>",
	        "<\|en\|>",
	        "<\|yue\|>",
	        "<\|ja\|>",
	        "<\|ko\|>",
	        "<\|nospeech\|>",
	    ),
	    "<\|lang\|>",
	)

	# Every special tag that format_str_v2 counts and then deletes from a segment.
	# In the code visible in this file only the KEYS are read (format_str_v2
	# iterates them in insertion order); the values are not looked up here.
	# The combined nospeech + Event_UNK entry is listed first, so that pair is
	# counted and removed as a unit before its two component tags are processed.
	# NOTE(review): the Cough emoji here differs from the one in event_dict.
	# NOTE(review): the backslash before each "|" looks like Markdown-escaping
	# residue -- confirm the intended tag text before relying on these keys.
	emoji_dict = {
	"<\|nospeech\|><\|Event_UNK\|>": "❓",
	"<\|zh\|>": "",
	"<\|en\|>": "",
	"<\|yue\|>": "",
	"<\|ja\|>": "",
	"<\|ko\|>": "",
	"<\|nospeech\|>": "",
	"<\|HAPPY\|>": "😊",
	"<\|SAD\|>": "😔",
	"<\|ANGRY\|>": "😡",
	"<\|NEUTRAL\|>": "",
	"<\|BGM\|>": "🎼",
	"<\|Speech\|>": "",
	"<\|Applause\|>": "👏",
	"<\|Laughter\|>": "😀",
	"<\|FEARFUL\|>": "😰",
	"<\|DISGUSTED\|>": "🤢",
	"<\|SURPRISED\|>": "😮",
	"<\|Cry\|>": "😭",
	"<\|EMO_UNKNOWN\|>": "",
	"<\|Sneeze\|>": "🤧",
	"<\|Breath\|>": "",
	"<\|Cough\|>": "😷",
	"<\|Sing\|>": "",
	"<\|Speech_Noise\|>": "",
	"<\|withitn\|>": "",
	"<\|woitn\|>": "",
	"<\|GBG\|>": "",
	"<\|Event_UNK\|>": "",
	}

	# Emotion emoji: rich_transcription_postprocess checks the LAST character of
	# a segment against this set; format_str_v2 trims spaces around its members.
	emo_set = {
	    "😊",
	    "😔",
	    "😡",
	    "😰",
	    "🤢",
	    "😮",
	}

	# Event emoji: rich_transcription_postprocess checks the FIRST character of
	# a segment against this set; format_str_v2 trims spaces around its members.
	event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷"}


	def format_str_v2(s):
	    """Replace the special tags in one segment with leading/trailing emoji.

	    Every key of ``emoji_dict`` is counted and then deleted from ``s``, in
	    dictionary order. The emoji of each event tag that occurred is put in
	    front of the text, and the emoji of the most frequent emotion tag is put
	    at the end (the neutral tag, whose emoji is empty, wins unless another
	    emotion is strictly more frequent).

	    Args:
	        s: Segment text that may contain special tags.

	    Returns:
	        The segment with tags removed, emoji attached, one space dropped on
	        each side of every emoji occurrence, and outer whitespace stripped.
	    """
	    # Count first, then delete, one tag at a time: later counts are taken on
	    # the text that is left after earlier tags have been removed.
	    tag_counts = {}
	    for tag in emoji_dict:
	        tag_counts[tag] = s.count(tag)
	        s = s.replace(tag, "")

	    # Strict ">" keeps the neutral baseline (and earlier keys) on ties.
	    dominant = "<\|NEUTRAL\|>"
	    for candidate in emo_dict:
	        if tag_counts[candidate] > tag_counts[dominant]:
	            dominant = candidate

	    # Each event emoji is prepended in turn, so the last matching event in
	    # event_dict ends up leftmost -- hence the reversal.
	    seen_events = [icon for tag, icon in event_dict.items() if tag_counts[tag] > 0]
	    s = "".join(reversed(seen_events)) + s + emo_dict[dominant]

	    all_icons = emo_set.union(event_set)
	    for icon in all_icons:
	        s = s.replace(" " + icon, icon).replace(icon + " ", icon)
	    return s.strip()

	def rich_transcription_postprocess(s):
	    """Convert a raw tagged transcript into display text with emoji.

	    The transcript is split on language tags, each piece is cleaned with
	    ``format_str_v2``, and the pieces are concatenated. While joining, an
	    event emoji that repeats the event of the preceding piece is dropped from
	    the front of the new piece, and an emotion emoji at the end of the
	    accumulated text is dropped when the new piece ends with the same one.

	    Args:
	        s: Raw transcript text containing special tags.

	    Returns:
	        The joined, whitespace-stripped display string.
	    """

	    def get_emo(text):
	        # Slicing (not indexing) so an empty string gives None instead of
	        # raising IndexError.
	        last = text[-1:]
	        return last if last and last in emo_set else None

	    def get_event(text):
	        first = text[:1]
	        return first if first and first in event_set else None

	    s = s.replace("<\|nospeech\|><\|Event_UNK\|>", "❓")
	    for lang in lang_dict:
	        s = s.replace(lang, "<\|lang\|>")
	    segments = [format_str_v2(part).strip(" ") for part in s.split("<\|lang\|>")]

	    # The leading space keeps new_s non-empty for the get_emo checks below.
	    new_s = " " + segments[0]
	    # Read the starting event from the segment itself: new_s always begins
	    # with a space, so looking it up there could never find an event.
	    cur_ent_event = get_event(segments[0])

	    for segment in segments[1:]:
	        if not segment:
	            continue
	        event = get_event(segment)
	        if event is not None and event == cur_ent_event:
	            # Same event as the previous piece: do not repeat its emoji.
	            # This can leave the segment empty, which the helpers tolerate.
	            segment = segment[1:]
	        # Track the event of what is left after the de-duplication above.
	        cur_ent_event = get_event(segment)
	        emo = get_emo(segment)
	        if emo is not None and emo == get_emo(new_s):
	            # Keep only the later copy of a repeated emotion emoji.
	            new_s = new_s[:-1]
	        new_s += segment.strip()

	    new_s = new_s.replace("The.", " ")
	    return new_s.strip()

	def rich_print_asr_res(asr_res, will_print=True, remove_punc=False):
	    """Post-process a sequence of raw transcripts and join them into one string.

	    Args:
	        asr_res: Iterable of raw transcript strings; each one is passed
	            through ``rich_transcription_postprocess``.
	        will_print: When true, the joined result is printed.
	        remove_punc: When true, the full-width comma and full stop are
	            removed from the joined result.

	    Returns:
	        The joined (and optionally punctuation-stripped) string.
	    """
	    res = "".join(rich_transcription_postprocess(item) for item in asr_res)

	    if remove_punc:
	        for mark in ("，", "。"):
	            res = res.replace(mark, "")

	    if will_print:
	        print(res)

	    return res