ms180 committed on
Commit
0ad68fa
·
1 Parent(s): d347363

Add ASR model for PER

Browse files
Files changed (1) hide show
  1. evaluation/svs_eval.py +35 -1
evaluation/svs_eval.py CHANGED
@@ -4,9 +4,14 @@ import numpy as np
4
  import torch
5
  import uuid
6
  from pathlib import Path
 
7
 
8
  # ----------- Initialization -----------
9
 
 
 
 
 
10
 
11
  def init_singmos():
12
  print("[Init] Loading SingMOS...")
@@ -72,9 +77,38 @@ def compute_dissonance_rate(intervals, dissonant_intervals={1, 2, 6, 10, 11}):
72
  return np.mean(dissonant) if intervals else np.nan
73
 
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  def eval_per(audio_path, model=None):
76
  audio_array, sr = librosa.load(audio_path, sr=16000)
77
- # TODO: implement PER evaluation
 
 
 
 
78
  return {}
79
 
80
 
 
4
  import torch
5
  import uuid
6
  from pathlib import Path
7
+ from transformers import pipeline
8
 
9
  # ----------- Initialization -----------
10
 
11
# Shared Whisper recognizer. It is constructed at module import time, so the
# same pipeline object is reused by every caller in this module.
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
)
15
 
16
  def init_singmos():
17
  print("[Init] Loading SingMOS...")
 
77
  return np.mean(dissonant) if intervals else np.nan
78
 
79
 
80
def pypinyin_g2p_phone_without_prosody(text):
    """Convert text to a flat list of toneless pinyin initials and finals.

    Each item returned by ``pypinyin.pinyin`` (``Style.NORMAL``,
    ``strict=False``) is split with pypinyin's initial/final helpers. An item
    with a non-empty initial contributes two entries (initial, then final);
    an item without one contributes only its final.

    After the initials ``x``/``y``/``j``/``q`` the finals ``un``, ``uan`` and
    ``u`` are respelled ``vn``, ``van`` and ``v``. The final ``ue`` is
    respelled ``ve`` after any non-empty initial.

    Args:
        text: Input string handed to ``pypinyin.pinyin``.

    Returns:
        list: Phone strings, in the order ``pypinyin.pinyin`` yields them.
    """
    # Imported lazily so pypinyin is only required when this is called.
    from pypinyin import Style, pinyin
    # NOTE(review): ``pypinyin.style._utils`` is a private module -- confirm
    # it is available in the pypinyin version this project pins.
    from pypinyin.style._utils import get_finals, get_initials

    u_to_v = {"un": "vn", "uan": "van", "u": "v"}
    result = []
    for candidates in pinyin(text, style=Style.NORMAL, strict=False):
        syllable = candidates[0]
        onset = get_initials(syllable, strict=False)
        rime = get_finals(syllable, strict=False)

        # No initial: only the final is emitted, with no respelling applied.
        if len(onset) == 0:
            result.append(rime)
            continue

        if onset in ["x", "y", "j", "q"]:
            rime = u_to_v.get(rime, rime)
        if rime == "ue":
            rime = "ve"
        result.extend((onset, rime))
    return result
103
+
104
+
105
  def eval_per(audio_path, model=None):
106
  audio_array, sr = librosa.load(audio_path, sr=16000)
107
+ asr_result = asr_pipeline(
108
+ audio_array,
109
+ generate_kwargs={"language": "mandarin"}
110
+ )['text']
111
+ hyp_pinyin = pypinyin_g2p_phone_without_prosody(asr_result)
112
  return {}
113
 
114