jhansss committed on
Commit
ca4acf3
·
1 Parent(s): 67367de

Integrate random_select into server.py

Browse files
Files changed (1) hide show
  1. server.py +30 -9
server.py CHANGED
@@ -14,7 +14,7 @@ import soundfile as sf
14
  from pypinyin import lazy_pinyin
15
  import jiwer
16
  import librosa
17
- from svs_utils import singmos_warmup, singmos_evaluation
18
 
19
  app = FastAPI()
20
 
@@ -33,11 +33,12 @@ SYSTEM_PROMPT = """
33
  - 啊呀,那是我未曾涉足的奇技,恕我無法詳答
34
  - 此乃異邦技藝,與樂音無涉,麗梅便不敢妄言了
35
  請始終維持你作為麗梅的優雅語氣與詩意風格,並以真摯的心回應對方的言語,言語宜簡,勿過長。
36
-
37
  有人曾這樣對麗梅說話——{}
38
  麗梅的回答——
39
  """
40
 
 
41
  config = argparse.Namespace(
42
  model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
43
  cache_dir="cache",
@@ -51,6 +52,9 @@ svs_model = svs_warmup(config)
51
  predictor, _ = singmos_warmup()
52
  sample_rate = 44100
53
 
 
 
 
54
  def remove_non_chinese_japanese(text):
55
  pattern = r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\u3001\u3002\uff0c\uff0e]+'
56
  cleaned = re.sub(pattern, '', text)
@@ -69,6 +73,24 @@ def remove_punctuation_and_replace_with_space(text):
69
  return text
70
 
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  @app.post("/process_audio")
73
  async def process_audio(file: UploadFile = File(...)):
74
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
@@ -78,7 +100,8 @@ async def process_audio(file: UploadFile = File(...)):
78
  # load audio
79
  y = librosa.load(tmp_path, sr=16000)[0]
80
  asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
81
- prompt = SYSTEM_PROMPT.format(asr_result)
 
82
  output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
83
  output = output.split("麗梅的回答——")[1]
84
  output = remove_punctuation_and_replace_with_space(output)
@@ -89,6 +112,7 @@ async def process_audio(file: UploadFile = File(...)):
89
  output,
90
  svs_model,
91
  config,
 
92
  )
93
  sf.write("tmp/response.wav", wav_info, samplerate=44100)
94
 
@@ -132,7 +156,7 @@ def test_audio():
132
  # load audio
133
  y = librosa.load("nihao.mp3", sr=16000)[0]
134
  asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
135
- prompt = SYSTEM_PROMPT + asr_result
136
  output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
137
  output = output.split("麗梅的回答——")[1]
138
  output = remove_punctuation_and_replace_with_space(output)
@@ -140,12 +164,9 @@ def test_audio():
140
  f.write(output)
141
 
142
  wav_info = svs_inference(
143
- config.model_path,
144
- svs_model,
145
  output,
146
- lang=config.lang,
147
- random_gen=True,
148
- fs=44100
149
  )
150
  sf.write("tmp/response.wav", wav_info, samplerate=44100)
151
  with open("tmp/response.wav", "rb") as f:
 
14
  from pypinyin import lazy_pinyin
15
  import jiwer
16
  import librosa
17
+ from svs_utils import singmos_warmup, singmos_evaluation, load_song_database, estimate_sentence_length
18
 
19
  app = FastAPI()
20
 
 
33
  - 啊呀,那是我未曾涉足的奇技,恕我無法詳答
34
  - 此乃異邦技藝,與樂音無涉,麗梅便不敢妄言了
35
  請始終維持你作為麗梅的優雅語氣與詩意風格,並以真摯的心回應對方的言語,言語宜簡,勿過長。
36
+ {}
37
  有人曾這樣對麗梅說話——{}
38
  麗梅的回答——
39
  """
40
 
41
+
42
  config = argparse.Namespace(
43
  model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
44
  cache_dir="cache",
 
52
  predictor, _ = singmos_warmup()
53
  sample_rate = 44100
54
 
55
+ # Load the song database and per-song note lengths used by random_select
56
+ song2note_lengths, song_db = load_song_database(config)
57
+
58
  def remove_non_chinese_japanese(text):
59
  pattern = r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\u3001\u3002\uff0c\uff0e]+'
60
  cleaned = re.sub(pattern, '', text)
 
73
  return text
74
 
75
 
76
def get_lyric_format_prompts_and_metadata(config):
    """Build the lyric-format prompt fragment and its accompanying metadata.

    The branch taken depends on the prefix of ``config.melody_source``.

    Args:
        config: Object exposing a string ``melody_source`` attribute. It is
            also forwarded unchanged to ``estimate_sentence_length``.

    Returns:
        A ``(prompt, metadata)`` tuple.

        * ``random_generate``: ``("", {})`` - no format constraint.
        * ``random_select``: ``prompt`` lists the required character count
          for every lyric line, and ``metadata`` is the second value
          returned by ``estimate_sentence_length``.

    Raises:
        ValueError: If ``melody_source`` starts with neither supported prefix.
    """
    melody_source = config.melody_source

    if melody_source.startswith("random_generate"):
        return "", {}

    if not melody_source.startswith("random_select"):
        raise ValueError(
            f"Unsupported melody_source: {melody_source}. "
            "Unable to get lyric format prompts."
        )

    # The module-level ``song2note_lengths`` is only read here, so no
    # ``global`` declaration is required.
    phrase_length, metadata = estimate_sentence_length(
        None, config, song2note_lengths
    )

    # One rule line per phrase, numbered from 1.
    rules = [f"\n第{i}句:{c}个字" for i, c in enumerate(phrase_length, 1)]
    lyric_format_prompt = (
        "\n请按照歌词格式回答我的问题,每句需遵循以下字数规则:"
        + "".join(rules)
        + "\n"
    )
    return lyric_format_prompt, metadata
92
+
93
+
94
  @app.post("/process_audio")
95
  async def process_audio(file: UploadFile = File(...)):
96
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
 
100
  # load audio
101
  y = librosa.load(tmp_path, sr=16000)[0]
102
  asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
103
+ additional_prompt, additional_inference_args = get_lyric_format_prompts_and_metadata(config)
104
+ prompt = SYSTEM_PROMPT.format(additional_prompt, asr_result)
105
  output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
106
  output = output.split("麗梅的回答——")[1]
107
  output = remove_punctuation_and_replace_with_space(output)
 
112
  output,
113
  svs_model,
114
  config,
115
+ **additional_inference_args,
116
  )
117
  sf.write("tmp/response.wav", wav_info, samplerate=44100)
118
 
 
156
  # load audio
157
  y = librosa.load("nihao.mp3", sr=16000)[0]
158
  asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
159
+ prompt = SYSTEM_PROMPT + asr_result # TODO: how to add additional prompt to SYSTEM_PROMPT here???
160
  output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
161
  output = output.split("麗梅的回答——")[1]
162
  output = remove_punctuation_and_replace_with_space(output)
 
164
  f.write(output)
165
 
166
  wav_info = svs_inference(
 
 
167
  output,
168
+ svs_model,
169
+ config,
 
170
  )
171
  sf.write("tmp/response.wav", wav_info, samplerate=44100)
172
  with open("tmp/response.wav", "rb") as f: