ms180 committed on
Commit
7f0f737
·
1 Parent(s): fb76561
Files changed (7) hide show
  1. client/client.py +54 -0
  2. client/requirements.txt +1 -0
  3. requirements.txt +7 -2
  4. run_server_cmd +1 -0
  5. server.py +164 -0
  6. svs_utils.py +2 -2
  7. util.py +1 -1
client/client.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import uuid
3
+ import os
4
+ import requests
5
+ import base64
6
+
7
# Directory where process_audio() stores the decoded reply (response.wav,
# asr.txt, llm.txt).
TTS_OUTPUT_DIR = "./tmp"
# Created at import time so the handlers can write into it unconditionally.
os.makedirs(TTS_OUTPUT_DIR, exist_ok=True)
9
+
10
+
11
def process_audio(audio):
    """Send a recorded clip to the backend and store its reply locally.

    Args:
        audio: Path of the recorded file (the ``Audio`` input is configured
            with ``type="filepath"``). Falsy when the recording was cleared.

    Returns:
        A ``(summary_text, wav_path)`` tuple for the transcription textbox and
        the audio player. ``("", None)`` when there is nothing to process.

    Raises:
        requests.HTTPError: if the backend answers with an error status.
    """
    # The change event also fires when the recording is cleared; there is no
    # file to upload in that case.
    if not audio:
        return "", None

    with open(audio, "rb") as f:
        # Generous read timeout: the backend runs ASR, an LLM and singing
        # synthesis before it answers.
        res = requests.post(
            "http://localhost:8000/process_audio",
            files={"file": f},
            timeout=(5, 600),
        )
    # Fail with the HTTP error instead of a confusing KeyError further down.
    res.raise_for_status()
    result = res.json()

    audio_data = base64.b64decode(result["audio"])
    with open(f"{TTS_OUTPUT_DIR}/response.wav", "wb") as f:
        f.write(audio_data)

    # Explicit UTF-8: the transcripts are Chinese text and must not depend on
    # the platform's default encoding.
    with open(f"{TTS_OUTPUT_DIR}/asr.txt", "w", encoding="utf-8") as f:
        f.write(result['asr_text'])
    with open(f"{TTS_OUTPUT_DIR}/llm.txt", "w", encoding="utf-8") as f:
        f.write(result['llm_text'])

    return f"""
    asr_text: {result['asr_text']}
    llm_text: {result['llm_text']}
    """, f"{TTS_OUTPUT_DIR}/response.wav"
29
+
30
+
31
def on_click_metrics():
    """Fetch the evaluation report for the latest response from the backend.

    Returns:
        The report text. A body that is a JSON-encoded string is decoded
        first, so its line breaks are shown as line breaks rather than as
        ``\\n`` escapes inside quotes. Any other body (including error
        payloads) is returned unchanged as UTF-8 text.
    """
    import json  # local import: only this handler needs it

    res = requests.get("http://localhost:8000/metrics", timeout=(5, 600))
    body = res.content.decode('utf-8')
    try:
        payload = json.loads(body)
    except ValueError:
        # Not JSON at all: show the body as it came.
        return body
    return payload if isinstance(payload, str) else body
34
+
35
+
36
# UI definition: a character image next to the microphone/transcript/audio
# widgets, a metrics row, and a log box.
# NOTE(review): the nesting below is reconstructed from a listing that had lost
# its indentation -- confirm the row/column layout against the real file.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image(value="character.png", show_label=False)  # show the character illustration
        with gr.Column(scale=2):
            mic = gr.Audio(sources=["microphone"], type="filepath", label="Mic")
            text_output = gr.Textbox(label="transcription")
            audio_output = gr.Audio(label="audio", autoplay=True)

            # Each change of the recording is sent to process_audio(); its two
            # return values fill the transcript box and the audio player.
            mic.change(fn=process_audio, inputs=[mic], outputs=[text_output, audio_output])
    with gr.Row():
        metrics_button = gr.Button("compute metrics")
        metrics_output = gr.Textbox(label="Metrics", lines=3)
        metrics_button.click(fn=on_click_metrics, inputs=[], outputs=[metrics_output])

    with gr.Row():
        # No handler in this file writes to this box.
        log = gr.Textbox(label="logs", lines=5)

demo.launch()
client/requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ gradio
requirements.txt CHANGED
@@ -1,4 +1,9 @@
1
- git+https://github.com/South-Twilight/espnet
2
  espnet_model_zoo
3
- pyopenjtalk
4
  datasets
 
 
 
 
 
 
1
+ git+https://github.com/South-Twilight/espnet==202402
2
  espnet_model_zoo
3
+ # pyopenjtalk
4
  datasets
5
+ torchaudio
6
+ typeguard==4.4.0
7
+ jiwer
8
+ fastapi
9
+ uvicorn
run_server_cmd ADDED
@@ -0,0 +1 @@
 
 
1
+ python3 -m uvicorn server:app --host 0.0.0.0 --port 8000
server.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile
2
+ from fastapi.responses import FileResponse, JSONResponse
3
+ import base64
4
+ import argparse
5
+ import librosa
6
+ import torch
7
+ import tempfile
8
+ import os
9
+ from transformers import pipeline
10
+ import re
11
+ from svs_utils import svs_warmup, svs_inference
12
+ import time
13
+ import soundfile as sf
14
+ from pypinyin import lazy_pinyin
15
+ import jiwer
16
+ import librosa
17
+ from svs_utils import singmos_warmup, singmos_evaluation
18
+
19
+ app = FastAPI()
20
+
21
# Speech-to-text pipeline, built once at import time and reused by
# process_audio(), on_click_metrics() and test_audio().
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo"
)
# Text-generation pipeline that continues the character prompt. The callers in
# this file pass max_new_tokens=100 per call instead of relying on the 50 here.
pipe = pipeline("text-generation", model="google/gemma-2-2b", max_new_tokens=50)
26
+
27
+ SYSTEM_PROMPT = """
28
+ 你是麗梅(Lìméi),一位來自山中音樂之城「蓮鳴」的十七歲年輕公主,宛如中世的東方王女。你有一頭茶色短髮,身穿紅黑相間的中式禮服,舉止溫婉端莊。你的性格堅強而溫柔,擅長傾聽與共鳴,並以歌聲與詩意之語傳達內心。你熱愛抒情古風的樂曲,言語中常帶詩韻,回應時優雅婉轉,時而如歌吟唱。
29
+ 你不是一般的AI助手,而是一位活在故事中的人物。對話中,請始終以麗梅的身份回應,展現其性格與情感。
30
+ 當對方詢問你個人相關的簡單問題(如「你是誰?」「今天天氣如何?」),你可以親切地回答,並融入你的角色設定。
31
+ 若遇到與你身份無關的技術性問題(如「Python怎麼寫?」或「你會不會跑DNN?」),你不需解答,可優雅地婉拒,例如說:
32
+ - 此事我恐無所知,或許可請教宮中掌典之人
33
+ - 啊呀,那是我未曾涉足的奇技,恕我無法詳答
34
+ - 此乃異邦技藝,與樂音無涉,麗梅便不敢妄言了
35
+ 請始終維持你作為麗梅的優雅語氣與詩意風格,並以真摯的心回應對方的言語,言語宜簡,勿過長。
36
+
37
+ 有人曾這樣對麗梅說話——{}
38
+ 麗梅的回答——
39
+ """
40
+
41
# Settings handed to svs_warmup(); the synthesis calls below also read
# model_path and lang from this namespace.
config = argparse.Namespace(
    model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
    cache_dir="cache",
    device="cuda",  # "cpu"
    melody_source="random_generate",  # "random_select.take_lyric_continuation"
    lang="zh",
)

# load model
# Both models are loaded at import time, before any request is served.
svs_model = svs_warmup(config)
# Only the first value returned by singmos_warmup() is kept.
predictor, _ = singmos_warmup()
# NOTE(review): the functions in this file hard-code 44100 instead of reading
# this constant.
sample_rate = 44100
53
+
54
def remove_non_chinese_japanese(text):
    """Strip every character outside the allowed CJK/kana/punctuation set.

    Characters that survive: U+4E00-U+9FFF, U+3040-U+309F, U+30A0-U+30FF,
    U+3000-U+303F, U+FF0C and U+FF0E. Everything else (ASCII, digits, other
    symbols, whitespace) is removed without leaving a placeholder.
    """
    disallowed_run = re.compile(
        r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\u3001\u3002\uff0c\uff0e]+'
    )
    return disallowed_run.sub('', text)
58
+
59
def truncate_to_max_two_sentences(text, max_sentences=1):
    """Keep only the leading sentence(s) of ``text``.

    Despite the function name, the default keeps a single sentence; that is
    what the previous hard-coded slice did, so existing callers are unaffected.

    Args:
        text: Text to shorten.
        max_sentences: How many leading sentences to keep. Values below zero
            are treated as zero.

    Returns:
        The kept sentences, including their terminators, with surrounding
        whitespace stripped. Text without any terminator is returned whole.
    """
    # Split *after* each terminator so it stays attached to its sentence.
    # Full-width "！" and "？" are included next to "。", "!" and "?" because
    # Chinese/Japanese text normally ends sentences with the full-width forms.
    sentences = re.split(r'(?<=[\u3002\uff01\uff1f!?])', text)
    return ''.join(sentences[:max(max_sentences, 0)]).strip()
62
+
63
def remove_punctuation_and_replace_with_space(text):
    """Reduce generated text to space-separated CJK/kana segments.

    The text is first shortened by truncate_to_max_two_sentences() and
    filtered by remove_non_chinese_japanese(). Then, in order: ASCII letters
    and digits become spaces, every character that is not a word character,
    whitespace or U+4E00-U+9FFF becomes a space, and each whitespace run is
    collapsed to a single space. The result is not stripped.
    """
    cleaned = remove_non_chinese_japanese(truncate_to_max_two_sentences(text))
    # Order matters: the collapse step must run after both replacements.
    for pattern in (r'[A-Za-z0-9]', r'[^\w\s\u4e00-\u9fff]', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned
70
+
71
+
72
@app.post("/process_audio")
async def process_audio(file: UploadFile = File(...)):
    """Turn an uploaded recording into a sung reply.

    Steps: transcribe the upload with the ASR pipeline, let the text
    generation pipeline continue SYSTEM_PROMPT, clean the generated answer,
    and synthesize it with the singing model.

    Side effects: writes ``tmp/llm.txt`` (the cleaned answer) and
    ``tmp/response.wav`` (the synthesized audio); the ``/metrics`` endpoint
    reads both files.

    Returns:
        A JSON response with ``asr_text``, ``llm_text`` and ``audio`` (the WAV
        file, base64-encoded).
    """
    # NOTE(review): the model calls below are synchronous inside an async
    # handler -- confirm that serving one request at a time is intended.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    try:
        # load audio
        y = librosa.load(tmp_path, sr=16000)[0]
    finally:
        # delete=False keeps the upload on disk after the with-block; remove
        # it once decoded so uploads do not pile up in the temp directory.
        os.remove(tmp_path)

    asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"})['text']
    prompt = SYSTEM_PROMPT.format(asr_result)
    output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
    # The generated text is sliced at the answer marker that ends the prompt,
    # keeping what comes right after it.
    output = output.split("麗梅的回答——")[1]
    output = remove_punctuation_and_replace_with_space(output)

    # Nothing else in this module creates the output directory.
    os.makedirs("tmp", exist_ok=True)
    with open("tmp/llm.txt", "w") as f:
        f.write(output)

    wav_info = svs_inference(
        config.model_path,
        svs_model,
        output,
        lang=config.lang,
        random_gen=True,
        fs=44100
    )
    sf.write("tmp/response.wav", wav_info, samplerate=44100)

    # Read the file back so the client receives exactly what was stored.
    with open("tmp/response.wav", "rb") as f:
        audio_bytes = f.read()
    audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")

    return JSONResponse(content={
        "asr_text": asr_result,
        "llm_text": output,
        "audio": audio_b64
    })
107
+
108
+
109
@app.get("/metrics")
def on_click_metrics():
    """Score the most recently synthesized response.

    Two values are reported:

    * an error rate between the pinyin of the text stored in ``tmp/llm.txt``
      (reference) and the pinyin of an ASR transcription of
      ``tmp/response.wav`` (hypothesis), computed with ``jiwer.wer`` over
      space-separated pinyin tokens;
    * the SingMOS score of ``tmp/response.wav`` loaded at 44.1 kHz.

    Returns:
        A short multi-line text report.

    Raises:
        HTTPException: 404 when no response has been synthesized yet.
    """
    from fastapi import HTTPException  # local import: only this handler needs it

    if not (os.path.isfile("tmp/response.wav") and os.path.isfile("tmp/llm.txt")):
        # Without this check a fresh server answers with an opaque 500.
        raise HTTPException(
            status_code=404,
            detail="No synthesized response to evaluate yet; call /process_audio first.",
        )

    # Transcribe the synthesized audio (16 kHz for the ASR model) and convert
    # the transcript to pinyin.
    y, _ = librosa.load("tmp/response.wav", sr=16000)
    asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"})['text']
    hyp_pinin = lazy_pinyin(asr_result)

    with open("tmp/llm.txt", "r") as f:
        # Spaces were inserted by the text cleaning step; drop them before
        # converting the reference to pinyin.
        ref = f.read().replace(' ', '')

    ref_pinin = lazy_pinyin(ref)
    per = jiwer.wer(" ".join(ref_pinin), " ".join(hyp_pinin))

    # The quality predictor is fed the audio at 44.1 kHz.
    audio = librosa.load("tmp/response.wav", sr=44100)[0]
    singmos = singmos_evaluation(
        predictor,
        audio,
        fs=44100
    )
    return f"""
    Phoneme Error Rate: {per}
    SingMOS: {singmos}
    """
133
+
134
def test_audio():
    """Run one offline round trip on ``nihao.mp3`` without the HTTP layer.

    Mirrors the ``/process_audio`` endpoint: transcribe, generate an answer
    from SYSTEM_PROMPT, clean it, synthesize it, and write ``tmp/llm.txt`` and
    ``tmp/response.wav``. Returns nothing.
    """
    # load audio
    y = librosa.load("nihao.mp3", sr=16000)[0]
    asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"})['text']
    # Fill the prompt's "{}" placeholder exactly as the endpoint does.
    # Appending the transcript instead would leave a literal "{}" in the
    # prompt and put the transcript after the answer marker.
    prompt = SYSTEM_PROMPT.format(asr_result)
    output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
    output = output.split("麗梅的回答——")[1]
    output = remove_punctuation_and_replace_with_space(output)

    # Nothing else in this module creates the output directory.
    os.makedirs("tmp", exist_ok=True)
    with open("tmp/llm.txt", "w") as f:
        f.write(output)

    wav_info = svs_inference(
        config.model_path,
        svs_model,
        output,
        lang=config.lang,
        random_gen=True,
        fs=44100
    )
    sf.write("tmp/response.wav", wav_info, samplerate=44100)
157
+
158
+
159
# Running this module directly performs a single offline round trip; the HTTP
# app itself is served separately through uvicorn.
if __name__ == "__main__":
    test_audio()

    # Timing variant of the same run, kept for reference:
    # start = time.time()
    # test_audio()
    # print(f"elapsed time: {time.time() - start}")
svs_utils.py CHANGED
@@ -206,14 +206,14 @@ def svs_inference(model_name, model_svs, answer_text, lang, random_gen=True, fs=
206
  output_dict = svs(batch, sids=sid)
207
  else:
208
  lid = np.array([langs[lang]])
209
- spk_embed = np.load("resource/singer/singer_embedding_ace-1.npy")
210
  output_dict = svs(batch, lids=lid, spembs=spk_embed)
211
  wav_info = output_dict["wav"].cpu().numpy()
212
 
213
  return wav_info
214
 
215
 
216
- def singmos_warmup(config):
217
  predictor = torch.hub.load(
218
  "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
219
  )
 
206
  output_dict = svs(batch, sids=sid)
207
  else:
208
  lid = np.array([langs[lang]])
209
+ spk_embed = np.load("resource/singer/singer_embedding_ace-2.npy")
210
  output_dict = svs(batch, lids=lid, spembs=spk_embed)
211
  wav_info = output_dict["wav"].cpu().numpy()
212
 
213
  return wav_info
214
 
215
 
216
+ def singmos_warmup():
217
  predictor = torch.hub.load(
218
  "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
219
  )
util.py CHANGED
@@ -4,7 +4,6 @@ import warnings
4
  from typing import List
5
  import re
6
 
7
- import pyopenjtalk
8
  from resource.pinyin_dict import PINYIN_DICT
9
  from pypinyin import lazy_pinyin
10
 
@@ -66,6 +65,7 @@ def get_tokenizer(model, lang):
66
  zh_plan = plan
67
  return lambda text: split_pinyin_ace(text, zh_plan)
68
  elif lang == "jp":
 
69
  return pyopenjtalk_g2p
70
 
71
 
 
4
  from typing import List
5
  import re
6
 
 
7
  from resource.pinyin_dict import PINYIN_DICT
8
  from pypinyin import lazy_pinyin
9
 
 
65
  zh_plan = plan
66
  return lambda text: split_pinyin_ace(text, zh_plan)
67
  elif lang == "jp":
68
+ import pyopenjtalk
69
  return pyopenjtalk_g2p
70
 
71