Push demo
- client/client.py +54 -0
- client/requirements.txt +1 -0
- requirements.txt +7 -2
- run_server_cmd +1 -0
- server.py +164 -0
- svs_utils.py +2 -2
- util.py +1 -1
client/client.py
ADDED
@@ -0,0 +1,54 @@
import gradio as gr
import uuid
import os
import requests
import base64

TTS_OUTPUT_DIR = "./tmp"
os.makedirs(TTS_OUTPUT_DIR, exist_ok=True)


def process_audio(audio):
    with open(audio, "rb") as f:
        res = requests.post("http://localhost:8000/process_audio", files={"file": f})
    result = res.json()

    audio_data = base64.b64decode(result["audio"])
    with open(f"{TTS_OUTPUT_DIR}/response.wav", "wb") as f:
        f.write(audio_data)

    with open(f"{TTS_OUTPUT_DIR}/asr.txt", "w") as f:
        f.write(result['asr_text'])
    with open(f"{TTS_OUTPUT_DIR}/llm.txt", "w") as f:
        f.write(result['llm_text'])

    return f"""
asr_text: {result['asr_text']}
llm_text: {result['llm_text']}
""", f"{TTS_OUTPUT_DIR}/response.wav"


def on_click_metrics():
    res = requests.get("http://localhost:8000/metrics")
    return res.content.decode('utf-8')


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image(value="character.png", show_label=False)  # show the character portrait
        with gr.Column(scale=2):
            mic = gr.Audio(sources=["microphone"], type="filepath", label="Mic")
            text_output = gr.Textbox(label="transcription")
            audio_output = gr.Audio(label="audio", autoplay=True)

    mic.change(fn=process_audio, inputs=[mic], outputs=[text_output, audio_output])
    with gr.Row():
        metrics_button = gr.Button("compute metrics")
        metrics_output = gr.Textbox(label="Metrics", lines=3)
        metrics_button.click(fn=on_click_metrics, inputs=[], outputs=[metrics_output])

    with gr.Row():
        log = gr.Textbox(label="logs", lines=5)

demo.launch()
client/requirements.txt
ADDED
@@ -0,0 +1 @@
gradio
requirements.txt
CHANGED
@@ -1,4 +1,9 @@
-git+https://github.com/South-Twilight/espnet
+git+https://github.com/South-Twilight/espnet==202402
 espnet_model_zoo
-pyopenjtalk
+# pyopenjtalk
 datasets
+torchaudio
+typeguard==4.4.0
+jiwer
+fastapi
+uvicorn
run_server_cmd
ADDED
@@ -0,0 +1 @@
python3 -m uvicorn server:app --host 0.0.0.0 --port 8000
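For a quick smoke test without the Gradio client, the two endpoints can be exercised directly with requests. This is a minimal sketch, assuming the server above is running on localhost:8000 and that "input.wav" is any short Mandarin recording; the JSON field names match the /process_audio response defined in server.py below.

import base64
import requests

# Send a recording through the ASR -> LLM -> SVS pipeline.
with open("input.wav", "rb") as f:
    res = requests.post("http://localhost:8000/process_audio", files={"file": f})
result = res.json()
print("ASR:", result["asr_text"])
print("LLM:", result["llm_text"])

# The synthesized singing voice comes back base64-encoded.
with open("reply.wav", "wb") as f:
    f.write(base64.b64decode(result["audio"]))

# Metrics are computed over the most recent response.
print(requests.get("http://localhost:8000/metrics").text)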
server.py
ADDED
@@ -0,0 +1,164 @@
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import FileResponse, JSONResponse
import base64
import argparse
import librosa
import torch
import tempfile
import os
from transformers import pipeline
import re
from svs_utils import svs_warmup, svs_inference
import time
import soundfile as sf
from pypinyin import lazy_pinyin
import jiwer
from svs_utils import singmos_warmup, singmos_evaluation

app = FastAPI()

asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo"
)
pipe = pipeline("text-generation", model="google/gemma-2-2b", max_new_tokens=50)

# Character prompt (Traditional Chinese). It casts the model as Lìméi, a
# seventeen-year-old princess from the mountain music city "蓮鳴": stay in
# character, answer simple personal questions warmly, gracefully decline
# technical questions, and keep replies short and poetic. The "{}" slot
# receives the user's transcribed utterance.
SYSTEM_PROMPT = """
你是麗梅(Lìméi),一位來自山中音樂之城「蓮鳴」的十七歲年輕公主,宛如中世的東方王女。你有一頭茶色短髮,身穿紅黑相間的中式禮服,舉止溫婉端莊。你的性格堅強而溫柔,擅長傾聽與共鳴,並以歌聲與詩意之語傳達內心。你熱愛抒情古風的樂曲,言語中常帶詩韻,回應時優雅婉轉,時而如歌吟唱。
你不是一般的AI助手,而是一位活在故事中的人物。對話中,請始終以麗梅的身份回應,展現其性格與情感。
當對方詢問你個人相關的簡單問題(如「你是誰?」「今天天氣如何?」),你可以親切地回答,並融入你的角色設定。
若遇到與你身份無關的技術性問題(如「Python怎麼寫?」或「你會不會跑DNN?」),你不需解答,可優雅地婉拒,例如說:
- 此事我恐無所知,或許可請教宮中掌典之人
- 啊呀,那是我未曾涉足的奇技,恕我無法詳答
- 此乃異邦技藝,與樂音無涉,麗梅便不敢妄言了
請始終維持你作為麗梅的優雅語氣與詩意風格,並以真摯的心回應對方的言語,言語宜簡,勿過長。

有人曾這樣對麗梅說話——{}
麗梅的回答——
"""

config = argparse.Namespace(
    model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
    cache_dir="cache",
    device="cuda",  # or "cpu"
    melody_source="random_generate",  # or "random_select.take_lyric_continuation"
    lang="zh",
)

# load models
svs_model = svs_warmup(config)
predictor, _ = singmos_warmup()
sample_rate = 44100


def remove_non_chinese_japanese(text):
    # Keep only CJK ideographs, kana, and CJK punctuation.
    pattern = r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\u3001\u3002\uff0c\uff0e]+'
    return re.sub(pattern, '', text)


def truncate_to_max_two_sentences(text):
    # NOTE: despite the name, this keeps only the first sentence.
    sentences = re.split(r'(?<=[。!?])', text)
    return ''.join(sentences[:1]).strip()


def remove_punctuation_and_replace_with_space(text):
    text = truncate_to_max_two_sentences(text)
    text = remove_non_chinese_japanese(text)
    text = re.sub(r'[A-Za-z0-9]', ' ', text)
    text = re.sub(r'[^\w\s\u4e00-\u9fff]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text


@app.post("/process_audio")
async def process_audio(file: UploadFile = File(...)):
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    # ASR -> LLM -> SVS pipeline
    y = librosa.load(tmp_path, sr=16000)[0]
    asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"})['text']
    prompt = SYSTEM_PROMPT.format(asr_result)
    output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
    output = output.split("麗梅的回答——")[1]
    output = remove_punctuation_and_replace_with_space(output)
    with open("tmp/llm.txt", "w") as f:
        f.write(output)

    wav_info = svs_inference(
        config.model_path,
        svs_model,
        output,
        lang=config.lang,
        random_gen=True,
        fs=44100
    )
    sf.write("tmp/response.wav", wav_info, samplerate=44100)

    with open("tmp/response.wav", "rb") as f:
        audio_bytes = f.read()
    audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")

    return JSONResponse(content={
        "asr_text": asr_result,
        "llm_text": output,
        "audio": audio_b64
    })


@app.get("/metrics")
def on_click_metrics():
    global predictor
    # Transcribe the synthesized singing and compare it, in pinyin, against
    # the lyric that was actually sent to the SVS model.
    y, sr = librosa.load("tmp/response.wav", sr=16000)
    asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"})['text']
    hyp_pinyin = lazy_pinyin(asr_result)

    with open("tmp/llm.txt", "r") as f:
        ref = f.read().replace(' ', '')

    ref_pinyin = lazy_pinyin(ref)
    per = jiwer.wer(" ".join(ref_pinyin), " ".join(hyp_pinyin))

    audio = librosa.load("tmp/response.wav", sr=44100)[0]
    singmos = singmos_evaluation(
        predictor,
        audio,
        fs=44100
    )
    return f"""
Phoneme Error Rate: {per}
SingMOS: {singmos}
"""


def test_audio():
    y = librosa.load("nihao.mp3", sr=16000)[0]
    asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"})['text']
    prompt = SYSTEM_PROMPT.format(asr_result)
    output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
    output = output.split("麗梅的回答——")[1]
    output = remove_punctuation_and_replace_with_space(output)
    with open("tmp/llm.txt", "w") as f:
        f.write(output)

    wav_info = svs_inference(
        config.model_path,
        svs_model,
        output,
        lang=config.lang,
        random_gen=True,
        fs=44100
    )
    sf.write("tmp/response.wav", wav_info, samplerate=44100)
    with open("tmp/response.wav", "rb") as f:
        audio_bytes = f.read()
    audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")


if __name__ == "__main__":
    test_audio()

    # start = time.time()
    # test_audio()
    # print(f"elapsed time: {time.time() - start}")
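A note on the /metrics endpoint above: the reported "Phoneme Error Rate" is jiwer's word error rate computed over pinyin token sequences. Both the lyric sent to the SVS model (tmp/llm.txt) and Whisper's transcript of the synthesized singing are romanized with lazy_pinyin and compared as space-joined strings. A self-contained illustration of just that computation:

import jiwer
from pypinyin import lazy_pinyin

# Romanize both texts, then run jiwer's WER on the token sequences.
ref = " ".join(lazy_pinyin("你好"))   # "ni hao"
hyp = " ".join(lazy_pinyin("您好"))   # "nin hao"
print(jiwer.wer(ref, hyp))            # 0.5: one of the two tokens differs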
svs_utils.py
CHANGED
@@ -206,14 +206,14 @@ def svs_inference(model_name, model_svs, answer_text, lang, random_gen=True, fs=
         output_dict = svs(batch, sids=sid)
     else:
         lid = np.array([langs[lang]])
-        spk_embed = np.load("resource/singer/singer_embedding_ace-
+        spk_embed = np.load("resource/singer/singer_embedding_ace-2.npy")
         output_dict = svs(batch, lids=lid, spembs=spk_embed)
     wav_info = output_dict["wav"].cpu().numpy()
 
     return wav_info
 
 
-def singmos_warmup(
+def singmos_warmup():
     predictor = torch.hub.load(
         "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
     )
util.py
CHANGED
@@ -4,7 +4,6 @@ import warnings
 from typing import List
 import re
 
-import pyopenjtalk
 from resource.pinyin_dict import PINYIN_DICT
 from pypinyin import lazy_pinyin
 
@@ -66,6 +65,7 @@ def get_tokenizer(model, lang):
         zh_plan = plan
         return lambda text: split_pinyin_ace(text, zh_plan)
     elif lang == "jp":
+        import pyopenjtalk
         return pyopenjtalk_g2p
 
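These two changes work together: pyopenjtalk is commented out of requirements.txt above, and util.py now defers the import to the Japanese branch, so a Chinese-only deployment of this Space never needs the package installed. The pattern in isolation (a sketch with a hypothetical helper, not code from this repo):

def get_g2p(lang):
    """Return a grapheme-to-phoneme function for `lang` (hypothetical helper)."""
    if lang == "jp":
        # Deferred import: only Japanese requests pay the dependency cost,
        # and pyopenjtalk can be absent entirely for other languages.
        import pyopenjtalk
        return pyopenjtalk.g2p
    raise ValueError(f"no g2p backend for {lang!r}")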