Spaces:
Sleeping
Sleeping
Add demo and run script
Browse files
- character.png +3 -0
- client.py +58 -0
- path.sh +3 -0
- run_server.sh +14 -0
- server.py +29 -20
- svs_utils.py +6 -6
character.png
ADDED
|
Git LFS Details
|
client.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import uuid
|
| 3 |
+
import os
|
| 4 |
+
import requests
|
| 5 |
+
import base64
|
| 6 |
+
from server import (
|
| 7 |
+
on_click_metrics as server_metrics,
|
| 8 |
+
process_audio as server_process_audio
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
TTS_OUTPUT_DIR = "./tmp"
|
| 12 |
+
os.makedirs(TTS_OUTPUT_DIR, exist_ok=True)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def process_audio(audio_path):
    """Run the dialogue pipeline on a recording and stage its outputs on disk.

    Calls the in-process ``server_process_audio`` pipeline, then writes the
    synthesized audio and the ASR / LLM texts under ``TTS_OUTPUT_DIR``.

    Args:
        audio_path: Path of the recorded audio file, as delivered by the
            microphone component (``type="filepath"``). May be ``None`` or
            empty when there is no recording.

    Returns:
        A ``(text, wav_path)`` pair: a short summary holding the ASR and LLM
        texts, and the path of the response WAV that was written. When
        ``audio_path`` is empty, ``("", None)`` is returned and the pipeline
        is not run.
    """
    # This handler is bound to the microphone's change event, which can fire
    # without a recording (e.g. after the input is cleared). There is nothing
    # to transcribe in that case, so skip the pipeline instead of failing.
    if not audio_path:
        return "", None

    result = server_process_audio(audio_path)

    # The pipeline returns the synthesized audio base64-encoded.
    audio_data = base64.b64decode(result["audio"])
    with open(f"{TTS_OUTPUT_DIR}/response.wav", "wb") as f:
        f.write(audio_data)

    # Persist both texts next to the audio.
    with open(f"{TTS_OUTPUT_DIR}/asr.txt", "w") as f:
        f.write(result['asr_text'])
    with open(f"{TTS_OUTPUT_DIR}/llm.txt", "w") as f:
        f.write(result['llm_text'])

    return f"""
asr_text: {result['asr_text']}
llm_text: {result['llm_text']}
""", f"{TTS_OUTPUT_DIR}/response.wav"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def on_click_metrics():
    """Return the evaluation metrics report as text for the Metrics box.

    Returns:
        The report produced by ``server_metrics`` as a ``str``.
    """
    res = server_metrics()
    # server_metrics is imported from server and called in-process, where it
    # returns the formatted report string directly. A str has no ``.content``,
    # so only response-like objects carrying a bytes payload are decoded.
    payload = getattr(res, "content", None)
    if isinstance(payload, bytes):
        return payload.decode('utf-8')
    return res
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# Demo UI: a character image beside the microphone input, the transcription
# box and the auto-playing audio output; below them a metrics button with its
# output box, and a log box.
with gr.Blocks() as demo:
|
| 40 |
+
with gr.Row():
|
| 41 |
+
with gr.Column(scale=1):
|
| 42 |
+
gr.Image(value="character.png", show_label=False) # display the character illustration
|
| 43 |
+
with gr.Column(scale=2):
|
| 44 |
+
mic = gr.Audio(sources=["microphone"], type="filepath", label="Mic")
|
| 45 |
+
text_output = gr.Textbox(label="transcription")
|
| 46 |
+
audio_output = gr.Audio(label="audio", autoplay=True)
|
| 47 |
+
|
| 48 |
+
# Run the pipeline whenever the microphone value changes.
mic.change(fn=process_audio, inputs=[mic], outputs=[text_output, audio_output])
|
| 49 |
+
with gr.Row():
|
| 50 |
+
metrics_button = gr.Button("compute metrics")
|
| 51 |
+
metrics_output = gr.Textbox(label="Metrics", lines=3)
|
| 52 |
+
metrics_button.click(fn=on_click_metrics, inputs=[], outputs=[metrics_output])
|
| 53 |
+
|
| 54 |
+
with gr.Row():
|
| 55 |
+
log = gr.Textbox(label="logs", lines=5) # not connected to any event handler in this file
|
| 56 |
+
|
| 57 |
+
# share=True asks Gradio to also create a public share link.
demo.launch(share=True)
|
| 58 |
+
# demo.launch()
|
path.sh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
. ~/workspace/SingingSDS/activate_python.sh
|
run_server.sh
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
#SBATCH -N 1
|
| 3 |
+
#SBATCH -p general
|
| 4 |
+
#SBATCH --gres=gpu:1
|
| 5 |
+
#SBATCH -t 48:00:00
|
| 6 |
+
#SBATCH --ntasks-per-node=1
|
| 7 |
+
#SBATCH --cpus-per-task=4
|
| 8 |
+
#SBATCH --mem=16G
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
. path.sh
|
| 12 |
+
. ../path.sh
|
| 13 |
+
|
| 14 |
+
python client.py
|
server.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
| 1 |
-
from fastapi import FastAPI, File, UploadFile
|
| 2 |
-
from fastapi.responses import FileResponse, JSONResponse
|
| 3 |
import base64
|
| 4 |
import argparse
|
| 5 |
import librosa
|
|
@@ -16,7 +14,6 @@ import jiwer
|
|
| 16 |
import librosa
|
| 17 |
from svs_utils import singmos_warmup, singmos_evaluation, load_song_database, estimate_sentence_length
|
| 18 |
|
| 19 |
-
app = FastAPI()
|
| 20 |
|
| 21 |
asr_pipeline = pipeline(
|
| 22 |
"automatic-speech-recognition",
|
|
@@ -43,14 +40,15 @@ config = argparse.Namespace(
|
|
| 43 |
model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
|
| 44 |
cache_dir="cache",
|
| 45 |
device="cuda", # "cpu"
|
| 46 |
-
melody_source="
|
| 47 |
lang="zh",
|
|
|
|
| 48 |
)
|
| 49 |
|
| 50 |
# load model
|
| 51 |
svs_model = svs_warmup(config)
|
| 52 |
predictor, _ = singmos_warmup()
|
| 53 |
-
sample_rate =
|
| 54 |
|
| 55 |
# load dataset for random_select
|
| 56 |
song2note_lengths, song_db = load_song_database(config)
|
|
@@ -71,33 +69,40 @@ def remove_punctuation_and_replace_with_space(text):
|
|
| 71 |
text = re.sub(r'[A-Za-z0-9]', ' ', text)
|
| 72 |
text = re.sub(r'[^\w\s\u4e00-\u9fff]', ' ', text)
|
| 73 |
text = re.sub(r'\s+', ' ', text)
|
|
|
|
| 74 |
return text
|
| 75 |
|
| 76 |
|
| 77 |
def get_lyric_format_prompts_and_metadata(config):
|
|
|
|
| 78 |
if config.melody_source.startswith("random_generate"):
|
| 79 |
return "", {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
elif config.melody_source.startswith("random_select"):
|
| 81 |
# get song_name and phrase_length
|
| 82 |
-
global song2note_lengths
|
| 83 |
phrase_length, metadata = estimate_sentence_length(
|
| 84 |
None, config, song2note_lengths
|
| 85 |
)
|
| 86 |
lyric_format_prompt = (
|
| 87 |
"\n请按照歌词格式回答我的问题,每句需遵循以下字数规则:"
|
| 88 |
-
+ "".join(
|
| 89 |
+ "\n如果没有足够的信息回答,请使用最少的句子,不要重复、不要扩展、不要加入无关内容。\n"
|
| 90 |
)
|
| 91 |
-
|
|
|
|
| 92 |
else:
|
| 93 |
raise ValueError(f"Unsupported melody_source: {config.melody_source}. Unable to get lyric format prompts.")
|
| 94 |
|
| 95 |
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
tmp_path = tmp.name
|
| 101 |
|
| 102 |
# load audio
|
| 103 |
y = librosa.load(tmp_path, sr=16000)[0]
|
|
@@ -116,20 +121,24 @@ async def process_audio(file: UploadFile = File(...)):
|
|
| 116 |
config,
|
| 117 |
**additional_inference_args,
|
| 118 |
)
|
| 119 |
-
sf.write("tmp/response.wav", wav_info, samplerate=
|
| 120 |
|
| 121 |
with open("tmp/response.wav", "rb") as f:
|
| 122 |
audio_bytes = f.read()
|
| 123 |
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
|
| 124 |
|
| 125 |
-
return
|
| 126 |
"asr_text": asr_result,
|
| 127 |
"llm_text": output,
|
| 128 |
"audio": audio_b64
|
| 129 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
|
| 132 |
-
@app.get("/metrics")
|
| 133 |
def on_click_metrics():
|
| 134 |
global predictor
|
| 135 |
# OWSM ctc + PER
|
|
@@ -143,11 +152,11 @@ def on_click_metrics():
|
|
| 143 |
ref_pinin = lazy_pinyin(ref)
|
| 144 |
per = jiwer.wer(" ".join(ref_pinin), " ".join(hyp_pinin))
|
| 145 |
|
| 146 |
-
audio = librosa.load(f"tmp/response.wav", sr=
|
| 147 |
singmos = singmos_evaluation(
|
| 148 |
predictor,
|
| 149 |
audio,
|
| 150 |
-
fs=
|
| 151 |
)
|
| 152 |
return f"""
|
| 153 |
Phoneme Error Rate: {per}
|
|
@@ -170,7 +179,7 @@ def test_audio():
|
|
| 170 |
svs_model,
|
| 171 |
config,
|
| 172 |
)
|
| 173 |
-
sf.write("tmp/response.wav", wav_info, samplerate=
|
| 174 |
with open("tmp/response.wav", "rb") as f:
|
| 175 |
audio_bytes = f.read()
|
| 176 |
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
|
|
|
|
|
|
|
|
|
|
| 1 |
import base64
|
| 2 |
import argparse
|
| 3 |
import librosa
|
|
|
|
| 14 |
import librosa
|
| 15 |
from svs_utils import singmos_warmup, singmos_evaluation, load_song_database, estimate_sentence_length
|
| 16 |
|
|
|
|
| 17 |
|
| 18 |
asr_pipeline = pipeline(
|
| 19 |
"automatic-speech-recognition",
|
|
|
|
| 40 |
model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
|
| 41 |
cache_dir="cache",
|
| 42 |
device="cuda", # "cpu"
|
| 43 |
+
melody_source="random_select.touhou", # "random_select.take_lyric_continuation"
|
| 44 |
lang="zh",
|
| 45 |
+
speaker="resource/singer/singer_embedding_ace-2.npy",
|
| 46 |
)
|
| 47 |
|
| 48 |
# load model
|
| 49 |
svs_model = svs_warmup(config)
|
| 50 |
predictor, _ = singmos_warmup()
|
| 51 |
+
sample_rate = 48000
|
| 52 |
|
| 53 |
# load dataset for random_select
|
| 54 |
song2note_lengths, song_db = load_song_database(config)
|
|
|
|
| 69 |
text = re.sub(r'[A-Za-z0-9]', ' ', text)
|
| 70 |
text = re.sub(r'[^\w\s\u4e00-\u9fff]', ' ', text)
|
| 71 |
text = re.sub(r'\s+', ' ', text)
|
| 72 |
+
text = " ".join(text.split()[:2])
|
| 73 |
return text
|
| 74 |
|
| 75 |
|
| 76 |
def get_lyric_format_prompts_and_metadata(config):
    """Build the lyric-format prompt and extra inference arguments.

    Args:
        config: Settings object. ``config.melody_source`` selects the branch,
            and the whole object is forwarded to ``estimate_sentence_length``.

    Returns:
        A ``(prompt, kwargs)`` pair. For ``random_generate*`` sources this is
        ``("", {})``. For ``random_select.touhou*`` the prompt is empty and
        ``kwargs`` holds ``song_db`` and ``metadata``. For any other
        ``random_select*`` source the prompt lists the required character
        count of each line, with the same ``kwargs``.

    Raises:
        ValueError: If ``config.melody_source`` matches none of the above.
    """
    source = config.melody_source

    if source.startswith("random_generate"):
        return "", {}
    if not source.startswith("random_select"):
        raise ValueError(f"Unsupported melody_source: {config.melody_source}. Unable to get lyric format prompts.")

    # Every random_select variant needs the phrase lengths / metadata lookup.
    phrase_length, metadata = estimate_sentence_length(
        None, config, song2note_lengths
    )
    inference_kwargs = {"song_db": song_db, "metadata": metadata}

    # The touhou variant uses no lyric-format prompt.
    if source.startswith("random_select.touhou"):
        return "", inference_kwargs

    per_line_rules = "".join(
        f"\n第{idx}句:{count}个字" for idx, count in enumerate(phrase_length, 1)
    )
    lyric_format_prompt = (
        "\n请按照歌词格式回答我的问题,每句需遵循以下字数规则:"
        + per_line_rules
        + "\n如果没有足够的信息回答,请使用最少的句子,不要重复、不要扩展、不要加入无关内容。\n"
    )
    return lyric_format_prompt, inference_kwargs
|
| 100 |
|
| 101 |
|
| 102 |
+
def process_audio(tmp_path):
|
| 103 |
+
# with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
|
| 104 |
+
# tmp.write(await file.read())
|
| 105 |
+
# tmp_path = tmp.name
|
|
|
|
| 106 |
|
| 107 |
# load audio
|
| 108 |
y = librosa.load(tmp_path, sr=16000)[0]
|
|
|
|
| 121 |
config,
|
| 122 |
**additional_inference_args,
|
| 123 |
)
|
| 124 |
+
sf.write("tmp/response.wav", wav_info, samplerate=sample_rate)
|
| 125 |
|
| 126 |
with open("tmp/response.wav", "rb") as f:
|
| 127 |
audio_bytes = f.read()
|
| 128 |
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
|
| 129 |
|
| 130 |
+
return {
|
| 131 |
"asr_text": asr_result,
|
| 132 |
"llm_text": output,
|
| 133 |
"audio": audio_b64
|
| 134 |
+
}
|
| 135 |
+
# return JSONResponse(content={
|
| 136 |
+
# "asr_text": asr_result,
|
| 137 |
+
# "llm_text": output,
|
| 138 |
+
# "audio": audio_b64
|
| 139 |
+
# })
|
| 140 |
|
| 141 |
|
|
|
|
| 142 |
def on_click_metrics():
|
| 143 |
global predictor
|
| 144 |
# OWSM ctc + PER
|
|
|
|
| 152 |
ref_pinin = lazy_pinyin(ref)
|
| 153 |
per = jiwer.wer(" ".join(ref_pinin), " ".join(hyp_pinin))
|
| 154 |
|
| 155 |
+
audio = librosa.load(f"tmp/response.wav", sr=sample_rate)[0]
|
| 156 |
singmos = singmos_evaluation(
|
| 157 |
predictor,
|
| 158 |
audio,
|
| 159 |
+
fs=sample_rate
|
| 160 |
)
|
| 161 |
return f"""
|
| 162 |
Phoneme Error Rate: {per}
|
|
|
|
| 179 |
svs_model,
|
| 180 |
config,
|
| 181 |
)
|
| 182 |
+
sf.write("tmp/response.wav", wav_info, samplerate=sample_rate)
|
| 183 |
with open("tmp/response.wav", "rb") as f:
|
| 184 |
audio_bytes = f.read()
|
| 185 |
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
|
svs_utils.py
CHANGED
|
@@ -324,8 +324,8 @@ def load_list_from_json(json_path):
|
|
| 324 |
data = [
|
| 325 |
{
|
| 326 |
"tempo": d["tempo"],
|
| 327 |
-
"note_start_times": [n[0] * (
|
| 328 |
-
"note_end_times": [n[1] * (
|
| 329 |
"note_lyrics": ["" for n in d["score"]],
|
| 330 |
"note_midi": [n[2] for n in d["score"]],
|
| 331 |
}
|
|
@@ -348,8 +348,8 @@ def song_segment_iterator(song_db, metadata):
|
|
| 348 |
elif song_name.startswith("touhou"):
|
| 349 |
# return a iterator that load from touhou musics
|
| 350 |
data = load_list_from_json("data/touhou/note_data.json")
|
| 351 |
-
|
| 352 |
-
yield
|
| 353 |
else:
|
| 354 |
raise NotImplementedError(f"song name {song_name} not supported")
|
| 355 |
|
|
@@ -380,7 +380,7 @@ if __name__ == "__main__":
|
|
| 380 |
cache_dir="cache",
|
| 381 |
device="cuda", # "cpu"
|
| 382 |
melody_source="random_select.touhou", #"random_generate" "random_select.take_lyric_continuation", "random_select.touhou"
|
| 383 |
-
lang="
|
| 384 |
speaker="resource/singer/singer_embedding_ace-2.npy",
|
| 385 |
)
|
| 386 |
|
|
@@ -390,7 +390,7 @@ if __name__ == "__main__":
|
|
| 390 |
if config.lang == "zh":
|
| 391 |
answer_text = "天气真好\n空气清新\n气温温和\n风和日丽\n天高气爽\n阳光明媚"
|
| 392 |
elif config.lang == "jp":
|
| 393 |
-
answer_text = "
|
| 394 |
else:
|
| 395 |
print(f"Currently system does not support {config.lang}")
|
| 396 |
exit(1)
|
|
|
|
| 324 |
data = [
|
| 325 |
{
|
| 326 |
"tempo": d["tempo"],
|
| 327 |
+
"note_start_times": [n[0] * (100/d["tempo"]) for n in d["score"]],
|
| 328 |
+
"note_end_times": [n[1] * (100/d["tempo"]) for n in d["score"]],
|
| 329 |
"note_lyrics": ["" for n in d["score"]],
|
| 330 |
"note_midi": [n[2] for n in d["score"]],
|
| 331 |
}
|
|
|
|
| 348 |
elif song_name.startswith("touhou"):
|
| 349 |
# return a iterator that load from touhou musics
|
| 350 |
data = load_list_from_json("data/touhou/note_data.json")
|
| 351 |
+
while True:
|
| 352 |
+
yield random.choice(data)
|
| 353 |
else:
|
| 354 |
raise NotImplementedError(f"song name {song_name} not supported")
|
| 355 |
|
|
|
|
| 380 |
cache_dir="cache",
|
| 381 |
device="cuda", # "cpu"
|
| 382 |
melody_source="random_select.touhou", #"random_generate" "random_select.take_lyric_continuation", "random_select.touhou"
|
| 383 |
+
lang="zh",
|
| 384 |
speaker="resource/singer/singer_embedding_ace-2.npy",
|
| 385 |
)
|
| 386 |
|
|
|
|
| 390 |
if config.lang == "zh":
|
| 391 |
answer_text = "天气真好\n空气清新\n气温温和\n风和日丽\n天高气爽\n阳光明媚"
|
| 392 |
elif config.lang == "jp":
|
| 393 |
+
answer_text = "流れてく時の中ででもけだるさが"
|
| 394 |
else:
|
| 395 |
print(f"Currently system does not support {config.lang}")
|
| 396 |
exit(1)
|