Spaces:

jhansss
/

SingingSDS

Sleeping

App Files Files Community

ms180 committed on Apr 20

Commit

e3a6e38

1 Parent(s): 427f657

Add demo and run script

Browse files

Files changed (6) hide show

character.png +3 -0
client.py +58 -0
path.sh +3 -0
run_server.sh +14 -0
server.py +29 -20
svs_utils.py +6 -6

character.png ADDED Viewed

Git LFS Details

SHA256: 38dc2981a0ac817f62d8a87a053285535821041e8ead37e77d871b9bc7b3a82d
Pointer size: 132 Bytes
Size of remote file: 1.78 MB

client.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import gradio as gr
+import uuid
+import os
+import requests
+import base64
+from server import (
+    on_click_metrics as server_metrics,
+    process_audio as server_process_audio
+)
+TTS_OUTPUT_DIR = "./tmp"
+os.makedirs(TTS_OUTPUT_DIR, exist_ok=True)
+def process_audio(audio_path):
+    """Run the in-process server pipeline on a recording and format it for the UI.
+
+    Args:
+        audio_path: Path of the recorded audio file (the mic component is
+            created with type="filepath"). May be None/empty, e.g. when the
+            recording is cleared.
+
+    Returns:
+        A ``(text, wav_path)`` tuple: a text block with the ASR and LLM
+        outputs, and the path of the decoded response audio. Returns
+        ``("", None)`` when there is no audio to process.
+
+    Side effects:
+        Writes response.wav, asr.txt and llm.txt into TTS_OUTPUT_DIR.
+    """
+    # Nothing was recorded (or the recording was cleared): do not call the
+    # pipeline, which would fail trying to load a missing file.
+    if not audio_path:
+        return "", None
+    result = server_process_audio(audio_path)
+    response_path = f"{TTS_OUTPUT_DIR}/response.wav"
+    # The server returns the synthesized audio base64-encoded.
+    with open(response_path, "wb") as f:
+        f.write(base64.b64decode(result["audio"]))
+    # The transcripts can contain non-ASCII text (the server config uses
+    # lang="zh"), so pin the encoding instead of relying on the locale default.
+    with open(f"{TTS_OUTPUT_DIR}/asr.txt", "w", encoding="utf-8") as f:
+        f.write(result['asr_text'])
+    with open(f"{TTS_OUTPUT_DIR}/llm.txt", "w", encoding="utf-8") as f:
+        f.write(result['llm_text'])
+    return f"""
+asr_text: {result['asr_text']}
+llm_text: {result['llm_text']}
+""", response_path
+def on_click_metrics():
+    """Return the metrics report produced by the server as display text.
+
+    server.on_click_metrics is called in-process and returns the formatted
+    report as a str, so there is no HTTP response to unwrap. A response-like
+    object (with a bytes ``content``) is still accepted for compatibility.
+    """
+    res = server_metrics()
+    # Unwrap ``.content`` only when present; a plain str has no such attribute.
+    content = getattr(res, "content", res)
+    if isinstance(content, bytes):
+        return content.decode('utf-8')
+    return content
+# Gradio UI: character image on the left, microphone input and outputs on
+# the right, then a metrics row and a log row.
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Image(value="character.png", show_label=False)  # show the character illustration
+        with gr.Column(scale=2):
+            mic = gr.Audio(sources=["microphone"], type="filepath", label="Mic")
+            text_output = gr.Textbox(label="transcription")
+            audio_output = gr.Audio(label="audio", autoplay=True)
+            # Each change of the mic value runs the pipeline and fills both outputs.
+            mic.change(fn=process_audio, inputs=[mic], outputs=[text_output, audio_output])
+    with gr.Row():
+        metrics_button = gr.Button("compute metrics")
+        metrics_output = gr.Textbox(label="Metrics", lines=3)
+        metrics_button.click(fn=on_click_metrics, inputs=[], outputs=[metrics_output])
+    with gr.Row():
+        # NOTE(review): this textbox is not connected to any event handler here.
+        log = gr.Textbox(label="logs", lines=5)
+# Launched at import time with sharing enabled; the plain launch is kept below.
+demo.launch(share=True)
+# demo.launch()

path.sh ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ #!/bin/bash
2	+
3	+ . ~/workspace/SingingSDS/activate_python.sh

run_server.sh ADDED Viewed

	@@ -0,0 +1,14 @@

+#!/bin/bash
+# SLURM batch script that starts the Gradio demo (client.py).
+# Requested resources: 1 node on the "general" partition, 1 GPU,
+# 48-hour time limit, 1 task per node, 4 CPUs per task, 16G of memory.
+#SBATCH -N 1
+#SBATCH -p general
+#SBATCH --gres=gpu:1
+#SBATCH -t 48:00:00
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=16G
+# Source the environment setup from both the current and the parent directory.
+# NOTE(review): confirm both path.sh files are expected to exist at run time.
+. path.sh
+. ../path.sh
+python client.py

server.py CHANGED Viewed

@@ -1,5 +1,3 @@
-from fastapi import FastAPI, File, UploadFile
-from fastapi.responses import FileResponse, JSONResponse
 import base64
 import argparse
 import librosa
@@ -16,7 +14,6 @@ import jiwer
 import librosa
 from svs_utils import singmos_warmup, singmos_evaluation, load_song_database, estimate_sentence_length
-app = FastAPI()
 asr_pipeline = pipeline(
     "automatic-speech-recognition",
@@ -43,14 +40,15 @@ config = argparse.Namespace(
     model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
     cache_dir="cache",
     device="cuda", # "cpu"
-    melody_source="random_generate", # "random_select.take_lyric_continuation"
     lang="zh",
 )
 # load model
 svs_model = svs_warmup(config)
 predictor, _ = singmos_warmup()
-sample_rate = 44100
 # load dataset for random_select
 song2note_lengths, song_db = load_song_database(config)
@@ -71,33 +69,40 @@ def remove_punctuation_and_replace_with_space(text):
     text = re.sub(r'[A-Za-z0-9]', ' ', text)
     text = re.sub(r'[^\w\s\u4e00-\u9fff]', ' ', text)
     text = re.sub(r'\s+', ' ', text)
     return text
 def get_lyric_format_prompts_and_metadata(config):
     if config.melody_source.startswith("random_generate"):
         return "", {}
     elif config.melody_source.startswith("random_select"):
         # get song_name and phrase_length
-        global song2note_lengths
         phrase_length, metadata = estimate_sentence_length(
             None, config, song2note_lengths
         )
         lyric_format_prompt = (
             "\n请按照歌词格式回答我的问题，每句需遵循以下字数规则："
-            + "".join(+[f"\n第{i}句：{c}个字" for i, c in enumerate(phrase_length, 1)])
             + "\n如果没有足够的信息回答，请使用最少的句子，不要重复、不要扩展、不要加入无关内容。\n"
         )
-        return lyric_format_prompt, metadata
     else:
         raise ValueError(f"Unsupported melody_source: {config.melody_source}. Unable to get lyric format prompts.")
-@app.post("/process_audio")
-async def process_audio(file: UploadFile = File(...)):
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-        tmp.write(await file.read())
-        tmp_path = tmp.name
     # load audio
     y = librosa.load(tmp_path, sr=16000)[0]
@@ -116,20 +121,24 @@ async def process_audio(file: UploadFile = File(...)):
         config,
         **additional_inference_args,
     )
-    sf.write("tmp/response.wav", wav_info, samplerate=44100)
     with open("tmp/response.wav", "rb") as f:
         audio_bytes = f.read()
         audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
-    return JSONResponse(content={
         "asr_text": asr_result,
         "llm_text": output,
         "audio": audio_b64
-    })
-@app.get("/metrics")
 def on_click_metrics():
     global predictor
     # OWSM ctc + PER
@@ -143,11 +152,11 @@ def on_click_metrics():
     ref_pinin = lazy_pinyin(ref)
     per = jiwer.wer(" ".join(ref_pinin), " ".join(hyp_pinin))
-    audio = librosa.load(f"tmp/response.wav", sr=44100)[0]
     singmos = singmos_evaluation(
         predictor,
         audio,
-        fs=44100
     )
     return f"""
 Phoneme Error Rate: {per}
@@ -170,7 +179,7 @@ def test_audio():
         svs_model,
         config,
     )
-    sf.write("tmp/response.wav", wav_info, samplerate=44100)
     with open("tmp/response.wav", "rb") as f:
         audio_bytes = f.read()
         audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")

 import base64
 import argparse
 import librosa
 import librosa
 from svs_utils import singmos_warmup, singmos_evaluation, load_song_database, estimate_sentence_length
 asr_pipeline = pipeline(
     "automatic-speech-recognition",
     model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
     cache_dir="cache",
     device="cuda", # "cpu"
+    melody_source="random_select.touhou", # "random_select.take_lyric_continuation"
     lang="zh",
+    speaker="resource/singer/singer_embedding_ace-2.npy",
 )
 # load model
 svs_model = svs_warmup(config)
 predictor, _ = singmos_warmup()
+sample_rate = 48000
 # load dataset for random_select
 song2note_lengths, song_db = load_song_database(config)
     text = re.sub(r'[A-Za-z0-9]', ' ', text)
     text = re.sub(r'[^\w\s\u4e00-\u9fff]', ' ', text)
     text = re.sub(r'\s+', ' ', text)
+    text = " ".join(text.split()[:2])
     return text
 def get_lyric_format_prompts_and_metadata(config):
+    """Return the lyric-format prompt and extra kwargs for ``config.melody_source``.
+
+    Returns:
+        A ``(lyric_format_prompt, additional_kwargs)`` tuple:
+        - melody_source starting with "random_generate": empty prompt, empty dict.
+        - melody_source starting with "random_select.touhou": empty prompt,
+          plus ``song_db`` and the metadata from ``estimate_sentence_length``.
+        - any other "random_select" source: a prompt listing the character
+          count required for each line (from ``phrase_length``), plus the
+          same ``song_db``/``metadata`` kwargs.
+
+    Raises:
+        ValueError: if ``config.melody_source`` matches none of the above.
+    """
+    global song2note_lengths
     if config.melody_source.startswith("random_generate"):
         return "", {}
+    # Checked before the generic "random_select" branch because it shares that prefix.
+    elif config.melody_source.startswith("random_select.touhou"):
+        # phrase_length is not used in this branch; only metadata is passed on.
+        phrase_length, metadata = estimate_sentence_length(
+            None, config, song2note_lengths
+        )
+        additional_kwargs = {"song_db": song_db, "metadata": metadata}
+        return "", additional_kwargs
     elif config.melody_source.startswith("random_select"):
         # get song_name and phrase_length
         phrase_length, metadata = estimate_sentence_length(
             None, config, song2note_lengths
         )
         lyric_format_prompt = (
             "\n请按照歌词格式回答我的问题，每句需遵循以下字数规则："
+            + "".join([f"\n第{i}句：{c}个字" for i, c in enumerate(phrase_length, 1)])
             + "\n如果没有足够的信息回答，请使用最少的句子，不要重复、不要扩展、不要加入无关内容。\n"
         )
+        additional_kwargs = {"song_db": song_db, "metadata": metadata}
+        return lyric_format_prompt, additional_kwargs
     else:
         raise ValueError(f"Unsupported melody_source: {config.melody_source}. Unable to get lyric format prompts.")
+def process_audio(tmp_path):
+    # with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+    #     tmp.write(await file.read())
+    #     tmp_path = tmp.name
     # load audio
     y = librosa.load(tmp_path, sr=16000)[0]
         config,
         **additional_inference_args,
     )
+    sf.write("tmp/response.wav", wav_info, samplerate=sample_rate)
     with open("tmp/response.wav", "rb") as f:
         audio_bytes = f.read()
         audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
+    return {
         "asr_text": asr_result,
         "llm_text": output,
         "audio": audio_b64
+    }
+    # return JSONResponse(content={
+    #     "asr_text": asr_result,
+    #     "llm_text": output,
+    #     "audio": audio_b64
+    # })
 def on_click_metrics():
     global predictor
     # OWSM ctc + PER
     ref_pinin = lazy_pinyin(ref)
     per = jiwer.wer(" ".join(ref_pinin), " ".join(hyp_pinin))
+    audio = librosa.load(f"tmp/response.wav", sr=sample_rate)[0]
     singmos = singmos_evaluation(
         predictor,
         audio,
+        fs=sample_rate
     )
     return f"""
 Phoneme Error Rate: {per}
         svs_model,
         config,
     )
+    sf.write("tmp/response.wav", wav_info, samplerate=sample_rate)
     with open("tmp/response.wav", "rb") as f:
         audio_bytes = f.read()
         audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")

svs_utils.py CHANGED Viewed

@@ -324,8 +324,8 @@ def load_list_from_json(json_path):
     data = [
         {
             "tempo": d["tempo"],
-            "note_start_times": [n[0] * (145/d["tempo"]) for n in d["score"]],
-            "note_end_times": [n[1] * (145/d["tempo"]) for n in d["score"]],
             "note_lyrics": ["" for n in d["score"]],
             "note_midi": [n[2] for n in d["score"]],
         }
@@ -348,8 +348,8 @@ def song_segment_iterator(song_db, metadata):
     elif song_name.startswith("touhou"):
         # return a iterator that load from touhou musics
         data = load_list_from_json("data/touhou/note_data.json")
-        for d in data:
-            yield d
     else:
         raise NotImplementedError(f"song name {song_name} not supported")
@@ -380,7 +380,7 @@ if __name__ == "__main__":
         cache_dir="cache",
         device="cuda", # "cpu"
         melody_source="random_select.touhou", #"random_generate" "random_select.take_lyric_continuation",  "random_select.touhou"
-        lang="jp",
         speaker="resource/singer/singer_embedding_ace-2.npy",
     )
@@ -390,7 +390,7 @@ if __name__ == "__main__":
     if config.lang == "zh":
         answer_text = "天气真好\n空气清新\n气温温和\n风和日丽\n天高气爽\n阳光明媚"
     elif config.lang == "jp":
-        answer_text = "世界で一番おひめさま そういう扱い心得てよね"
     else:
         print(f"Currently system does not support {config.lang}")
         exit(1)

     data = [
         {
             "tempo": d["tempo"],
+            "note_start_times": [n[0] * (100/d["tempo"]) for n in d["score"]],
+            "note_end_times": [n[1] * (100/d["tempo"]) for n in d["score"]],
             "note_lyrics": ["" for n in d["score"]],
             "note_midi": [n[2] for n in d["score"]],
         }
     elif song_name.startswith("touhou"):
         # return a iterator that load from touhou musics
         data = load_list_from_json("data/touhou/note_data.json")
+        while True:
+            yield random.choice(data)
     else:
         raise NotImplementedError(f"song name {song_name} not supported")
         cache_dir="cache",
         device="cuda", # "cpu"
         melody_source="random_select.touhou", #"random_generate" "random_select.take_lyric_continuation",  "random_select.touhou"
+        lang="zh",
         speaker="resource/singer/singer_embedding_ace-2.npy",
     )
     if config.lang == "zh":
         answer_text = "天气真好\n空气清新\n气温温和\n风和日丽\n天高气爽\n阳光明媚"
     elif config.lang == "jp":
+        answer_text = "流れてく時の中ででもけだるさが"
     else:
         print(f"Currently system does not support {config.lang}")
         exit(1)