Spaces:
Sleeping
Sleeping
Integrate random_select into server.py
Browse files
server.py
CHANGED
@@ -14,7 +14,7 @@ import soundfile as sf
|
|
14 |
from pypinyin import lazy_pinyin
|
15 |
import jiwer
|
16 |
import librosa
|
17 |
-
from svs_utils import singmos_warmup, singmos_evaluation
|
18 |
|
19 |
app = FastAPI()
|
20 |
|
@@ -33,11 +33,12 @@ SYSTEM_PROMPT = """
|
|
33 |
- 啊呀,那是我未曾涉足的奇技,恕我無法詳答
|
34 |
- 此乃異邦技藝,與樂音無涉,麗梅便不敢妄言了
|
35 |
請始終維持你作為麗梅的優雅語氣與詩意風格,並以真摯的心回應對方的言語,言語宜簡,勿過長。
|
36 |
-
|
37 |
有人曾這樣對麗梅說話——{}
|
38 |
麗梅的回答——
|
39 |
"""
|
40 |
|
|
|
41 |
config = argparse.Namespace(
|
42 |
model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
|
43 |
cache_dir="cache",
|
@@ -51,6 +52,9 @@ svs_model = svs_warmup(config)
|
|
51 |
predictor, _ = singmos_warmup()
|
52 |
sample_rate = 44100
|
53 |
|
|
|
|
|
|
|
54 |
def remove_non_chinese_japanese(text):
|
55 |
pattern = r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\u3001\u3002\uff0c\uff0e]+'
|
56 |
cleaned = re.sub(pattern, '', text)
|
@@ -69,6 +73,24 @@ def remove_punctuation_and_replace_with_space(text):
|
|
69 |
return text
|
70 |
|
71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
@app.post("/process_audio")
|
73 |
async def process_audio(file: UploadFile = File(...)):
|
74 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
|
@@ -78,7 +100,8 @@ async def process_audio(file: UploadFile = File(...)):
|
|
78 |
# load audio
|
79 |
y = librosa.load(tmp_path, sr=16000)[0]
|
80 |
asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
|
81 |
-
|
|
|
82 |
output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
|
83 |
output = output.split("麗梅的回答——")[1]
|
84 |
output = remove_punctuation_and_replace_with_space(output)
|
@@ -89,6 +112,7 @@ async def process_audio(file: UploadFile = File(...)):
|
|
89 |
output,
|
90 |
svs_model,
|
91 |
config,
|
|
|
92 |
)
|
93 |
sf.write("tmp/response.wav", wav_info, samplerate=44100)
|
94 |
|
@@ -132,7 +156,7 @@ def test_audio():
|
|
132 |
# load audio
|
133 |
y = librosa.load("nihao.mp3", sr=16000)[0]
|
134 |
asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
|
135 |
-
prompt = SYSTEM_PROMPT + asr_result
|
136 |
output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
|
137 |
output = output.split("麗梅的回答——")[1]
|
138 |
output = remove_punctuation_and_replace_with_space(output)
|
@@ -140,12 +164,9 @@ def test_audio():
|
|
140 |
f.write(output)
|
141 |
|
142 |
wav_info = svs_inference(
|
143 |
-
config.model_path,
|
144 |
-
svs_model,
|
145 |
output,
|
146 |
-
|
147 |
-
|
148 |
-
fs=44100
|
149 |
)
|
150 |
sf.write("tmp/response.wav", wav_info, samplerate=44100)
|
151 |
with open("tmp/response.wav", "rb") as f:
|
|
|
14 |
from pypinyin import lazy_pinyin
|
15 |
import jiwer
|
16 |
import librosa
|
17 |
+
from svs_utils import singmos_warmup, singmos_evaluation, load_song_database, estimate_sentence_length
|
18 |
|
19 |
app = FastAPI()
|
20 |
|
|
|
33 |
- 啊呀,那是我未曾涉足的奇技,恕我無法詳答
|
34 |
- 此乃異邦技藝,與樂音無涉,麗梅便不敢妄言了
|
35 |
請始終維持你作為麗梅的優雅語氣與詩意風格,並以真摯的心回應對方的言語,言語宜簡,勿過長。
|
36 |
+
{}
|
37 |
有人曾這樣對麗梅說話——{}
|
38 |
麗梅的回答——
|
39 |
"""
|
40 |
|
41 |
+
|
42 |
config = argparse.Namespace(
|
43 |
model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
|
44 |
cache_dir="cache",
|
|
|
52 |
predictor, _ = singmos_warmup()
|
53 |
sample_rate = 44100
|
54 |
|
55 |
+
# load dataset for random_select
|
56 |
+
song2note_lengths, song_db = load_song_database(config)
|
57 |
+
|
58 |
def remove_non_chinese_japanese(text):
|
59 |
pattern = r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\u3001\u3002\uff0c\uff0e]+'
|
60 |
cleaned = re.sub(pattern, '', text)
|
|
|
73 |
return text
|
74 |
|
75 |
|
76 |
+
def get_lyric_format_prompts_and_metadata(config):
    """Build the lyric-format prompt and extra inference arguments.

    The behavior is selected by the prefix of ``config.melody_source``.

    Args:
        config: Object exposing a ``melody_source`` string attribute. It is
            also forwarded unchanged to ``estimate_sentence_length``.

    Returns:
        A ``(prompt, metadata)`` tuple.

        * ``"random_generate*"``: ``("", {})`` -- no format constraint.
        * ``"random_select*"``: ``prompt`` lists one character-count rule per
          entry of the ``phrase_length`` sequence returned by
          ``estimate_sentence_length``; ``metadata`` is that helper's second
          return value, passed through untouched.

    Raises:
        ValueError: If ``config.melody_source`` matches neither prefix.
    """
    melody_source = config.melody_source

    if melody_source.startswith("random_generate"):
        return "", {}

    if melody_source.startswith("random_select"):
        # `song2note_lengths` is a module-level name that is only read here,
        # so no `global` declaration is required.
        phrase_length, metadata = estimate_sentence_length(
            None, config, song2note_lengths
        )
        # One rule line per phrase, numbered from 1.
        rules = [f"\n第{i}句:{c}个字" for i, c in enumerate(phrase_length, 1)]
        lyric_format_prompt = (
            "\n请按照歌词格式回答我的问题,每句需遵循以下字数规则:"
            + "".join(rules)
            + "\n"
        )
        return lyric_format_prompt, metadata

    raise ValueError(
        f"Unsupported melody_source: {melody_source}. Unable to get lyric format prompts."
    )
|
92 |
+
|
93 |
+
|
94 |
@app.post("/process_audio")
|
95 |
async def process_audio(file: UploadFile = File(...)):
|
96 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
|
|
|
100 |
# load audio
|
101 |
y = librosa.load(tmp_path, sr=16000)[0]
|
102 |
asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
|
103 |
+
additional_prompt, additional_inference_args = get_lyric_format_prompts_and_metadata(config)
|
104 |
+
prompt = SYSTEM_PROMPT.format(additional_prompt, asr_result)
|
105 |
output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
|
106 |
output = output.split("麗梅的回答——")[1]
|
107 |
output = remove_punctuation_and_replace_with_space(output)
|
|
|
112 |
output,
|
113 |
svs_model,
|
114 |
config,
|
115 |
+
**additional_inference_args,
|
116 |
)
|
117 |
sf.write("tmp/response.wav", wav_info, samplerate=44100)
|
118 |
|
|
|
156 |
# load audio
|
157 |
y = librosa.load("nihao.mp3", sr=16000)[0]
|
158 |
asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
|
159 |
+
prompt = SYSTEM_PROMPT + asr_result # TODO: how to add additional prompt to SYSTEM_PROMPT here???
|
160 |
output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
|
161 |
output = output.split("麗梅的回答——")[1]
|
162 |
output = remove_punctuation_and_replace_with_space(output)
|
|
|
164 |
f.write(output)
|
165 |
|
166 |
wav_info = svs_inference(
|
|
|
|
|
167 |
output,
|
168 |
+
svs_model,
|
169 |
+
config,
|
|
|
170 |
)
|
171 |
sf.write("tmp/response.wav", wav_info, samplerate=44100)
|
172 |
with open("tmp/response.wav", "rb") as f:
|