jhansss committed on
Commit
50dd0bc
·
1 Parent(s): c6ab4e9

Add CLI functionality and restructure configuration files

Browse files
app.py CHANGED
@@ -3,7 +3,8 @@ from interface import GradioInterface
3
 
4
  def main():
5
  demo = GradioInterface(
6
- options_config="config/options.yaml", default_config="config/default.yaml"
 
7
  ).create_interface()
8
  demo.launch()
9
 
 
3
 
4
  def main():
5
  demo = GradioInterface(
6
+ options_config="config/interface/options.yaml",
7
+ default_config="config/interface/default.yaml",
8
  ).create_interface()
9
  demo.launch()
10
 
cli.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from argparse import ArgumentParser
2
+ from logging import getLogger
3
+
4
+ import soundfile as sf
5
+ import yaml
6
+
7
+ from characters import CHARACTERS
8
+ from pipeline import SingingDialoguePipeline
9
+
10
+ logger = getLogger(__name__)
11
+
12
+
13
def get_parser():
    """Build the command-line argument parser for the CLI.

    Returns:
        An ``ArgumentParser`` with three required string options:
        ``--query_audio``, ``--config_path`` and ``--output_audio``.
    """
    parser = ArgumentParser(
        description="Run the singing dialogue pipeline on a single audio query."
    )
    parser.add_argument(
        "--query_audio",
        type=str,
        required=True,
        help="Path to the input (query) audio file passed to the pipeline.",
    )
    parser.add_argument(
        "--config_path",
        type=str,
        required=True,
        help="Path to the YAML configuration file for the pipeline.",
    )
    parser.add_argument(
        "--output_audio",
        type=str,
        required=True,
        help="Path where the synthesized audio is written.",
    )
    return parser
19
+
20
+
21
def load_config(config_path: str):
    """Load a YAML configuration file and return its top-level mapping.

    Args:
        config_path: Path to the YAML file.

    Returns:
        The parsed configuration as a dict.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        ValueError: If the document is empty or its top level is not a mapping.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    # safe_load returns None for an empty document; callers index the result
    # by key, so reject anything that is not a mapping with a clear message.
    if not isinstance(config, dict):
        raise ValueError(
            f"Config file {config_path!r} must contain a YAML mapping at the "
            f"top level, got {type(config).__name__}"
        )
    return config
25
+
26
+
27
def main():
    """Run the pipeline once from the command line.

    Parses the CLI arguments, loads the YAML config, runs the pipeline on the
    query audio, logs the ASR and LLM text, and writes the synthesized audio
    to the requested output path.

    Raises:
        KeyError: If the config lacks ``speaker``, ``language`` or
            ``prompt_template_character``, or the character name is not in
            ``CHARACTERS``.
    """
    import logging

    # Without any handler configured, INFO records are dropped (the default
    # last-resort handler only emits WARNING and above), so the result summary
    # below would never be shown. basicConfig is a no-op if the root logger
    # already has handlers.
    logging.basicConfig(level=logging.INFO)

    args = get_parser().parse_args()
    config = load_config(args.config_path)

    # Read every required setting before building the pipeline so that an
    # incomplete config fails first (constructing the pipeline is presumably
    # the costly step -- TODO confirm).
    speaker = config["speaker"]
    language = config["language"]
    character_name = config["prompt_template_character"]
    prompt_template = CHARACTERS[character_name].prompt

    pipeline = SingingDialoguePipeline(config)
    results = pipeline.run(args.query_audio, language, prompt_template, speaker)

    # Lazy %-style arguments: formatting happens only if the record is emitted.
    logger.info(
        "Input: %s, Output: %s, ASR results: %s, LLM results: %s",
        args.query_audio,
        args.output_audio,
        results["asr_text"],
        results["llm_text"],
    )

    svs_audio, svs_sample_rate = results["svs_audio"]
    sf.write(args.output_audio, svs_audio, svs_sample_rate)
43
+
44
+
45
+ if __name__ == "__main__":
46
+ main()
config/cli/limei_default.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ asr_model: openai/whisper-large-v3-turbo
2
+ llm_model: google/gemma-2-2b
3
+ svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
4
+ melody_source: sample-lyric-kising
5
+ language: mandarin
6
+ prompt_template_character: Limei
7
+ speaker: 5
8
+ cache_dir: .cache
9
+
10
+ track_latency: True
11
+ evaluators:
12
+ svs:
13
+ - singmos
14
+ - per
15
+ - melody
16
+ - aesthetic
config/cli/yaoyin_default.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ asr_model: openai/whisper-large-v3-turbo
2
+ llm_model: google/gemma-2-2b
3
+ svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
4
+ melody_source: sample-lyric-kising
5
+ language: mandarin
6
+ prompt_template_character: Yaoyin
7
+ speaker: 9
8
+ cache_dir: .cache
9
+
10
+ track_latency: True
11
+ evaluators:
12
+ svs:
13
+ - singmos
14
+ - per
15
+ - melody
16
+ - aesthetic
config/{default.yaml → interface/default.yaml} RENAMED
File without changes
config/interface/options.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ asr_models:
2
+ - id: openai/whisper-large-v3-turbo
3
+ name: Whisper large-v3-turbo
4
+ - id: openai/whisper-large-v3
5
+ name: Whisper large-v3
6
+ - id: openai/whisper-medium
7
+ name: Whisper medium
8
+ - id: sanchit-gandhi/whisper-small-dv
9
+ name: Whisper small-dv
10
+ - id: facebook/wav2vec2-base-960h
11
+ name: Wav2Vec2-Base-960h
12
+
13
+ llm_models:
14
+ - id: google/gemma-2-2b
15
+ name: Gemma 2 2B
16
+ - id: MiniMaxAI/MiniMax-M1-80k
17
+ name: MiniMax M1 80k
18
+
19
+ svs_models:
20
+ - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
21
+ name: Visinger2 (Bilingual)-zh
22
+ model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
23
+ lang: mandarin
24
+ voices:
25
+ voice1: resource/singer/singer_embedding_ace-2.npy
26
+ voice2: resource/singer/singer_embedding_ace-8.npy
27
+ voice3: resource/singer/singer_embedding_itako.npy
28
+ voice4: resource/singer/singer_embedding_kising_orange.npy
29
+ voice5: resource/singer/singer_embedding_m4singer_Alto-4.npy
30
+ - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
31
+ name: Visinger2 (Bilingual)-jp
32
+ model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
33
+ lang: japanese
34
+ voices:
35
+ voice1: resource/singer/singer_embedding_ace-2.npy
36
+ voice2: resource/singer/singer_embedding_ace-8.npy
37
+ voice3: resource/singer/singer_embedding_itako.npy
38
+ voice4: resource/singer/singer_embedding_kising_orange.npy
39
+ voice5: resource/singer/singer_embedding_m4singer_Alto-4.npy
40
+ - id: mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain
41
+ name: Visinger2 (Chinese)
42
+ model_path: espnet/aceopencpop_svs_visinger2_40singer_pretrain
43
+ lang: mandarin
44
+ voices:
45
+ voice1: 5
46
+ voice2: 8
47
+ voice3: 12
48
+ voice4: 15
49
+ voice5: 29
50
+
51
+ melody_sources:
52
+ - id: gen-random-none
53
+ name: Random Generation
54
+ desc: "Melody is generated without any structure or reference."
55
+ - id: sample-note-kising
56
+ name: Sampled Melody (KiSing)
57
+ desc: "Melody is retrieved from KiSing dataset."
58
+ - id: sample-note-touhou
59
+ name: Sampled Melody (Touhou)
60
+ desc: "Melody is retrieved from Touhou dataset."
61
+ - id: sample-lyric-kising
62
+ name: Sampled Melody with Lyrics (Kising)
63
+ desc: "Melody with aligned lyrics are sampled from Kising dataset."
requirements.txt CHANGED
@@ -17,3 +17,4 @@ zhconv
17
  git+https://github.com/sea-turt1e/kanjiconv
18
  soundfile
19
  PyYAML
 
 
17
  git+https://github.com/sea-turt1e/kanjiconv
18
  soundfile
19
  PyYAML
20
+ gradio