herimor committed
Commit 03e05c4 · 1 Parent(s): e961174
Files changed (4)
  1. README.md +15 -4
  2. app.py +126 -0
  3. configs/generator.json +49 -0
  4. requirements.txt +3 -0
README.md CHANGED
@@ -1,14 +1,25 @@
 ---
-title: Voxtream
-emoji: 🐠
+title: VoXtream
+emoji:
 colorFrom: green
 colorTo: yellow
 sdk: gradio
-sdk_version: 5.47.1
+python_version: 3.11
+sdk_version: 4.44.1
 app_file: app.py
-pinned: false
+pinned: true
 license: cc-by-4.0
 short_description: Generate speech from text and an audio prompt
+models:
+- herimor/voxtream
+tags:
+- text-to-speech
+- speech-synthesis
+preload_from_hub:
+- herimor/voxtream model.safetensors,config.json,phoneme_to_token.json
+- kyutai/moshiko-pytorch-bf16 tokenizer-e351c8d8-checkpoint125.safetensors
+- charsiu/en_w2v2_fc_10ms pytorch_model.bin,config.json
+- charsiu/tokenizer_en_cmu special_tokens_map.json,tokenizer_config.json,vocab.json
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
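Note: the `preload_from_hub` entries above cache the listed files into the Space at build time. For local experimentation, the same files can be fetched with `huggingface_hub`; a minimal sketch (only the repo ids and filenames come from the front matter, the surrounding script is an assumption):

```python
# Sketch: mirror the preload_from_hub entries locally with huggingface_hub.
from huggingface_hub import hf_hub_download

preload = {
    "herimor/voxtream": ["model.safetensors", "config.json", "phoneme_to_token.json"],
    "kyutai/moshiko-pytorch-bf16": ["tokenizer-e351c8d8-checkpoint125.safetensors"],
    "charsiu/en_w2v2_fc_10ms": ["pytorch_model.bin", "config.json"],
    "charsiu/tokenizer_en_cmu": [
        "special_tokens_map.json",
        "tokenizer_config.json",
        "vocab.json",
    ],
}

for repo_id, filenames in preload.items():
    for filename in filenames:
        # Files land in the local Hugging Face cache; the returned path points at them.
        print(hf_hub_download(repo_id=repo_id, filename=filename))
```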
app.py ADDED
@@ -0,0 +1,126 @@
+import json
+from pathlib import Path
+
+import nltk
+import torch
+import spaces
+import gradio as gr
+import numpy as np
+
+from voxtream.generator import SpeechGenerator, SpeechGeneratorConfig
+
+with open("configs/generator.json") as f:
+    config = SpeechGeneratorConfig(**json.load(f))
+
+# Loading speaker encoder
+torch.hub.load(
+    config.spk_enc_repo,
+    config.spk_enc_model,
+    model_name=config.spk_enc_model_name,
+    train_type=config.spk_enc_train_type,
+    dataset=config.spk_enc_dataset,
+    trust_repo=True,
+    verbose=False,
+)
+# Loading NLTK packages
+nltk.download("averaged_perceptron_tagger_eng", quiet=True, raise_on_error=True)
+nltk.download("punkt", quiet=True, raise_on_error=True)
+
+# Initialize speech generator
+speech_generator = SpeechGenerator(config)
+
+CUSTOM_CSS = """
+/* overall width */
+.gradio-container {max-width: 1100px !important}
+/* stack labels tighter and even heights */
+#cols .wrap > .form {gap: 10px}
+#left-col, #right-col {gap: 14px}
+/* make submit centered + bigger */
+#submit {width: 260px; margin: 10px auto 0 auto;}
+/* make clear align left and look secondary */
+#clear {width: 120px;}
+/* give audio a little breathing room */
+audio {outline: none;}
+"""
+
+
+@spaces.GPU
+def synthesize_fn(prompt_audio_path, prompt_text, target_text):
+    if speech_generator.model.device == "cpu":
+        speech_generator.model.to("cuda")
+        speech_generator.mimi.to("cuda")
+        speech_generator.spk_enc.to("cuda")
+        speech_generator.aligner.to("cuda")
+
+    if not prompt_audio_path or not target_text:
+        return None
+    stream = speech_generator.generate_stream(
+        prompt_text=prompt_text,
+        prompt_audio_path=Path(prompt_audio_path),
+        text=target_text,
+    )
+    frames = [frame for frame, _ in stream]
+    if not frames:
+        return None
+    waveform = np.concatenate(frames).astype(np.float32)
+
+    # Fade out
+    fade_len_sec = 0.1
+    fade_out = np.linspace(1.0, 0.0, int(config.mimi_sr * fade_len_sec))
+    waveform[-int(config.mimi_sr * fade_len_sec) :] *= fade_out
+
+    return (config.mimi_sr, waveform)
+
+
+def main():
+    with gr.Blocks(css=CUSTOM_CSS, title="VoXtream") as demo:
+        gr.Markdown("# VoXtream TTS demo")
+
+        with gr.Row(equal_height=True, elem_id="cols"):
+            with gr.Column(scale=1, elem_id="left-col"):
+                prompt_audio = gr.Audio(
+                    sources=["microphone", "upload"],
+                    type="filepath",
+                    label="Prompt audio (3-5 sec of target voice)",
+                )
+                prompt_text = gr.Textbox(
+                    lines=3,
+                    label="Prompt transcript",
+                    placeholder="Text that matches the prompt audio (Required)",
+                )
+
+            with gr.Column(scale=1, elem_id="right-col"):
+                target_text = gr.Textbox(
+                    lines=3,
+                    label="Target text",
+                    placeholder="What you want the model to say",
+                )
+                output_audio = gr.Audio(
+                    type="numpy",
+                    label="Synthesized audio",
+                    interactive=False,
+                )
+
+        with gr.Row():
+            clear_btn = gr.Button("Clear", elem_id="clear", variant="secondary")
+            submit_btn = gr.Button("Submit", elem_id="submit", variant="primary")
+
+        # wire up actions
+        submit_btn.click(
+            fn=synthesize_fn,
+            inputs=[prompt_audio, prompt_text, target_text],
+            outputs=output_audio,
+        )
+
+        # reset everything
+        clear_btn.click(
+            fn=lambda: (None, "", "", None),
+            inputs=[],
+            outputs=[prompt_audio, prompt_text, target_text, output_audio],
+        )
+
+    demo.launch()
+
+
+if __name__ == "__main__":
+    main()
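With `gradio_client` pinned in requirements.txt, the Space can also be driven programmatically. A minimal sketch, assuming the Space id matches the model repo (`herimor/voxtream`), a hypothetical local `prompt.wav`, and the default endpoint name Gradio derives from `synthesize_fn`:

```python
# Sketch: call the Space remotely with gradio_client (Space id and endpoint name are assumptions).
from gradio_client import Client, handle_file

client = Client("herimor/voxtream")
result = client.predict(
    handle_file("prompt.wav"),          # prompt audio: 3-5 s of the target voice (hypothetical file)
    "Transcript of the prompt audio.",  # prompt transcript
    "Text the model should speak.",     # target text
    api_name="/synthesize_fn",          # assumed default name for the submit handler
)
print(result)  # reference to the synthesized audio returned by the Space
```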
configs/generator.json ADDED
@@ -0,0 +1,49 @@
+{
+    "sil_token": 69,
+    "bos_token": 71,
+    "eos_token": 72,
+    "end_pad": 5,
+    "num_codebooks": 12,
+    "num_phones_per_frame": 2,
+    "audio_delay_frames": 1,
+    "temperature": 0.9,
+    "topk": 5,
+    "max_audio_length_ms": 60000,
+    "device": "cpu",
+    "model_repo": "herimor/voxtream",
+    "model_name": "model.safetensors",
+    "model_config_name": "config.json",
+    "mimi_sr": 24000,
+    "mimi_vocab_size": 2048,
+    "mimi_frame_ms": 80,
+    "mimi_repo": "kyutai/moshiko-pytorch-bf16",
+    "mimi_name": "tokenizer-e351c8d8-checkpoint125.safetensors",
+    "spk_enc_sr": 16000,
+    "spk_enc_repo": "IDRnD/ReDimNet",
+    "spk_enc_model": "ReDimNet",
+    "spk_enc_model_name": "M",
+    "spk_enc_train_type": "ft_mix",
+    "spk_enc_dataset": "vb2+vox2+cnc",
+    "phoneme_dict_name": "phoneme_to_token.json",
+    "nltk_resource": "taggers/averaged_perceptron_tagger_eng",
+    "aligner": "charsiu/en_w2v2_fc_10ms",
+    "cache_prompt": false,
+    "phoneme_index_map": {
+        "0": [
+            0,
+            1
+        ],
+        "1": [
+            0,
+            2
+        ],
+        "2": [
+            1,
+            1
+        ],
+        "3": [
+            1,
+            2
+        ]
+    }
+}
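app.py loads this file straight into `SpeechGeneratorConfig`, so individual fields can be overridden before the generator is built. A minimal sketch (the overridden values are illustrative, not recommendations):

```python
# Sketch: load configs/generator.json and override fields before building the generator.
import json

from voxtream.generator import SpeechGenerator, SpeechGeneratorConfig

with open("configs/generator.json") as f:
    cfg = json.load(f)

cfg["temperature"] = 0.7  # illustrative; the shipped default is 0.9
cfg["device"] = "cuda"    # run on GPU instead of the default "cpu"

speech_generator = SpeechGenerator(SpeechGeneratorConfig(**cfg))
```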
requirements.txt ADDED
@@ -0,0 +1,3 @@
+voxtream
+gradio_client==1.3.0
+pydantic==2.10.6
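On Spaces, `gradio` and `spaces` come with the SDK declared in the README front matter, and the remaining imports in app.py presumably arrive as dependencies of `voxtream`; only the two pins above are fixed explicitly. A small sketch to inspect the pinned versions in the running environment (purely illustrative):

```python
# Sketch: report the pinned requirement versions actually installed.
from importlib.metadata import version

for pkg, pinned in {"gradio_client": "1.3.0", "pydantic": "2.10.6"}.items():
    print(f"{pkg}: installed {version(pkg)}, pinned {pinned}")
```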