Spaces:
Paused
Paused
https://huggingface.co/spaces/fffiloni/YuE
#3
by
wowsuffer
- opened
- app.py +17 -32
- inference/infer.py +2 -6
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import subprocess
|
| 3 |
import os
|
| 4 |
-
import re
|
| 5 |
import shutil
|
| 6 |
import tempfile
|
| 7 |
|
|
@@ -67,19 +66,11 @@ def empty_output_folder(output_dir):
|
|
| 67 |
print(f"Error deleting file {file_path}: {e}")
|
| 68 |
|
| 69 |
# Function to create a temporary file with string content
|
| 70 |
-
def create_temp_file(content,
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
temp_file.close()
|
| 76 |
-
|
| 77 |
-
# Debug: Print file contents
|
| 78 |
-
print(f"\nContent written to {prefix}{suffix}:")
|
| 79 |
-
print(content)
|
| 80 |
-
print("---")
|
| 81 |
-
|
| 82 |
-
return temp_file.name
|
| 83 |
|
| 84 |
def get_last_mp3_file(output_dir):
|
| 85 |
# List all files in the output directory
|
|
@@ -103,8 +94,8 @@ def get_last_mp3_file(output_dir):
|
|
| 103 |
|
| 104 |
def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
|
| 105 |
# Create temporary files
|
| 106 |
-
genre_txt_path = create_temp_file(genre_txt_content,
|
| 107 |
-
lyrics_txt_path = create_temp_file(lyrics_txt_content,
|
| 108 |
|
| 109 |
print(f"Genre TXT path: {genre_txt_path}")
|
| 110 |
print(f"Lyrics TXT path: {lyrics_txt_path}")
|
|
@@ -124,10 +115,11 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
|
|
| 124 |
"--genre_txt", f"{genre_txt_path}",
|
| 125 |
"--lyrics_txt", f"{lyrics_txt_path}",
|
| 126 |
"--run_n_segments", str(num_segments),
|
| 127 |
-
"--stage2_batch_size", "
|
| 128 |
"--output_dir", f"{output_dir}",
|
| 129 |
"--cuda_idx", "0",
|
| 130 |
-
"--max_new_tokens", str(max_new_tokens)
|
|
|
|
| 131 |
]
|
| 132 |
|
| 133 |
# Set up environment variables for CUDA with optimized settings
|
|
@@ -155,17 +147,15 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
|
|
| 155 |
|
| 156 |
if last_mp3:
|
| 157 |
print("Last .mp3 file:", last_mp3)
|
| 158 |
-
|
| 159 |
-
vocal_mp3_path = "./output/vocoder/stems/vocal.mp3"
|
| 160 |
-
return last_mp3, instrumental_mp3_path, vocal_mp3_path
|
| 161 |
else:
|
| 162 |
-
return None
|
| 163 |
else:
|
| 164 |
print("Output folder is empty.")
|
| 165 |
-
|
| 166 |
except subprocess.CalledProcessError as e:
|
| 167 |
print(f"Error occurred: {e}")
|
| 168 |
-
|
| 169 |
finally:
|
| 170 |
# Clean up temporary files
|
| 171 |
os.remove(genre_txt_path)
|
|
@@ -215,22 +205,17 @@ with gr.Blocks() as demo:
|
|
| 215 |
)
|
| 216 |
lyrics_txt = gr.Textbox(
|
| 217 |
label="Lyrics", lines=12,
|
| 218 |
-
placeholder=""
|
| 219 |
-
Type the lyrics here...
|
| 220 |
-
At least 2 segments, Annotate your segments with brackets, [verse] [chorus] [bridge]""",
|
| 221 |
info="Text containing the lyrics for the music generation. These lyrics will be processed and split into structured segments to guide the generation process."
|
| 222 |
)
|
| 223 |
|
| 224 |
with gr.Column():
|
| 225 |
|
| 226 |
-
num_segments = gr.Number(label="Number of Segments", value=2, interactive=
|
| 227 |
max_new_tokens = gr.Slider(label="Max New Tokens", minimum=500, maximum="3000", step=500, value=1500, interactive=True)
|
| 228 |
|
| 229 |
submit_btn = gr.Button("Submit")
|
| 230 |
music_out = gr.Audio(label="Audio Result")
|
| 231 |
-
with gr.Accordion("Vocal & Instrumental", open=False):
|
| 232 |
-
instrumental = gr.Audio(label="Intrumental")
|
| 233 |
-
vocal = gr.Audio(label="Vocal")
|
| 234 |
|
| 235 |
gr.Examples(
|
| 236 |
examples = [
|
|
@@ -273,6 +258,6 @@ Living out my dreams with this mic and a deal"""
|
|
| 273 |
submit_btn.click(
|
| 274 |
fn = infer,
|
| 275 |
inputs = [genre_txt, lyrics_txt, num_segments, max_new_tokens],
|
| 276 |
-
outputs = [music_out
|
| 277 |
)
|
| 278 |
demo.queue().launch(show_api=False, show_error=True)
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import subprocess
|
| 3 |
import os
|
|
|
|
| 4 |
import shutil
|
| 5 |
import tempfile
|
| 6 |
|
|
|
|
| 66 |
print(f"Error deleting file {file_path}: {e}")
|
| 67 |
|
| 68 |
# Function to create a temporary file with string content
|
| 69 |
+
def create_temp_file(content, suffix=".txt"):
|
| 70 |
+
fd, path = tempfile.mkstemp(suffix=suffix)
|
| 71 |
+
with os.fdopen(fd, "w", encoding="utf-8") as f:
|
| 72 |
+
f.write(content)
|
| 73 |
+
return path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
def get_last_mp3_file(output_dir):
|
| 76 |
# List all files in the output directory
|
|
|
|
| 94 |
|
| 95 |
def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
|
| 96 |
# Create temporary files
|
| 97 |
+
genre_txt_path = create_temp_file(genre_txt_content, ".txt")
|
| 98 |
+
lyrics_txt_path = create_temp_file(lyrics_txt_content, ".txt")
|
| 99 |
|
| 100 |
print(f"Genre TXT path: {genre_txt_path}")
|
| 101 |
print(f"Lyrics TXT path: {lyrics_txt_path}")
|
|
|
|
| 115 |
"--genre_txt", f"{genre_txt_path}",
|
| 116 |
"--lyrics_txt", f"{lyrics_txt_path}",
|
| 117 |
"--run_n_segments", str(num_segments),
|
| 118 |
+
"--stage2_batch_size", "4",
|
| 119 |
"--output_dir", f"{output_dir}",
|
| 120 |
"--cuda_idx", "0",
|
| 121 |
+
"--max_new_tokens", str(max_new_tokens),
|
| 122 |
+
"--disable_offload_model"
|
| 123 |
]
|
| 124 |
|
| 125 |
# Set up environment variables for CUDA with optimized settings
|
|
|
|
| 147 |
|
| 148 |
if last_mp3:
|
| 149 |
print("Last .mp3 file:", last_mp3)
|
| 150 |
+
return last_mp3
|
|
|
|
|
|
|
| 151 |
else:
|
| 152 |
+
return None
|
| 153 |
else:
|
| 154 |
print("Output folder is empty.")
|
| 155 |
+
return None
|
| 156 |
except subprocess.CalledProcessError as e:
|
| 157 |
print(f"Error occurred: {e}")
|
| 158 |
+
return None
|
| 159 |
finally:
|
| 160 |
# Clean up temporary files
|
| 161 |
os.remove(genre_txt_path)
|
|
|
|
| 205 |
)
|
| 206 |
lyrics_txt = gr.Textbox(
|
| 207 |
label="Lyrics", lines=12,
|
| 208 |
+
placeholder="Type the lyrics here...",
|
|
|
|
|
|
|
| 209 |
info="Text containing the lyrics for the music generation. These lyrics will be processed and split into structured segments to guide the generation process."
|
| 210 |
)
|
| 211 |
|
| 212 |
with gr.Column():
|
| 213 |
|
| 214 |
+
num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
|
| 215 |
max_new_tokens = gr.Slider(label="Max New Tokens", minimum=500, maximum="3000", step=500, value=1500, interactive=True)
|
| 216 |
|
| 217 |
submit_btn = gr.Button("Submit")
|
| 218 |
music_out = gr.Audio(label="Audio Result")
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
gr.Examples(
|
| 221 |
examples = [
|
|
|
|
| 258 |
submit_btn.click(
|
| 259 |
fn = infer,
|
| 260 |
inputs = [genre_txt, lyrics_txt, num_segments, max_new_tokens],
|
| 261 |
+
outputs = [music_out]
|
| 262 |
)
|
| 263 |
demo.queue().launch(show_api=False, show_error=True)
|
inference/infer.py
CHANGED
|
@@ -76,7 +76,7 @@ print(f"Using device: {device}")
|
|
| 76 |
mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
|
| 77 |
model = AutoModelForCausalLM.from_pretrained(
|
| 78 |
stage1_model,
|
| 79 |
-
torch_dtype=torch.
|
| 80 |
attn_implementation="flash_attention_2", # To enable flashattn, you have to install flash-attn
|
| 81 |
)
|
| 82 |
model.to(device)
|
|
@@ -120,18 +120,15 @@ stage1_output_set = []
|
|
| 120 |
# Tips:
|
| 121 |
# genre tags support instrumental, genre, mood, vocal timbre and vocal gender
|
| 122 |
# all kinds of tags are needed
|
| 123 |
-
# Ensure files exist
|
| 124 |
with open(args.genre_txt) as f:
|
| 125 |
genres = f.read().strip()
|
| 126 |
-
print(genres)
|
| 127 |
with open(args.lyrics_txt) as f:
|
| 128 |
lyrics = split_lyrics(f.read())
|
| 129 |
-
print(lyrics)
|
| 130 |
# instruction
|
| 131 |
full_lyrics = "\n".join(lyrics)
|
| 132 |
prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
|
| 133 |
prompt_texts += lyrics
|
| 134 |
-
|
| 135 |
|
| 136 |
random_id = uuid.uuid4()
|
| 137 |
output_seq = None
|
|
@@ -144,7 +141,6 @@ start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
|
|
| 144 |
end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
|
| 145 |
# Format text prompt
|
| 146 |
run_n_segments = min(args.run_n_segments+1, len(lyrics))
|
| 147 |
-
print(f"RUN N SEGMENTS: {run_n_segments}")
|
| 148 |
for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
|
| 149 |
section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
|
| 150 |
guidance_scale = 1.5 if i <=1 else 1.2
|
|
|
|
| 76 |
mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
|
| 77 |
model = AutoModelForCausalLM.from_pretrained(
|
| 78 |
stage1_model,
|
| 79 |
+
torch_dtype=torch.bfloat16,
|
| 80 |
attn_implementation="flash_attention_2", # To enable flashattn, you have to install flash-attn
|
| 81 |
)
|
| 82 |
model.to(device)
|
|
|
|
| 120 |
# Tips:
|
| 121 |
# genre tags support instrumental, genre, mood, vocal timbre and vocal gender
|
| 122 |
# all kinds of tags are needed
|
|
|
|
| 123 |
with open(args.genre_txt) as f:
|
| 124 |
genres = f.read().strip()
|
|
|
|
| 125 |
with open(args.lyrics_txt) as f:
|
| 126 |
lyrics = split_lyrics(f.read())
|
|
|
|
| 127 |
# instruction
|
| 128 |
full_lyrics = "\n".join(lyrics)
|
| 129 |
prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
|
| 130 |
prompt_texts += lyrics
|
| 131 |
+
|
| 132 |
|
| 133 |
random_id = uuid.uuid4()
|
| 134 |
output_seq = None
|
|
|
|
| 141 |
end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
|
| 142 |
# Format text prompt
|
| 143 |
run_n_segments = min(args.run_n_segments+1, len(lyrics))
|
|
|
|
| 144 |
for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
|
| 145 |
section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
|
| 146 |
guidance_scale = 1.5 if i <=1 else 1.2
|
requirements.txt
CHANGED
|
@@ -3,7 +3,7 @@ torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118
|
|
| 3 |
omegaconf
|
| 4 |
einops
|
| 5 |
numpy<2
|
| 6 |
-
|
| 7 |
sentencepiece
|
| 8 |
tqdm
|
| 9 |
tensorboard
|
|
|
|
| 3 |
omegaconf
|
| 4 |
einops
|
| 5 |
numpy<2
|
| 6 |
+
transformers
|
| 7 |
sentencepiece
|
| 8 |
tqdm
|
| 9 |
tensorboard
|