Spaces:

fffiloni
/

YuE

Paused

App Files Files Community

https://huggingface.co/spaces/fffiloni/YuE

by wowsuffer - opened Jan 29

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+20

-39

Files changed (3) hide show

app.py +17 -32
inference/infer.py +2 -6
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import gradio as gr
 import subprocess
 import os
-import re
 import shutil
 import tempfile
@@ -67,19 +66,11 @@ def empty_output_folder(output_dir):
             print(f"Error deleting file {file_path}: {e}")
 # Function to create a temporary file with string content
-def create_temp_file(content, prefix, suffix=".txt"):
-    temp_file = tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=prefix, suffix=suffix)
-    content = content.strip() + "\n\n"  # Add extra newline at end
-    content = content.replace("\r\n", "\n").replace("\r", "\n")
-    temp_file.write(content)
-    temp_file.close()
-    # Debug: Print file contents
-    print(f"\nContent written to {prefix}{suffix}:")
-    print(content)
-    print("---")
-    return temp_file.name
 def get_last_mp3_file(output_dir):
     # List all files in the output directory
@@ -103,8 +94,8 @@ def get_last_mp3_file(output_dir):
 def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
     # Create temporary files
-    genre_txt_path = create_temp_file(genre_txt_content, prefix="genre_")
-    lyrics_txt_path = create_temp_file(lyrics_txt_content, prefix="lyrics_")
     print(f"Genre TXT path: {genre_txt_path}")
     print(f"Lyrics TXT path: {lyrics_txt_path}")
@@ -124,10 +115,11 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
         "--genre_txt", f"{genre_txt_path}",
         "--lyrics_txt", f"{lyrics_txt_path}",
         "--run_n_segments", str(num_segments),
-        "--stage2_batch_size", "16",
         "--output_dir", f"{output_dir}",
         "--cuda_idx", "0",
-        "--max_new_tokens", str(max_new_tokens)
     ]
     # Set up environment variables for CUDA with optimized settings
@@ -155,17 +147,15 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
             if last_mp3:
                 print("Last .mp3 file:", last_mp3)
-                instrumental_mp3_path = "./output/vocoder/stems/instrumental.mp3"
-                vocal_mp3_path = "./output/vocoder/stems/vocal.mp3"
-                return last_mp3, instrumental_mp3_path, vocal_mp3_path
             else:
-                return None, None, None
         else:
             print("Output folder is empty.")
-            raise gr.Error(f"Error occurred: Output folder is empty.")
     except subprocess.CalledProcessError as e:
         print(f"Error occurred: {e}")
-        raise gr.Error(f"Error occurred: {e}")
     finally:
         # Clean up temporary files
         os.remove(genre_txt_path)
@@ -215,22 +205,17 @@ with gr.Blocks() as demo:
                 )
                 lyrics_txt = gr.Textbox(
                     label="Lyrics", lines=12,
-                    placeholder="""
-Type the lyrics here...
-At least 2 segments, Annotate your segments with brackets, [verse] [chorus] [bridge]""",
                     info="Text containing the lyrics for the music generation. These lyrics will be processed and split into structured segments to guide the generation process."
                 )
             with gr.Column():
-                num_segments = gr.Number(label="Number of Segments", value=2, interactive=False)
                 max_new_tokens = gr.Slider(label="Max New Tokens", minimum=500, maximum="3000", step=500, value=1500, interactive=True)
                 submit_btn = gr.Button("Submit")
                 music_out = gr.Audio(label="Audio Result")
-                with gr.Accordion("Vocal & Instrumental", open=False):
-                    instrumental = gr.Audio(label="Intrumental")
-                    vocal = gr.Audio(label="Vocal")
         gr.Examples(
             examples = [
@@ -273,6 +258,6 @@ Living out my dreams with this mic and a deal"""
     submit_btn.click(
         fn = infer,
         inputs = [genre_txt, lyrics_txt, num_segments, max_new_tokens],
-        outputs = [music_out, instrumental, vocal]
     )
 demo.queue().launch(show_api=False, show_error=True)

 import gradio as gr
 import subprocess
 import os
 import shutil
 import tempfile
             print(f"Error deleting file {file_path}: {e}")
 # Function to create a temporary file with string content
+def create_temp_file(content, suffix=".txt"):
+    fd, path = tempfile.mkstemp(suffix=suffix)
+    with os.fdopen(fd, "w", encoding="utf-8") as f:
+        f.write(content)
+    return path
 def get_last_mp3_file(output_dir):
     # List all files in the output directory
 def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
     # Create temporary files
+    genre_txt_path = create_temp_file(genre_txt_content, ".txt")
+    lyrics_txt_path = create_temp_file(lyrics_txt_content, ".txt")
     print(f"Genre TXT path: {genre_txt_path}")
     print(f"Lyrics TXT path: {lyrics_txt_path}")
         "--genre_txt", f"{genre_txt_path}",
         "--lyrics_txt", f"{lyrics_txt_path}",
         "--run_n_segments", str(num_segments),
+        "--stage2_batch_size", "4",
         "--output_dir", f"{output_dir}",
         "--cuda_idx", "0",
+        "--max_new_tokens", str(max_new_tokens),
+        "--disable_offload_model"
     ]
     # Set up environment variables for CUDA with optimized settings
             if last_mp3:
                 print("Last .mp3 file:", last_mp3)
+                return last_mp3
             else:
+                return None
         else:
             print("Output folder is empty.")
+            return None
     except subprocess.CalledProcessError as e:
         print(f"Error occurred: {e}")
+        return None
     finally:
         # Clean up temporary files
         os.remove(genre_txt_path)
                 )
                 lyrics_txt = gr.Textbox(
                     label="Lyrics", lines=12,
+                    placeholder="Type the lyrics here...",
                     info="Text containing the lyrics for the music generation. These lyrics will be processed and split into structured segments to guide the generation process."
                 )
             with gr.Column():
+                num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
                 max_new_tokens = gr.Slider(label="Max New Tokens", minimum=500, maximum="3000", step=500, value=1500, interactive=True)
                 submit_btn = gr.Button("Submit")
                 music_out = gr.Audio(label="Audio Result")
         gr.Examples(
             examples = [
     submit_btn.click(
         fn = infer,
         inputs = [genre_txt, lyrics_txt, num_segments, max_new_tokens],
+        outputs = [music_out]
     )
 demo.queue().launch(show_api=False, show_error=True)

inference/infer.py CHANGED Viewed

@@ -76,7 +76,7 @@ print(f"Using device: {device}")
 mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
 model = AutoModelForCausalLM.from_pretrained(
     stage1_model,
-    torch_dtype=torch.float16,
     attn_implementation="flash_attention_2", # To enable flashattn, you have to install flash-attn
     )
 model.to(device)
@@ -120,18 +120,15 @@ stage1_output_set = []
 # Tips:
 # genre tags support instrumental, genre, mood, vocal timbre and vocal gender
 # all kinds of tags are needed
-# Ensure files exist
 with open(args.genre_txt) as f:
     genres = f.read().strip()
-    print(genres)
 with open(args.lyrics_txt) as f:
     lyrics = split_lyrics(f.read())
-    print(lyrics)
 # instruction
 full_lyrics = "\n".join(lyrics)
 prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
 prompt_texts += lyrics
-print(prompt_texts)
 random_id = uuid.uuid4()
 output_seq = None
@@ -144,7 +141,6 @@ start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
 end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
 # Format text prompt
 run_n_segments = min(args.run_n_segments+1, len(lyrics))
-print(f"RUN N SEGMENTS: {run_n_segments}")
 for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
     section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
     guidance_scale = 1.5 if i <=1 else 1.2

 mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
 model = AutoModelForCausalLM.from_pretrained(
     stage1_model,
+    torch_dtype=torch.bfloat16,
     attn_implementation="flash_attention_2", # To enable flashattn, you have to install flash-attn
     )
 model.to(device)
 # Tips:
 # genre tags support instrumental, genre, mood, vocal timbre and vocal gender
 # all kinds of tags are needed
 with open(args.genre_txt) as f:
     genres = f.read().strip()
 with open(args.lyrics_txt) as f:
     lyrics = split_lyrics(f.read())
 # instruction
 full_lyrics = "\n".join(lyrics)
 prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
 prompt_texts += lyrics
 random_id = uuid.uuid4()
 output_seq = None
 end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
 # Format text prompt
 run_n_segments = min(args.run_n_segments+1, len(lyrics))
 for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
     section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
     guidance_scale = 1.5 if i <=1 else 1.2

requirements.txt CHANGED Viewed

@@ -3,7 +3,7 @@ torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118
 omegaconf
 einops
 numpy<2
-git+https://github.com/KingNish24/transformers.git@yue-patch
 sentencepiece
 tqdm
 tensorboard

 omegaconf
 einops
 numpy<2
+transformers
 sentencepiece
 tqdm
 tensorboard