Spaces:

Bils
/

AIPromoStudio

Running on Zero

App Files Community

Bils committed on Jan 30

Commit

2169070

verified ·

1 Parent(s): 464b686

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -184

app.py CHANGED Viewed

@@ -25,12 +25,10 @@ MODEL_CONFIG = {
     "llama_models": {
         "Meta-Llama-3-8B": "meta-llama/Meta-Llama-3-8B-Instruct",
         "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2",
-        "Phi-3-mini": "microsoft/Phi-3-mini-4k-instruct"
     },
     "tts_models": {
         "Standard English": "tts_models/en/ljspeech/tacotron2-DDC",
         "High Quality": "tts_models/en/ljspeech/vits",
-        "Fast Inference": "tts_models/en/sam/tacotron-DDC"
     }
 }
@@ -45,19 +43,17 @@ class ModelManager:
     def get_llama_pipeline(self, model_id, token):
         if model_id not in self.llama_pipelines:
-            tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
             model = AutoModelForCausalLM.from_pretrained(
                 model_id,
-                token=token,
                 torch_dtype=torch.float16,
-                device_map="auto",
-                attn_implementation="flash_attention_2"
             )
             self.llama_pipelines[model_id] = pipeline(
                 "text-generation",
                 model=model,
-                tokenizer=tokenizer,
-                device_map="auto"
             )
         return self.llama_pipelines[model_id]
@@ -65,8 +61,6 @@ class ModelManager:
         if model_key not in self.musicgen_models:
             model = MusicgenForConditionalGeneration.from_pretrained(model_key)
             processor = AutoProcessor.from_pretrained(model_key)
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            model.to(device)
             self.musicgen_models[model_key] = (model, processor)
         return self.musicgen_models[model_key]
@@ -80,34 +74,26 @@ model_manager = ModelManager()
 # -------------------------------
 # Core Functions
 # -------------------------------
-@spaces.GPU(duration=120)
-def generate_script(user_prompt, model_id, duration, temperature=0.7, max_tokens=512):
     try:
         text_pipeline = model_manager.get_llama_pipeline(model_id, HF_TOKEN)
-        system_prompt = f"""You are an AI audio production assistant. Create content for a {duration}-second promo:
-1. Voice Script: [Clear, engaging narration]
-2. Sound Design: [3-5 specific sound effects]
-3. Music: [Genre, tempo, mood suggestions]
-Keep sections concise and production-ready."""
-        messages = [
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_prompt}
-        ]
-        response = text_pipeline(
-            messages,
-            max_new_tokens=max_tokens,
             temperature=temperature,
-            do_sample=True,
-            top_p=0.95,
-            eos_token_id=text_pipeline.tokenizer.eos_token_id
         )
-        return parse_generated_content(response[0]['generated_text'][-1]['content'])
     except Exception as e:
         return f"Error: {str(e)}", "", ""
@@ -136,68 +122,48 @@ def parse_generated_content(text):
     return sections["Voice Script"].strip(), sections["Sound Design"].strip(), sections["Music"].strip()
-@spaces.GPU(duration=100)
 def generate_voice(script, tts_model, speed=1.0):
     try:
         if not script.strip():
-            raise ValueError("Empty script")
         tts = model_manager.get_tts_model(tts_model)
-        output_path = os.path.join(tempfile.gettempdir(), "enhanced_voice.wav")
-        tts.tts_to_file(
-            text=script,
-            file_path=output_path,
-            speed=speed
-        )
         return output_path
     except Exception as e:
         return f"Error: {str(e)}"
-@spaces.GPU(duration=150)
-def generate_music(prompt, duration_sec=30, temperature=1.0, guidance_scale=3.0):
     try:
         model, processor = model_manager.get_musicgen_model()
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        inputs = processor(
-            text=[prompt],
-            padding=True,
-            return_tensors="pt",
-        ).to(device)
-        audio_values = model.generate(
-            **inputs,
-            max_new_tokens=int(duration_sec * 50),
-            temperature=temperature,
-            guidance_scale=guidance_scale,
-            do_sample=True
-        )
-        output_path = os.path.join(tempfile.gettempdir(), "enhanced_music.wav")
-        write(output_path, 32000, audio_values[0, 0].cpu().numpy())
         return output_path
     except Exception as e:
         return f"Error: {str(e)}"
-def blend_audio(voice_path, music_path, ducking=True, duck_level=10, crossfade=500):
     try:
         voice = AudioSegment.from_wav(voice_path)
         music = AudioSegment.from_wav(music_path)
         if len(music) < len(voice):
-            loops = (len(voice) // len(music)) + 1
-            music = music * loops
-        music = music[:len(voice)].fade_out(crossfade)
         if ducking:
-            ducked_music = music - duck_level
-            mixed = ducked_music.overlay(voice.fade_in(crossfade))
-        else:
-            mixed = music.overlay(voice)
-        output_path = os.path.join(tempfile.gettempdir(), "enhanced_mix.wav")
         mixed.export(output_path, format="wav")
         return output_path
     except Exception as e:
@@ -206,132 +172,84 @@ def blend_audio(voice_path, music_path, ducking=True, duck_level=10, crossfade=5
 # -------------------------------
 # Gradio Interface
 # -------------------------------
-theme = gr.themes.Soft(
-    primary_hue="blue",
-    secondary_hue="teal",
-).set(
-    body_text_color_dark='#FFFFFF',
-    background_fill_primary_dark='#1F1F1F'
-)
-with gr.Blocks(theme=theme, title="AI Audio Studio Pro") as demo:
     gr.Markdown("""
-    # 🎙️ AI Audio Studio Pro
-    *Next-generation audio production powered by AI*
     """)
     with gr.Tabs():
-        with gr.Tab("🎯 Concept Development"):
             with gr.Row():
-                with gr.Column(scale=2):
-                    concept_input = gr.Textbox(
-                        label="Your Concept",
-                        placeholder="Describe your audio project...",
-                        lines=3,
-                        max_lines=6
-                    )
-                    with gr.Accordion("Advanced Settings", open=False):
-                        with gr.Row():
-                            model_selector = gr.Dropdown(
-                                choices=list(MODEL_CONFIG["llama_models"].values()),
-                                label="AI Model",
-                                value=MODEL_CONFIG["llama_models"]["Meta-Llama-3-8B"]
-                            )
-                            duration_slider = gr.Slider(15, 120, value=30, step=15, label="Duration (seconds)")
-                        with gr.Row():
-                            temp_slider = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Creativity")
-                            token_slider = gr.Slider(128, 1024, value=512, step=128, label="Max Length")
-                    generate_btn = gr.Button("✨ Generate Concept", variant="primary")
-                with gr.Column(scale=1):
-                    script_output = gr.Textbox(label="Voice Script", interactive=True)
-                    sound_output = gr.Textbox(label="Sound Design", interactive=True)
-                    music_output = gr.Textbox(label="Music Suggestions", interactive=True)
-            generate_btn.click(
-                generate_script,
-                inputs=[concept_input, model_selector, duration_slider, temp_slider, token_slider],
-                outputs=[script_output, sound_output, music_output]
             )
-        with gr.Tab("🗣️ Voice Production"):
-            with gr.Row():
-                with gr.Column():
-                    tts_model = gr.Dropdown(
-                        choices=list(MODEL_CONFIG["tts_models"].values()),
-                        label="Voice Model",
-                        value=MODEL_CONFIG["tts_models"]["Standard English"]
-                    )
-                    speed_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speaking Rate")
-                    voice_btn = gr.Button("🎙️ Generate Voiceover", variant="primary")
-                with gr.Column():
-                    voice_preview = gr.Audio(label="Preview", interactive=False)
-                    voice_btn.click(
-                        generate_voice,
-                        inputs=[script_output, tts_model, speed_slider],
-                        outputs=voice_preview
-                    )
-        with gr.Tab("🎶 Music Production"):
-            with gr.Row():
-                with gr.Column():
-                    with gr.Accordion("Music Parameters", open=True):
-                        music_duration = gr.Slider(10, 120, value=30, label="Duration (seconds)")
-                        music_temp = gr.Slider(0.1, 2.0, value=1.0, label="Creativity")
-                        guidance_scale = gr.Slider(1.0, 5.0, value=3.0, label="Focus")
-                    music_btn = gr.Button("🎵 Generate Music", variant="primary")
-                with gr.Column():
-                    music_preview = gr.Audio(label="Preview", interactive=False)
-                    music_btn.click(
-                        generate_music,
-                        inputs=[music_output, music_duration, music_temp, guidance_scale],
-                        outputs=music_preview
-                    )
-        with gr.Tab("🔊 Final Mix"):
             with gr.Row():
-                with gr.Column():
-                    ducking_toggle = gr.Checkbox(value=True, label="Enable Voice Ducking")
-                    duck_level = gr.Slider(0, 30, value=12, label="Ducking Strength (dB)")
-                    crossfade_time = gr.Slider(0, 2000, value=500, label="Crossfade (ms)")
-                    mix_btn = gr.Button("🚀 Create Final Mix", variant="primary")
-                with gr.Column():
-                    final_mix = gr.Audio(label="Master Output", interactive=False)
-                    mix_btn.click(
-                        blend_audio,
-                        inputs=[voice_preview, music_preview, ducking_toggle, duck_level, crossfade_time],
-                        outputs=final_mix
-                    )
-    with gr.Accordion("📚 Example Prompts", open=False):
-        gr.Examples(
-            examples=[
-                ["A 30-second tech podcast intro with futuristic sounds"],
-                ["A 15-second radio ad for a coffee shop with morning vibes"],
-                ["A 60-second documentary trailer with epic orchestral music"]
-            ],
-            inputs=concept_input
-        )
-    with gr.Row():
-        gr.Markdown("### System Resources")
-        gpu_status = gr.Textbox(label="GPU Utilization", interactive=False)
-        ram_status = gr.Textbox(label="RAM Usage", interactive=False)
-    # Custom Footer
     gr.Markdown("""
-    <hr>
-    <p style="text-align: center; font-size: 0.9em;">
-        Created with ❤️ by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a>
-    </p>
     """)
-    gr.HTML("""
-    <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold">
-        <img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold&countColor=%23263759" />
-    </a>
-    """)
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)

     "llama_models": {
         "Meta-Llama-3-8B": "meta-llama/Meta-Llama-3-8B-Instruct",
         "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2",
     },
     "tts_models": {
         "Standard English": "tts_models/en/ljspeech/tacotron2-DDC",
         "High Quality": "tts_models/en/ljspeech/vits",
     }
 }
     def get_llama_pipeline(self, model_id, token):
         if model_id not in self.llama_pipelines:
+            tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
             model = AutoModelForCausalLM.from_pretrained(
                 model_id,
+                use_auth_token=token,
                 torch_dtype=torch.float16,
+                device_map="auto"
             )
             self.llama_pipelines[model_id] = pipeline(
                 "text-generation",
                 model=model,
+                tokenizer=tokenizer
             )
         return self.llama_pipelines[model_id]
         if model_key not in self.musicgen_models:
             model = MusicgenForConditionalGeneration.from_pretrained(model_key)
             processor = AutoProcessor.from_pretrained(model_key)
             self.musicgen_models[model_key] = (model, processor)
         return self.musicgen_models[model_key]
 # -------------------------------
 # Core Functions
 # -------------------------------
+@spaces.GPU
+def generate_script(user_prompt, model_id, duration, temperature=0.7):
     try:
         text_pipeline = model_manager.get_llama_pipeline(model_id, HF_TOKEN)
+        prompt = f"""Create a {duration}-second audio promo script with these elements:
+1. Voice Script: [clear narration]
+2. Sound Design: [3-5 effects]
+3. Music: [genre/tempo]
+Concept: {user_prompt}"""
+        result = text_pipeline(
+            prompt,
+            max_new_tokens=300,
             temperature=temperature,
+            do_sample=True
         )
+        return parse_generated_content(result[0]["generated_text"])
     except Exception as e:
         return f"Error: {str(e)}", "", ""
     return sections["Voice Script"].strip(), sections["Sound Design"].strip(), sections["Music"].strip()
+@spaces.GPU
 def generate_voice(script, tts_model, speed=1.0):
     try:
         if not script.strip():
+            return "Error: No script provided"
         tts = model_manager.get_tts_model(tts_model)
+        output_path = os.path.join(tempfile.gettempdir(), "voice.wav")
+        tts.tts_to_file(text=script, file_path=output_path)
         return output_path
     except Exception as e:
         return f"Error: {str(e)}"
+@spaces.GPU
+def generate_music(prompt, duration_sec=30):
     try:
         model, processor = model_manager.get_musicgen_model()
+        inputs = processor(text=[prompt], padding=True, return_tensors="pt")
+        audio_values = model.generate(**inputs, max_new_tokens=int(duration_sec * 50))
+        output_path = os.path.join(tempfile.gettempdir(), "music.wav")
+        write(output_path, 44100, audio_values[0, 0].cpu().numpy())
         return output_path
     except Exception as e:
         return f"Error: {str(e)}"
+def blend_audio(voice_path, music_path, ducking=True, duck_level=10):
     try:
         voice = AudioSegment.from_wav(voice_path)
         music = AudioSegment.from_wav(music_path)
+        # Align durations
         if len(music) < len(voice):
+            music = music * (len(voice) // len(music) + 1)
+        music = music[:len(voice)]
+        # Apply ducking
         if ducking:
+            music = music - duck_level
+        mixed = music.overlay(voice)
+        output_path = os.path.join(tempfile.gettempdir(), "final_mix.wav")
         mixed.export(output_path, format="wav")
         return output_path
     except Exception as e:
 # -------------------------------
 # Gradio Interface
 # -------------------------------
+with gr.Blocks(title="AI Radio Studio", css=".gradio-container {max-width: 800px !important}") as demo:
     gr.Markdown("""
+    # 🎙️ AI Radio Studio
+    *Create professional audio content in 4 easy steps*
     """)
     with gr.Tabs():
+        with gr.Tab("1️⃣ Concept"):
+            concept_input = gr.Textbox(label="Your Idea", placeholder="Describe your radio promo...", lines=3)
             with gr.Row():
+                model_select = gr.Dropdown(
+                    choices=list(MODEL_CONFIG["llama_models"].values()),
+                    label="AI Model",
+                    value="meta-llama/Meta-Llama-3-8B-Instruct"
+                )
+                duration_select = gr.Slider(15, 60, 30, step=15, label="Duration (sec)")
+            generate_btn = gr.Button("Generate Script", variant="primary")
+            script_output = gr.Textbox(label="Voice Script", interactive=True)
+            sound_output = gr.Textbox(label="Sound Effects", interactive=True)
+            music_output = gr.Textbox(label="Music Style", interactive=True)
+        with gr.Tab("2️⃣ Voice"):
+            tts_select = gr.Dropdown(
+                choices=list(MODEL_CONFIG["tts_models"].values()),
+                label="Voice Model",
+                value="tts_models/en/ljspeech/tacotron2-DDC"
             )
+            voice_btn = gr.Button("Generate Voiceover", variant="primary")
+            voice_preview = gr.Audio(label="Preview", type="filepath")
+        with gr.Tab("3️⃣ Music"):
+            music_btn = gr.Button("Generate Music", variant="primary")
+            music_preview = gr.Audio(label="Preview", type="filepath")
+        with gr.Tab("4️⃣ Mix"):
             with gr.Row():
+                ducking_toggle = gr.Checkbox(True, label="Duck Music")
+                duck_level = gr.Slider(0, 20, 10, label="Duck Level (dB)")
+            mix_btn = gr.Button("Create Final Mix", variant="primary")
+            final_mix = gr.Audio(label="Final Output", type="filepath")
+    # Footer Section
     gr.Markdown("""
+    <div style="text-align: center; margin-top: 20px; padding: 15px; border-top: 1px solid #e0e0e0;">
+        <p style="font-size: 0.9em; color: #666;">
+            Created with ❤️ by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a>
+        </p>
+        <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/radiogold">
+            <img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold&countColor=%23263759"/>
+        </a>
+    </div>
     """)
+    # Event Handlers
+    generate_btn.click(
+        generate_script,
+        inputs=[concept_input, model_select, duration_select],
+        outputs=[script_output, sound_output, music_output]
+    )
+    voice_btn.click(
+        generate_voice,
+        inputs=[script_output, tts_select],
+        outputs=voice_preview
+    )
+    music_btn.click(
+        generate_music,
+        inputs=[music_output],
+        outputs=music_preview
+    )
+    mix_btn.click(
+        blend_audio,
+        inputs=[voice_preview, music_preview, ducking_toggle, duck_level],
+        outputs=final_mix
+    )
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)