Spaces:

MicroHealth
/

ai-podcast-builder

Paused

App Files Community

bluenevus committed on Apr 12

Commit

851995d

verified ·

1 Parent(s): 7d92703

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -60

app.py CHANGED Viewed

@@ -1,90 +1,69 @@
-# app.py
 import gradio as gr
-import torch
-import torchaudio
 import google.generativeai as genai
-from e2_tts_pytorch import E2TTS, DurationPredictor
 import numpy as np
-import os
-import requests
-from tqdm import tqdm
-# (Keep the model loading and initialization code as before)
 def generate_podcast_script(api_key, content, duration):
     genai.configure(api_key=api_key)
     model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
     prompt = f"""
-    Create a podcast script for two people discussing the following content:
     {content}
     The podcast should last approximately {duration}. Include natural speech patterns,
     humor, and occasional off-topic chit-chat. Use speech fillers like "um", "ah",
-    "yes", "I see", "Ok now". Vary the emotional tone (e.g., regular, happy, sad, surprised)
-    and indicate these in [square brackets]. Format the script as follows:
-    Host 1: [emotion] Dialog
-    Host 2: [emotion] Dialog
     Ensure the conversation flows naturally and stays relevant to the topic.
     """
     response = model.generate_content(prompt)
     return response.text
-def text_to_speech(text, speaker_id):
-    # For simplicity, we'll use a random mel spectrogram as input
-    # In a real scenario, you'd use the actual mel spectrogram from the cloned voice
-    mel = torch.randn(1, 80, 100)
-    # Generate speech
-    with torch.no_grad():
-        sampled = e2tts.sample(mel[:, :5], text=[text])
-    audio = sampled.cpu().numpy().squeeze()
-    # Check if audio contains any non-zero values
-    if np.all(audio == 0):
-        print(f"Warning: Generated audio for '{text}' is all zeros.")
-    elif np.any(np.isnan(audio)) or np.any(np.isinf(audio)):
-        print(f"Warning: Generated audio for '{text}' contains NaN or Inf values.")
-    # Normalize audio to [-1, 1] range
-    audio = np.clip(audio, -1, 1)
-    return audio
-def create_podcast(api_key, content, duration, voice1, voice2):
-    script = generate_podcast_script(api_key, content, duration)
-    return render_podcast(api_key, script, voice1, voice2)
-def gradio_interface(api_key, content, duration, voice1, voice2):
-    script = generate_podcast_script(api_key, content, duration)
-    return script
-def render_podcast(api_key, script, voice1, voice2):
     lines = script.split('\n')
     audio_segments = []
     for line in lines:
-        if line.startswith("Host 1:") or line.startswith("Host 2:"):
-            audio = text_to_speech(line[7:], speaker_id=0 if line.startswith("Host 1:") else 1)
-            if not np.all(audio == 0) and not np.any(np.isnan(audio)) and not np.any(np.isinf(audio)):
-                audio_segments.append(audio)
     if not audio_segments:
-        print("Warning: No valid audio segments were generated.")
-        return (22050, np.zeros(22050))  # Return silence if no valid audio was generated
     # Concatenate audio segments
-    podcast_audio = np.concatenate(audio_segments)
-    # Ensure audio is in the correct range for int16
-    podcast_audio = np.clip(podcast_audio, -1, 1) * 32767
-    podcast_audio = podcast_audio.astype(np.int16)
-    return (22050, podcast_audio)  # Assuming 22050 Hz sample rate
 # Gradio Interface
 with gr.Blocks() as demo:
@@ -99,8 +78,8 @@ with gr.Blocks() as demo:
     duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")
     with gr.Row():
-        voice1_upload = gr.Audio(label="Upload Voice 1", type="filepath")
-        voice2_upload = gr.Audio(label="Upload Voice 2", type="filepath")
     generate_btn = gr.Button("Generate Script")
     script_output = gr.Textbox(label="Generated Script", lines=10)
@@ -108,7 +87,13 @@ with gr.Blocks() as demo:
     render_btn = gr.Button("Render Podcast")
     audio_output = gr.Audio(label="Generated Podcast")
-    generate_btn.click(gradio_interface, inputs=[api_key_input, content_input, duration, voice1_upload, voice2_upload], outputs=script_output)
-    render_btn.click(render_podcast, inputs=[api_key_input, script_output, voice1_upload, voice2_upload], outputs=audio_output)
 demo.launch()

 import gradio as gr
 import google.generativeai as genai
 import numpy as np
+import edge_tts
+import asyncio
+# Set up logging
+import logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Gemini is configured per request inside generate_podcast_script, using the API
+# key supplied by the caller. No placeholder key is registered at import time:
+# a literal such as 'YOUR_GEMINI_API_KEY' can never authenticate and would only
+# hide a missing-key error until the first request.
 def generate_podcast_script(api_key, content, duration):
+    """Ask Gemini to write a two-host podcast script about ``content``.
+
+    Args:
+        api_key: Gemini API key; the client is (re)configured with it on every call.
+        content: Source material that is embedded verbatim in the prompt.
+        duration: Target length label (e.g. "1-5 min") embedded in the prompt.
+
+    Returns:
+        The ``text`` of the model response. The prompt asks for alternating
+        ``Host 1:`` / ``Host 2:`` lines, but the reply is returned unvalidated.
+    """
     genai.configure(api_key=api_key)
+    script_request = f"""
+    Create a podcast script for two people (Host 1 and Host 2) discussing the following content:
     {content}
     The podcast should last approximately {duration}. Include natural speech patterns,
     humor, and occasional off-topic chit-chat. Use speech fillers like "um", "ah",
+    "yes", "I see", "Ok now". Vary the emotional tone.
+    Format the script as follows, with each line representing a single speaker's dialogue:
+    Host 1: Dialog
+    Host 2: Dialog
+    Host 1: Dialog
+    Host 2: Dialog
+    Do not include any other text, markdown, or formatting. Only include the alternating dialogue lines.
     Ensure the conversation flows naturally and stays relevant to the topic.
     """
+    gemini = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
+    return gemini.generate_content(script_request).text
+async def text_to_speech(text, voice):
+    """Synthesize ``text`` with the edge-tts ``voice`` and return the audio bytes.
+
+    edge-tts does not provide ``Communicate.to_wav()``. Audio is delivered by the
+    ``stream()`` async generator as chunks whose ``"type"`` is ``"audio"``,
+    interleaved with metadata chunks that are skipped here.
+
+    NOTE(review): the service's default output is encoded MP3, not raw PCM/WAV;
+    confirm against the installed edge-tts version.
+
+    Args:
+        text: Dialogue to speak.
+        voice: edge-tts voice short name (e.g. "en-US-GuyNeural").
+
+    Returns:
+        The concatenated audio payload as ``bytes`` (empty if nothing was produced).
+    """
+    communicate = edge_tts.Communicate(text, voice)
+    audio_chunks = []
+    async for chunk in communicate.stream():
+        if chunk["type"] == "audio":
+            audio_chunks.append(chunk["data"])
+    return b''.join(audio_chunks)
+
+
+async def render_podcast(api_key, script, voice1, voice2):
+    """Render a ``Host 1:`` / ``Host 2:`` script to audio for the Gradio player.
+
+    Args:
+        api_key: Unused here; kept so the Gradio wiring and callers stay unchanged.
+        script: Script text with one speaker line per row.
+        voice1: edge-tts voice used for ``Host 1:`` lines.
+        voice2: edge-tts voice used for ``Host 2:`` lines.
+
+    Returns:
+        Path of a temporary ``.mp3`` file holding all segments in script order
+        (``gr.Audio`` accepts a file path as an output value), or the tuple
+        ``(24000, zeros)`` (one second of silence) when no line produced audio.
+    """
+    import tempfile
+
+    voices = {"Host 1:": voice1, "Host 2:": voice2}
     audio_segments = []
+    for raw_line in (script or "").splitlines():
+        # Tolerate indentation/trailing whitespace around the speaker prefix.
+        line = raw_line.strip()
+        for prefix, voice in voices.items():
+            if not line.startswith(prefix):
+                continue
+            dialogue = line[len(prefix):].strip()
+            # Skip speaker lines with no dialogue instead of sending empty text to the service.
+            if dialogue:
+                audio = await text_to_speech(dialogue, voice)
+                if audio:
+                    audio_segments.append(audio)
+            break
     if not audio_segments:
+        logger.warning("No valid audio segments were generated.")
+        return (24000, np.zeros(24000, dtype=np.int16))  # Return silence if no valid audio was generated
+    # The segments are encoded audio, so they cannot be reinterpreted as int16 PCM
+    # with np.frombuffer. Write the joined stream to a file and let Gradio decode it.
+    # NOTE(review): the file is not removed here (delete=False); clean-up is left to the host.
+    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as podcast_file:
+        podcast_file.write(b''.join(audio_segments))
+    return podcast_file.name
 # Gradio Interface
 with gr.Blocks() as demo:
     duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")
+    # edge_tts.list_voices() is a coroutine function that returns voice dicts, so it
+    # must be run to completion before use. It is fetched once, reduced to the
+    # ShortName strings that edge_tts.Communicate expects, and shared by both dropdowns.
+    voice_names = sorted(voice["ShortName"] for voice in asyncio.run(edge_tts.list_voices()))
     with gr.Row():
+        voice1_select = gr.Dropdown(label="Select Voice 1", choices=voice_names)
+        voice2_select = gr.Dropdown(label="Select Voice 2", choices=voice_names)
     generate_btn = gr.Button("Generate Script")
     script_output = gr.Textbox(label="Generated Script", lines=10)
     render_btn = gr.Button("Render Podcast")
     audio_output = gr.Audio(label="Generated Podcast")
+    def generate_script_wrapper(api_key, content, duration):
+        """Click handler for "Generate Script"; forwards to generate_podcast_script."""
+        return generate_podcast_script(api_key, content, duration)
+    async def render_podcast_wrapper(api_key, script, voice1, voice2):
+        """Click handler for "Render Podcast"; awaits render_podcast and returns its result."""
+        return await render_podcast(api_key, script, voice1, voice2)
+    generate_btn.click(generate_script_wrapper, inputs=[api_key_input, content_input, duration], outputs=script_output)
+    render_btn.click(render_podcast_wrapper, inputs=[api_key_input, script_output, voice1_select, voice2_select], outputs=audio_output)
 demo.launch()