bluenevus committed on
Commit
851995d
·
verified ·
1 Parent(s): 7d92703

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -60
app.py CHANGED
@@ -1,90 +1,69 @@
1
- # app.py
2
-
3
  import gradio as gr
4
- import torch
5
- import torchaudio
6
  import google.generativeai as genai
7
- from e2_tts_pytorch import E2TTS, DurationPredictor
8
  import numpy as np
9
- import os
10
- import requests
11
- from tqdm import tqdm
 
 
 
 
12
 
13
- # (Keep the model loading and initialization code as before)
 
14
 
15
def generate_podcast_script(api_key, content, duration):
    """Ask Gemini to write a two-host podcast script about ``content``.

    Args:
        api_key: Gemini API key used to configure the client for this call.
        content: Source material the hosts should discuss.
        duration: Human-readable target length, interpolated into the prompt.

    Returns:
        The text of the model response; the prompt asks for
        ``Host N: [emotion] Dialog`` lines.
    """
    # The prompt only depends on the arguments, so build it up front.
    request_text = f"""
    Create a podcast script for two people discussing the following content:
    {content}

    The podcast should last approximately {duration}. Include natural speech patterns,
    humor, and occasional off-topic chit-chat. Use speech fillers like "um", "ah",
    "yes", "I see", "Ok now". Vary the emotional tone (e.g., regular, happy, sad, surprised)
    and indicate these in [square brackets]. Format the script as follows:

    Host 1: [emotion] Dialog
    Host 2: [emotion] Dialog

    Ensure the conversation flows naturally and stays relevant to the topic.
    """

    genai.configure(api_key=api_key)
    gemini = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
    return gemini.generate_content(request_text).text
35
 
36
def text_to_speech(text, speaker_id):
    """Sample audio for ``text`` from the module-level ``e2tts`` model.

    Args:
        text: The sentence to synthesize.
        speaker_id: Accepted for the caller's benefit; not used by this body.

    Returns:
        A NumPy array of samples clipped to the range [-1, 1].
    """
    # Placeholder conditioning: a random tensor stands in for the mel
    # spectrogram that a real cloned voice would provide.
    conditioning = torch.randn(1, 80, 100)

    with torch.no_grad():
        generated = e2tts.sample(conditioning[:, :5], text=[text])

    waveform = generated.cpu().numpy().squeeze()

    # Report (but do not reject) degenerate output; the caller filters it.
    if not np.any(waveform):
        print(f"Warning: Generated audio for '{text}' is all zeros.")
    elif not np.all(np.isfinite(waveform)):
        print(f"Warning: Generated audio for '{text}' contains NaN or Inf values.")

    # Hard-limit the samples to [-1, 1].
    return np.clip(waveform, -1, 1)
57
-
58
def create_podcast(api_key, content, duration, voice1, voice2):
    """Generate a script for ``content`` and render it to audio in one step.

    Returns whatever ``render_podcast`` returns for the generated script.
    """
    return render_podcast(
        api_key,
        generate_podcast_script(api_key, content, duration),
        voice1,
        voice2,
    )
61
 
62
def gradio_interface(api_key, content, duration, voice1, voice2):
    """Click handler for the script button: return the generated script text.

    ``voice1`` and ``voice2`` are accepted because the button passes them,
    but script generation does not use them.
    """
    return generate_podcast_script(api_key, content, duration)
65
-
66
def render_podcast(api_key, script, voice1, voice2):
    """Turn a ``Host 1:`` / ``Host 2:`` script into one int16 audio track.

    Args:
        api_key: Unused here; part of the signature the UI wiring passes.
        script: Newline-separated script text.
        voice1: Unused by this body.
        voice2: Unused by this body.

    Returns:
        ``(22050, samples)``. ``samples`` is one second of zeros when no
        usable segment was produced, otherwise the concatenated segments
        scaled to int16.
    """
    # Both speaker tags are exactly seven characters long.
    speaker_ids = {"Host 1:": 0, "Host 2:": 1}
    clips = []

    for entry in script.split('\n'):
        tag = entry[:7]
        if tag not in speaker_ids:
            continue
        clip = text_to_speech(entry[7:], speaker_id=speaker_ids[tag])
        # Keep only clips that are non-silent and free of NaN/Inf.
        if np.any(clip) and np.all(np.isfinite(clip)):
            clips.append(clip)

    if not clips:
        print("Warning: No valid audio segments were generated.")
        return (22050, np.zeros(22050))  # Return silence if no valid audio was generated

    # Join the clips, then scale [-1, 1] floats into the int16 range.
    joined = np.concatenate(clips)
    pcm = (np.clip(joined, -1, 1) * 32767).astype(np.int16)

    return (22050, pcm)  # Assuming 22050 Hz sample rate
88
 
89
  # Gradio Interface
90
  with gr.Blocks() as demo:
@@ -99,8 +78,8 @@ with gr.Blocks() as demo:
99
  duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")
100
 
101
  with gr.Row():
102
- voice1_upload = gr.Audio(label="Upload Voice 1", type="filepath")
103
- voice2_upload = gr.Audio(label="Upload Voice 2", type="filepath")
104
 
105
  generate_btn = gr.Button("Generate Script")
106
  script_output = gr.Textbox(label="Generated Script", lines=10)
@@ -108,7 +87,13 @@ with gr.Blocks() as demo:
108
  render_btn = gr.Button("Render Podcast")
109
  audio_output = gr.Audio(label="Generated Podcast")
110
 
111
- generate_btn.click(gradio_interface, inputs=[api_key_input, content_input, duration, voice1_upload, voice2_upload], outputs=script_output)
112
- render_btn.click(render_podcast, inputs=[api_key_input, script_output, voice1_upload, voice2_upload], outputs=audio_output)
 
 
 
 
 
 
113
 
114
  demo.launch()
 
 
 
1
  import gradio as gr
 
 
2
  import google.generativeai as genai
 
3
  import numpy as np
4
+ import edge_tts
5
+ import asyncio
6
+
7
+ # Set up logging
8
+ import logging
9
+ logging.basicConfig(level=logging.INFO)
10
+ logger = logging.getLogger(__name__)
11
 
12
+ # Initialize Gemini AI
13
+ genai.configure(api_key='YOUR_GEMINI_API_KEY')
14
 
15
def generate_podcast_script(api_key, content, duration):
    """Ask Gemini to write an alternating Host 1 / Host 2 podcast script.

    Args:
        api_key: Gemini API key used to configure the client for this call.
        content: Source material the hosts should discuss.
        duration: Human-readable target length, interpolated into the prompt.

    Returns:
        The text of the model response; the prompt asks for plain
        ``Host 1: ...`` / ``Host 2: ...`` lines with no other formatting.
    """
    # The prompt only depends on the arguments, so build it up front.
    request_text = f"""
    Create a podcast script for two people (Host 1 and Host 2) discussing the following content:
    {content}

    The podcast should last approximately {duration}. Include natural speech patterns,
    humor, and occasional off-topic chit-chat. Use speech fillers like "um", "ah",
    "yes", "I see", "Ok now". Vary the emotional tone.

    Format the script as follows, with each line representing a single speaker's dialogue:
    Host 1: Dialog
    Host 2: Dialog
    Host 1: Dialog
    Host 2: Dialog

    Do not include any other text, markdown, or formatting. Only include the alternating dialogue lines.
    Ensure the conversation flows naturally and stays relevant to the topic.
    """

    genai.configure(api_key=api_key)
    gemini = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
    return gemini.generate_content(request_text).text
38
 
39
async def text_to_speech(text, voice):
    """Synthesize ``text`` with edge-tts and return the encoded audio bytes.

    ``edge_tts.Communicate`` has no ``to_wav()`` method; audio is delivered
    through ``stream()`` as chunks of encoded audio (MP3 by default), mixed
    with metadata chunks that carry no audio.

    Args:
        text: The sentence to speak.
        voice: An edge-tts voice short name (e.g. ``"en-US-AriaNeural"``).

    Returns:
        The concatenated audio chunks as ``bytes`` (encoded, not raw PCM).
    """
    communicate = edge_tts.Communicate(text, voice)
    chunks = []
    async for chunk in communicate.stream():
        # Skip word/sentence boundary metadata; keep only audio payloads.
        if chunk["type"] == "audio":
            chunks.append(chunk["data"])
    return b"".join(chunks)


async def render_podcast(api_key, script, voice1, voice2):
    """Render a ``Host 1:`` / ``Host 2:`` script to a single audio track.

    Args:
        api_key: Unused here; part of the signature the UI wiring passes.
        script: Newline-separated script text.
        voice1: edge-tts voice used for ``Host 1:`` lines.
        voice2: edge-tts voice used for ``Host 2:`` lines.

    Returns:
        The path of a temporary ``.mp3`` file holding the whole podcast, or
        ``(24000, samples)`` with one second of int16 silence when the
        script produced no audio. Both forms are valid values for a
        ``gr.Audio`` output.
    """
    import tempfile

    audio_segments = []

    for raw_line in (script or "").split('\n'):
        line = raw_line.strip()
        if line.startswith("Host 1:"):
            voice = voice1
        elif line.startswith("Host 2:"):
            voice = voice2
        else:
            continue

        # Both speaker tags are seven characters long; drop the tag and any
        # padding so an empty line is never sent to the TTS service.
        text = line[7:].strip()
        if not text:
            continue

        audio = await text_to_speech(text, voice)
        if audio:
            audio_segments.append(audio)

    if not audio_segments:
        logger.warning("No valid audio segments were generated.")
        return (24000, np.zeros(24000, dtype=np.int16))  # Return silence if no valid audio was generated

    # The segments are encoded MP3 streams, so they cannot be reinterpreted
    # as int16 samples with np.frombuffer. Write them back to back into one
    # file and hand the path to Gradio instead.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as out_file:
        out_file.write(b''.join(audio_segments))

    return out_file.name
67
 
68
  # Gradio Interface
69
  with gr.Blocks() as demo:
 
78
  duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")
79
 
80
  with gr.Row():
81
+ voice1_select = gr.Dropdown(label="Select Voice 1", choices=edge_tts.list_voices())
82
+ voice2_select = gr.Dropdown(label="Select Voice 2", choices=edge_tts.list_voices())
83
 
84
  generate_btn = gr.Button("Generate Script")
85
  script_output = gr.Textbox(label="Generated Script", lines=10)
 
87
  render_btn = gr.Button("Render Podcast")
88
  audio_output = gr.Audio(label="Generated Podcast")
89
 
90
+ def generate_script_wrapper(api_key, content, duration):
91
+ return generate_podcast_script(api_key, content, duration)
92
+
93
+ async def render_podcast_wrapper(api_key, script, voice1, voice2):
94
+ return await render_podcast(api_key, script, voice1, voice2)
95
+
96
+ generate_btn.click(generate_script_wrapper, inputs=[api_key_input, content_input, duration], outputs=script_output)
97
+ render_btn.click(render_podcast_wrapper, inputs=[api_key_input, script_output, voice1_select, voice2_select], outputs=audio_output)
98
 
99
  demo.launch()