bluenevus committed on
Commit
02ebd05
·
verified ·
1 Parent(s): fe225ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -104
app.py CHANGED
@@ -1,7 +1,4 @@
1
  import gradio as gr
2
- import google.generativeai as genai
3
- import numpy as np
4
- import re
5
  import torch
6
  from transformers import AutoModelForCausalLM, AutoTokenizer
7
  from huggingface_hub import snapshot_download, login
@@ -10,9 +7,7 @@ import os
10
  import spaces
11
  import warnings
12
  from snac import SNAC
13
- from dotenv import load_dotenv
14
-
15
- load_dotenv()
16
 
17
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
18
  logger = logging.getLogger(__name__)
@@ -27,6 +22,8 @@ model = None
27
  tokenizer = None
28
  snac_model = None
29
 
 
 
30
  @spaces.GPU()
31
  def load_model():
32
  global model, tokenizer, snac_model
@@ -59,40 +56,6 @@ def load_model():
59
  logger.error(f"Error loading model: {str(e)}")
60
  raise
61
 
62
- @spaces.GPU()
63
- def generate_podcast_script(api_key, content, uploaded_file, duration, num_hosts):
64
- try:
65
- genai.configure(api_key=api_key)
66
- model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
67
-
68
- combined_content = content or ""
69
- if uploaded_file:
70
- file_content = uploaded_file.read().decode('utf-8')
71
- combined_content += "\n" + file_content if combined_content else file_content
72
-
73
- prompt = f"""
74
- Create a podcast script for {'one person' if num_hosts == 1 else 'two people'} discussing:
75
- {combined_content}
76
-
77
- Duration: {duration}. Include natural speech, humor, and occasional off-topic thoughts.
78
- Use speech fillers like um, ah. Vary emotional tone.
79
-
80
- Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
81
- Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
82
-
83
- Use emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
84
-
85
- Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
86
-
87
- Ensure content flows naturally and stays on topic. Match the script length to {duration}.
88
- """
89
-
90
- response = model.generate_content(prompt)
91
- return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
92
- except Exception as e:
93
- logger.error(f"Error generating podcast script: {str(e)}")
94
- raise
95
-
96
  def process_prompt(prompt, voice, tokenizer, device):
97
  prompt = f"{voice}: {prompt}"
98
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids
@@ -155,17 +118,15 @@ def redistribute_codes(code_list, snac_model):
155
  return audio_hat.detach().squeeze().cpu().numpy()
156
 
157
  @spaces.GPU()
158
- def text_to_speech(text, voice, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=1200):
159
- global model, tokenizer, snac_model
160
- if model is None or tokenizer is None or snac_model is None:
161
- load_model()
162
-
163
  if not text.strip():
164
  return None
165
 
166
  try:
 
167
  input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
168
 
 
169
  with torch.no_grad():
170
  generated_ids = model.generate(
171
  input_ids=input_ids,
@@ -179,76 +140,75 @@ def text_to_speech(text, voice, temperature=0.6, top_p=0.95, repetition_penalty=
179
  eos_token_id=128258,
180
  )
181
 
 
182
  code_list = parse_output(generated_ids)
 
 
183
  audio_samples = redistribute_codes(code_list, snac_model)
184
 
185
  return (24000, audio_samples)
186
  except Exception as e:
187
- logger.error(f"Error in text_to_speech: {str(e)}")
188
- raise
189
-
190
- @spaces.GPU()
191
- def render_podcast(api_key, script, voice1, voice2, num_hosts):
192
- try:
193
- lines = [line for line in script.split('\n') if line.strip()]
194
- audio_segments = []
195
-
196
- for i, line in enumerate(lines):
197
- voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
198
- try:
199
- result = text_to_speech(line, voice)
200
- if result is not None:
201
- sample_rate, audio = result
202
- audio_segments.append(audio)
203
- except Exception as e:
204
- logger.error(f"Error processing audio segment: {str(e)}")
205
-
206
- if not audio_segments:
207
- logger.warning("No valid audio segments were generated.")
208
- return (24000, np.zeros(24000, dtype=np.float32))
209
-
210
- podcast_audio = np.concatenate(audio_segments)
211
- podcast_audio = np.clip(podcast_audio, -1, 1)
212
- podcast_audio = (podcast_audio * 32767).astype(np.int16)
213
-
214
- return (24000, podcast_audio)
215
- except Exception as e:
216
- logger.error(f"Error rendering podcast: {str(e)}")
217
- raise
218
 
219
- with gr.Blocks() as demo:
220
- gr.Markdown("# AI Podcast Generator")
 
 
221
 
222
- api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")
 
 
 
223
 
224
  with gr.Row():
225
- content_input = gr.Textbox( label="Paste your content (optional)", lines=8 )
226
- document_upload = gr.File(label="Upload Document (optional)")
227
-
228
- duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration", value="1-5 min")
229
- num_hosts = gr.Radio([1, 2], label="Number of podcast hosts", value=2)
230
-
231
- voice_options = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
232
- voice1_select = gr.Dropdown(label="Select Voice 1", choices=voice_options, value="tara")
233
- voice2_select = gr.Dropdown(label="Select Voice 2", choices=voice_options, value="leo")
234
-
235
- generate_btn = gr.Button("Generate Script")
236
- script_output = gr.Textbox(label="Generated Script", lines=10)
237
-
238
- render_btn = gr.Button("Render Podcast")
239
- audio_output = gr.Audio(label="Generated Podcast")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
- generate_btn.click(generate_podcast_script,
242
- inputs=[api_key_input, content_input, document_upload, duration, num_hosts],
243
- outputs=script_output)
 
 
244
 
245
- render_btn.click(render_podcast,
246
- inputs=[api_key_input, script_output, voice1_select, voice2_select, num_hosts],
247
- outputs=audio_output)
248
-
249
- num_hosts.change(lambda x: gr.update(visible=x == 2),
250
- inputs=[num_hosts],
251
- outputs=[voice2_select])
252
 
253
  if __name__ == "__main__":
254
  try:
 
1
  import gradio as gr
 
 
 
2
  import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer
4
  from huggingface_hub import snapshot_download, login
 
7
  import spaces
8
  import warnings
9
  from snac import SNAC
10
+ import numpy as np
 
 
11
 
12
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
13
  logger = logging.getLogger(__name__)
 
22
  tokenizer = None
23
  snac_model = None
24
 
25
+ EMOTIVE_TAGS = ["<laugh>", "<sigh>", "<gasp>", "<cry>", "<yawn>"]
26
+
27
  @spaces.GPU()
28
  def load_model():
29
  global model, tokenizer, snac_model
 
56
  logger.error(f"Error loading model: {str(e)}")
57
  raise
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def process_prompt(prompt, voice, tokenizer, device):
60
  prompt = f"{voice}: {prompt}"
61
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids
 
118
  return audio_hat.detach().squeeze().cpu().numpy()
119
 
120
  @spaces.GPU()
121
+ def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
 
 
 
 
122
  if not text.strip():
123
  return None
124
 
125
  try:
126
+ progress(0.1, "Processing text...")
127
  input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
128
 
129
+ progress(0.3, "Generating speech tokens...")
130
  with torch.no_grad():
131
  generated_ids = model.generate(
132
  input_ids=input_ids,
 
140
  eos_token_id=128258,
141
  )
142
 
143
+ progress(0.6, "Processing speech tokens...")
144
  code_list = parse_output(generated_ids)
145
+
146
+ progress(0.8, "Converting to audio...")
147
  audio_samples = redistribute_codes(code_list, snac_model)
148
 
149
  return (24000, audio_samples)
150
  except Exception as e:
151
+ print(f"Error generating speech: {e}")
152
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
+ with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
155
+ gr.Markdown(f"""
156
+ # 🎵 [Orpheus Text-to-Speech](https://github.com/canopyai/Orpheus-TTS)
157
+ Enter your text below and hear it converted to natural-sounding speech with the Orpheus TTS model.
158
 
159
+ ## Tips for better prompts:
160
+ - Add paralinguistic elements like {", ".join(EMOTIVE_TAGS)} or `uhm` for more human-like speech.
161
+ - Longer text prompts generally work better than very short phrases
162
+ """)
163
 
164
  with gr.Row():
165
+ with gr.Column():
166
+ text_input = gr.Textbox(
167
+ label="Text Input",
168
+ placeholder="Enter the text you want to convert to speech...",
169
+ lines=8
170
+ )
171
+ voice_select = gr.Dropdown(
172
+ choices=["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"],
173
+ value="tara",
174
+ label="Voice"
175
+ )
176
+ with gr.Accordion("Advanced Options", open=False):
177
+ temperature = gr.Slider(
178
+ minimum=0.1, maximum=1.0, value=0.6, step=0.1,
179
+ label="Temperature",
180
+ info="Higher values increase randomness in the output"
181
+ )
182
+ top_p = gr.Slider(
183
+ minimum=0.1, maximum=1.0, value=0.95, step=0.05,
184
+ label="Top-p",
185
+ info="Lower values increase determinism in the output"
186
+ )
187
+ repetition_penalty = gr.Slider(
188
+ minimum=1.0, maximum=2.0, value=1.1, step=0.1,
189
+ label="Repetition Penalty",
190
+ info="Higher values discourage repetitive patterns"
191
+ )
192
+ max_new_tokens = gr.Slider(
193
+ minimum=100, maximum=2000, value=1200, step=100,
194
+ label="Max Length",
195
+ info="Maximum length of generated audio (in tokens)"
196
+ )
197
+
198
+ with gr.Row():
199
+ submit_btn = gr.Button("Generate Speech", variant="primary")
200
+ clear_btn = gr.Button("Clear")
201
+
202
+ with gr.Column():
203
+ audio_output = gr.Audio(label="Generated Speech")
204
 
205
+ submit_btn.click(
206
+ generate_speech,
207
+ inputs=[text_input, voice_select, temperature, top_p, repetition_penalty, max_new_tokens],
208
+ outputs=audio_output
209
+ )
210
 
211
+ clear_btn.click(lambda: "", inputs=None, outputs=text_input)
 
 
 
 
 
 
212
 
213
  if __name__ == "__main__":
214
  try: