bluenevus committed
Commit 07cb903 · verified · 1 Parent(s): 2148d28

Update app.py

Files changed (1)
app.py +27 -46
app.py CHANGED
@@ -11,17 +11,13 @@ from torchaudio.functional import resample
 import threading
 import queue
 import os
-from accelerate import init_empty_weights, load_checkpoint_and_dispatch
-
-# Set up logging
 import logging
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Set up device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-# Initialize model and tokenizer
 model = None
 tokenizer = None
 
@@ -31,7 +27,6 @@ def load_model():
     print("Loading Orpheus model...")
     model_name = "canopylabs/orpheus-3b-0.1-ft"
 
-    # Get Hugging Face token from environment variable
     hf_token = os.environ.get("HUGGINGFACE_TOKEN")
     if not hf_token:
         raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")
@@ -63,7 +58,7 @@ def load_model():
     model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
     model.to(device)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    print(f"Orpheus model loaded to {device}")
+    print(f"Orpheus model and tokenizer loaded to {device}")
 
 def generate_podcast_script(api_key, content, duration, num_hosts):
     genai.configure(api_key=api_key)
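
Review note: load_model() validates HUGGINGFACE_TOKEN, but the from_pretrained calls visible in this hunk never pass it, so downloading a gated repo could still fail. A minimal sketch of wiring the token through, assuming the repo requires authentication (transformers accepts a token keyword for this):

    import os
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "canopylabs/orpheus-3b-0.1-ft"
    hf_token = os.environ.get("HUGGINGFACE_TOKEN")  # same env var load_model() checks

    # Pass the token explicitly so gated-repo downloads authenticate.
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.bfloat16, token=hf_token
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)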
@@ -96,7 +91,7 @@ def generate_podcast_script(api_key, content, duration, num_hosts):
         For example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>.
         Oh well, at least I finished the project <chuckle>."
 
-        Important: Ensure all emotion tags are properly enclosed in angle brackets < > to distinguish them from regular text."
+        Important: Ensure all emotion tags are properly enclosed in angle brackets < > to distinguish them from regular text
         """
     else:
         prompt = f"""
@@ -104,16 +99,16 @@ def generate_podcast_script(api_key, content, duration, num_hosts):
         {content}
 
         The podcast should last approximately {duration}. Include natural speech patterns,
-        humor, and occasional off-topic thoughts. Use occasional speech fillers like um, ah,
+        humor, and occasional off-topic chit-chat. Use occasional speech fillers like um, ah,
         yes, I see, Ok now. Vary the emotional tone.
 
-        Format the script as a monologue without speaker labels.
-        Separate each paragraph with a blank line.
+        Format the script as alternating lines of dialogue without speaker labels.
+        Separate each line with a blank line.
 
-        Only include the monologue with proper punctuation and emotion tags enclosed in angle brackets < >.
+        Only include the dialogue with proper punctuation and emotion tags enclosed in angle brackets < >.
         For example, use <chuckle> instead of "chuckle".
 
-        Ensure the content flows naturally and stays relevant to the topic.
+        Ensure the conversation flows naturally and stays relevant to the topic.
         Limit the script length to match the requested duration of {duration}.
 
         To use emotion tags naturally in generative AI speech, incorporate them sparingly at key moments
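
Review note: the prompt insists on angle-bracket emotion tags, and the sanitizer change in the next hunk widens the regex whitelist to include < and > so those tags survive cleaning. A quick illustration of the before/after behavior:

    import re

    raw = "I finished the project <chuckle>. *laughs*"
    print(re.sub(r'[^a-zA-Z0-9\s.,?!]', '', raw))    # old: I finished the project chuckle. laughs
    print(re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', raw))  # new: I finished the project <chuckle>. laughs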
@@ -125,51 +120,36 @@ def generate_podcast_script(api_key, content, duration, num_hosts):
         For example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>.
         Oh well, at least I finished the project <chuckle>."
 
-        Important: Ensure all emotion tags are properly enclosed in angle brackets < > to distinguish them from regular text."
+        Important: Ensure all emotion tags are properly enclosed in angle brackets < > to distinguish them from regular text
         """
 
     response = model.generate_content(prompt)
-    clean_text = re.sub(r'[^a-zA-Z0-9\s.,?!]', '', response.text)
+    clean_text = re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
     return clean_text
 
 def text_to_speech(text, voice):
     global model, tokenizer
+    if tokenizer is None or model is None:
+        raise ValueError("Model or tokenizer not initialized. Please call load_model() first.")
+
     inputs = tokenizer(text, return_tensors="pt").to(device)
     with torch.no_grad():
         output = model.generate(**inputs, max_new_tokens=256)
-    # Assuming the model outputs mel spectrograms
-    mel = output[0].cpu().numpy()  # Explicitly move to CPU for numpy conversion
-    # Convert mel spectrogram to audio (you might need to implement this conversion)
-    audio = mel_to_audio(mel)  # This function needs to be implemented
+    mel = output[0].cpu().numpy()
+    audio = mel_to_audio(mel)
     return audio
 
-def render_podcast(api_key, script, voice1, voice2, num_hosts):
-    lines = [line for line in script.split('\n') if line.strip()]
-    audio_segments = []
-
-    for i, line in enumerate(lines):
-        voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
-        audio = text_to_speech(line, voice)
-        audio_segments.append(audio)
-
-    if not audio_segments:
-        logger.warning("No valid audio segments were generated.")
-        return (24000, np.zeros(24000, dtype=np.float32))
-
-    podcast_audio = np.concatenate(audio_segments)
-    return (24000, podcast_audio)  # Assuming 24kHz sample rate
-
-# You'll need to implement this function based on the model's output
 def mel_to_audio(mel):
-    # Convert mel spectrogram to audio
-    # This will depend on the specific output of your model
-    # You might need to use a vocoder or other conversion method
-    # For now, we'll just return a placeholder
-    return np.zeros(24000, dtype=np.float32)  # 1 second of silence as placeholder
+    # Placeholder implementation
+    return np.zeros(24000, dtype=np.float32)  # 1 second of silence
 
 def process_audio_segment(line, voice, result_queue):
-    audio = text_to_speech(line, voice)
-    result_queue.put(audio)
+    try:
+        audio = text_to_speech(line, voice)
+        result_queue.put(audio)
+    except Exception as e:
+        logger.error(f"Error processing audio segment: {str(e)}")
+        result_queue.put(None)
 
 def render_podcast(api_key, script, voice1, voice2, num_hosts):
     lines = [line for line in script.split('\n') if line.strip()]
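
Review note: mel_to_audio is still a placeholder that returns one second of silence, so rendered podcasts stay silent. One classical reconstruction path is torchaudio's InverseMelScale followed by GriffinLim; the sketch below assumes the model emits (n_mels, time) power-mel spectrograms, and every parameter is hypothetical since the commit does not say how the spectrograms are computed. If the model actually emits codec tokens rather than mels, a matching codec decoder would be needed instead.

    import numpy as np
    import torch
    import torchaudio

    N_FFT, N_MELS, SAMPLE_RATE = 1024, 80, 24000  # hypothetical values

    inv_mel = torchaudio.transforms.InverseMelScale(
        n_stft=N_FFT // 2 + 1, n_mels=N_MELS, sample_rate=SAMPLE_RATE
    )
    griffin_lim = torchaudio.transforms.GriffinLim(n_fft=N_FFT, n_iter=32)

    def mel_to_audio(mel):
        # (n_mels, time) power-mel spectrogram -> mono waveform at SAMPLE_RATE
        spec = inv_mel(torch.from_numpy(mel).float())
        return griffin_lim(spec).numpy()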
@@ -187,15 +167,15 @@ def render_podcast(api_key, script, voice1, voice2, num_hosts):
         thread.join()
 
     while not result_queue.empty():
-        audio_segments.append(result_queue.get())
+        segment = result_queue.get()
+        if segment is not None:
+            audio_segments.append(segment)
 
     if not audio_segments:
         logger.warning("No valid audio segments were generated.")
         return (24000, np.zeros(24000, dtype=np.float32))
 
     podcast_audio = np.concatenate(audio_segments)
-    podcast_audio = resample(torch.from_numpy(podcast_audio), 24000, 24000).numpy()
-
     return (24000, podcast_audio)
 
 # Gradio Interface
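
Review note: draining a shared queue.Queue yields results in completion order, not script order, so if the segment threads run concurrently the podcast lines can come back shuffled. A sketch that tags each result with its line index and sorts before concatenating (names are illustrative; text_to_speech and logger are the ones defined above):

    import queue
    import threading

    def render_segments_in_order(lines, voices):
        result_queue = queue.Queue()

        def worker(idx, line, voice):
            try:
                result_queue.put((idx, text_to_speech(line, voice)))
            except Exception as e:
                logger.error(f"Segment {idx} failed: {e}")
                result_queue.put((idx, None))

        threads = [threading.Thread(target=worker, args=(i, line, voices[i % len(voices)]))
                   for i, line in enumerate(lines)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # Every worker puts exactly one (index, segment-or-None) pair.
        results = dict(result_queue.get() for _ in range(len(lines)))
        return [results[i] for i in sorted(results) if results[i] is not None]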
@@ -241,4 +221,5 @@ with gr.Blocks() as demo:
     num_hosts.change(update_second_voice_visibility, inputs=[num_hosts], outputs=[voice2_select])
 
 if __name__ == "__main__":
+    load_model()  # Ensure the model is loaded before launching the interface
     demo.launch()
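
Review note: calling load_model() only under the __main__ guard means any code path that imports app.py instead of running it directly would leave model as None until text_to_speech raises. A lazy-load guard makes either entry point safe; ensure_model_loaded below is a hypothetical helper, not part of this commit:

    def ensure_model_loaded():
        # Hypothetical helper: load once, on first use, from any entry point.
        global model, tokenizer
        if model is None or tokenizer is None:
            load_model()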
 