Update app.py
app.py CHANGED
@@ -11,17 +11,13 @@ from torchaudio.functional import resample
 import threading
 import queue
 import os
-from accelerate import init_empty_weights, load_checkpoint_and_dispatch
-
-# Set up logging
 import logging
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Set up device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-# Initialize model and tokenizer
 model = None
 tokenizer = None
 
@@ -31,7 +27,6 @@ def load_model():
     print("Loading Orpheus model...")
     model_name = "canopylabs/orpheus-3b-0.1-ft"
 
-    # Get Hugging Face token from environment variable
     hf_token = os.environ.get("HUGGINGFACE_TOKEN")
     if not hf_token:
         raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")
@@ -63,7 +58,7 @@ def load_model():
     model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
     model.to(device)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    print(f"Orpheus model loaded to {device}")
+    print(f"Orpheus model and tokenizer loaded to {device}")
 
 def generate_podcast_script(api_key, content, duration, num_hosts):
     genai.configure(api_key=api_key)
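
Note: the hunk above validates HUGGINGFACE_TOKEN, but the from_pretrained calls shown do not pass it on; the lines hidden between the hunks may already handle authentication. If they do not, a minimal sketch of passing the token explicitly (the token keyword argument is standard transformers usage, and is only needed when the model repo is gated or private):

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        token=hf_token,  # assumption: only required for gated/private repos
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
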
@@ -96,7 +91,7 @@ def generate_podcast_script(api_key, content, duration, num_hosts):
     For example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>.
     Oh well, at least I finished the project <chuckle>."
 
-    Important: Ensure all emotion tags are properly enclosed in angle brackets < > to distinguish them from regular text
+    Important: Ensure all emotion tags are properly enclosed in angle brackets < > to distinguish them from regular text
     """
     else:
         prompt = f"""
@@ -104,16 +99,16 @@ def generate_podcast_script(api_key, content, duration, num_hosts):
     {content}
 
     The podcast should last approximately {duration}. Include natural speech patterns,
-    humor, and occasional off-topic
+    humor, and occasional off-topic chit-chat. Use occasional speech fillers like um, ah,
     yes, I see, Ok now. Vary the emotional tone.
 
-    Format the script as
-    Separate each
+    Format the script as alternating lines of dialogue without speaker labels.
+    Separate each line with a blank line.
 
-    Only include the
+    Only include the dialogue with proper punctuation and emotion tags enclosed in angle brackets < >.
     For example, use <chuckle> instead of "chuckle".
 
-    Ensure the
+    Ensure the conversation flows naturally and stays relevant to the topic.
     Limit the script length to match the requested duration of {duration}.
 
     To use emotion tags naturally in generative AI speech, incorporate them sparingly at key moments
@@ -125,51 +120,36 @@ def generate_podcast_script(api_key, content, duration, num_hosts):
     For example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>.
     Oh well, at least I finished the project <chuckle>."
 
-    Important: Ensure all emotion tags are properly enclosed in angle brackets < > to distinguish them from regular text
+    Important: Ensure all emotion tags are properly enclosed in angle brackets < > to distinguish them from regular text
     """
 
     response = model.generate_content(prompt)
-    clean_text = re.sub(r'[^a-zA-Z0-9\s
+    clean_text = re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
     return clean_text
 
 def text_to_speech(text, voice):
     global model, tokenizer
+    if tokenizer is None or model is None:
+        raise ValueError("Model or tokenizer not initialized. Please call load_model() first.")
+
     inputs = tokenizer(text, return_tensors="pt").to(device)
     with torch.no_grad():
         output = model.generate(**inputs, max_new_tokens=256)
-
-
-    # Convert mel spectrogram to audio (you might need to implement this conversion)
-    audio = mel_to_audio(mel) # This function needs to be implemented
+    mel = output[0].cpu().numpy()
+    audio = mel_to_audio(mel)
     return audio
 
-def render_podcast(api_key, script, voice1, voice2, num_hosts):
-    lines = [line for line in script.split('\n') if line.strip()]
-    audio_segments = []
-
-    for i, line in enumerate(lines):
-        voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
-        audio = text_to_speech(line, voice)
-        audio_segments.append(audio)
-
-    if not audio_segments:
-        logger.warning("No valid audio segments were generated.")
-        return (24000, np.zeros(24000, dtype=np.float32))
-
-    podcast_audio = np.concatenate(audio_segments)
-    return (24000, podcast_audio) # Assuming 24kHz sample rate
-
-# You'll need to implement this function based on the model's output
 def mel_to_audio(mel):
-    #
-
-    # You might need to use a vocoder or other conversion method
-    # For now, we'll just return a placeholder
-    return np.zeros(24000, dtype=np.float32) # 1 second of silence as placeholder
+    # Placeholder implementation
+    return np.zeros(24000, dtype=np.float32) # 1 second of silence
 
 def process_audio_segment(line, voice, result_queue):
-    audio = text_to_speech(line, voice)
-    result_queue.put(audio)
+    try:
+        audio = text_to_speech(line, voice)
+        result_queue.put(audio)
+    except Exception as e:
+        logger.error(f"Error processing audio segment: {str(e)}")
+        result_queue.put(None)
 
 def render_podcast(api_key, script, voice1, voice2, num_hosts):
     lines = [line for line in script.split('\n') if line.strip()]
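
Note: the new sanitizer keeps letters, digits, whitespace, the punctuation .,?! and the angle brackets that delimit emotion tags; everything else, including apostrophes, colons, and hyphens, is stripped. A quick illustration of its behavior:

    import re
    text = "Great point! <chuckle> Let's recap: 50/50 odds..."
    print(re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', text))
    # prints: Great point! <chuckle> Lets recap 5050 odds...

Also note that mel_to_audio is still a placeholder returning one second of silence, so rendered podcasts stay silent, and output[0] from a causal language model's generate() is a sequence of token ids rather than a mel spectrogram. If the intermediate representation really were a mel magnitude spectrogram, one classical (non-neural) conversion is InverseMelScale followed by Griffin-Lim. The sketch below is an assumption, not this repo's method: the n_fft/n_mels/sample-rate values would have to match however the spectrogram was produced, and a neural vocoder would sound considerably better:

    import torch
    import torchaudio.transforms as T

    def mel_to_audio(mel, sample_rate=24000, n_fft=1024, n_mels=80):
        # expected input shape: (n_mels, frames), linear (not dB) magnitudes
        mel_t = torch.as_tensor(mel, dtype=torch.float32)
        spec = T.InverseMelScale(n_stft=n_fft // 2 + 1, n_mels=n_mels,
                                 sample_rate=sample_rate)(mel_t)
        return T.GriffinLim(n_fft=n_fft)(spec).numpy()
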
@@ -187,15 +167,15 @@ def render_podcast(api_key, script, voice1, voice2, num_hosts):
         thread.join()
 
     while not result_queue.empty():
-        audio_segments.append(result_queue.get())
+        segment = result_queue.get()
+        if segment is not None:
+            audio_segments.append(segment)
 
     if not audio_segments:
         logger.warning("No valid audio segments were generated.")
         return (24000, np.zeros(24000, dtype=np.float32))
 
     podcast_audio = np.concatenate(audio_segments)
-    podcast_audio = resample(torch.from_numpy(podcast_audio), 24000, 24000).numpy()
-
     return (24000, podcast_audio)
 
 # Gradio Interface
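
Note: this hunk shows only the join-and-drain side of the worker pattern; the spawning side is hidden between the hunks. A sketch consistent with process_audio_segment's signature and the context lines above (an assumption, not part of this commit):

    threads = []
    result_queue = queue.Queue()
    for i, line in enumerate(lines):
        voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
        thread = threading.Thread(target=process_audio_segment,
                                  args=(line, voice, result_queue))
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

One caveat: draining a single shared queue after concurrent puts does not preserve script order; putting (i, audio) tuples and sorting on i before concatenation would.
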
@@ -241,4 +221,5 @@ with gr.Blocks() as demo:
     num_hosts.change(update_second_voice_visibility, inputs=[num_hosts], outputs=[voice2_select])
 
 if __name__ == "__main__":
+    load_model() # Ensure the model is loaded before launching the interface
     demo.launch()
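
Note: eagerly calling load_model() at startup front-loads the download and GPU cost before the UI is reachable. Combined with the new None-guard in text_to_speech, a lazy alternative (hypothetical, not in this commit) could defer loading to the first request:

    def ensure_model():
        # Load on first use; relies on the module-level model/tokenizer
        # globals that are initialized to None.
        if model is None or tokenizer is None:
            load_model()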