# Spaces: MicroHealth / ai-podcast-builder (Paused)
# File size: 8,360 Bytes

import io
import queue
import re
import threading
from concurrent.futures import ThreadPoolExecutor

import google.generativeai as genai
import gradio as gr
import numpy as np
import torch
import torchaudio
from huggingface_hub import snapshot_download, login
from torchaudio.functional import resample
from transformers import AutoModelForCausalLM, AutoTokenizer

# Logging: INFO-level basic configuration plus a logger named after this module.
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Compute device: CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Hugging Face repository id of the Orpheus checkpoint loaded by load_model().
model_name = "canopylabs/orpheus-3b-0.1-ft"

def load_model(hf_token):
    """Authenticate with Hugging Face and load the Orpheus model and tokenizer.

    Args:
        hf_token: Hugging Face access token. It is used to log in and to
            download the checkpoint named by ``model_name``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is loaded in bfloat16 and
        moved to ``device``.

    Raises:
        ValueError: If ``hf_token`` is ``None``, empty, or only whitespace.
    """
    if not hf_token or not hf_token.strip():
        # An empty value cannot authenticate; reject it here with a clear
        # message instead of letting the Hub calls fail (or prompt) later.
        raise ValueError("A Hugging Face API token is required to load the model.")
    # Tokens pasted into a text field often carry stray whitespace.
    hf_token = hf_token.strip()

    login(token=hf_token)

    logger.info("Loading Orpheus model...")
    # Pre-fetch the config and weight shards only. ``token`` replaces the
    # deprecated ``use_auth_token`` keyword of ``snapshot_download``.
    snapshot_download(
        repo_id=model_name,
        token=hf_token,
        allow_patterns=[
            "config.json",
            "*.safetensors",
            "model.safetensors.index.json",
        ],
        ignore_patterns=[
            "optimizer.pt",
            "pytorch_model.bin",
            "training_args.bin",
            "scheduler.pt",
            "tokenizer.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "vocab.json",
            "merges.txt",
            "tokenizer.*"
        ]
    )

    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
    model.to(device)
    # The tokenizer files were excluded from the snapshot above, so this call
    # resolves them on its own.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    logger.info("Orpheus model loaded to %s", device)
    return model, tokenizer

# Orpheus model and tokenizer placeholders; both stay None until the model
# has been loaded with a Hugging Face token.
model, tokenizer = None, None

# Characters removed from the generated script. Everything is stripped except
# ASCII letters, digits, whitespace, basic punctuation, apostrophes (so
# contractions such as "can't" survive) and angle brackets (so the emotion
# tags requested in the prompt, e.g. <laugh>, reach the speech model intact).
_SCRIPT_DISALLOWED_CHARS = re.compile(r"[^a-zA-Z0-9\s.,?!'<>]")


def generate_podcast_script(api_key, content, duration, num_hosts):
    """Ask Gemini for a podcast script about ``content`` and sanitize it.

    Args:
        api_key: Gemini API key passed to ``genai.configure``.
        content: Source material the podcast should discuss.
        duration: Human-readable target length that is interpolated into the
            prompt (for example ``"1-5 min"``).
        num_hosts: ``1`` produces a monologue prompt; any other value
            produces the two-host dialogue prompt.

    Returns:
        The response text with every character removed except ASCII letters,
        digits, whitespace, ``.,?!``, apostrophes and angle brackets.
    """
    genai.configure(api_key=api_key)
    # Named ``gemini_model`` so it is not confused with the module-level
    # Orpheus ``model``.
    gemini_model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
    
    if num_hosts == 1:
        prompt = f"""
        Create a podcast script for one person discussing the following content:
        {content}
        
        The podcast should last approximately {duration}. Include natural speech patterns,
        humor, and occasional off-topic thoughts. Use occasional speech fillers like um, ah,
        yes, I see, Ok now. Vary the emotional tone.
        Format the script as a monologue without speaker labels.
        Separate each paragraph with a blank line.
        Do not use any special characters or markdown. Only include the monologue with proper punctuation.
        Ensure the content flows naturally and stays relevant to the topic.
        Limit the script length to match the requested duration of {duration}.
        Use emotion tags naturally in generative AI speech, incorporate them sparingly at key moments to enhance the dialogue's emotional context. 
        Place tags like <laugh> for joy, <sigh> for frustration or relief, <chuckle> for mild amusement, <cough> or <sniffle> for discomfort, <groan> for displeasure, <yawn> for tiredness, and <gasp> for surprise. 
        For example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>. Oh well, at least I finished the project <chuckle>." 
        Remember, use tags judiciously to maintain a natural flow of conversation.
        """
    else:
        prompt = f"""
        Create a podcast script for two people discussing the following content:
        {content}
        
        The podcast should last approximately {duration}. Include natural speech patterns,
        humor, and occasional off-topic chit-chat. Use occasional speech fillers like um, ah,
        yes, I see, Ok now. Vary the emotional tone.
        Format the script as alternating lines of dialogue without speaker labels.
        Separate each line with a blank line.
        Do not use any special characters or markdown. Only include the alternating dialogue lines with proper punctuation.
        Ensure the conversation flows naturally and stays relevant to the topic.
        Limit the script length to match the requested duration of {duration}.
        Use emotion tags naturally in generative AI speech, incorporate them sparingly at key moments to enhance the dialogue's emotional context. 
        Place tags like <laugh> for joy, <sigh> for frustration or relief, <chuckle> for mild amusement, <cough> or <sniffle> for discomfort, <groan> for displeasure, <yawn> for tiredness, and <gasp> for surprise. 
        For example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>. Oh well, at least I finished the project <chuckle>." 
        Remember, use tags judiciously to maintain a natural flow of conversation.
        """
    
    response = gemini_model.generate_content(prompt)
    # Fold the typographic apostrophe into the ASCII one before filtering so
    # contractions are kept rather than glued together.
    raw_text = response.text.replace("\u2019", "'")
    clean_text = _SCRIPT_DISALLOWED_CHARS.sub('', raw_text)
    return clean_text

def text_to_speech(text, voice):
    """Synthesize audio for one piece of text with the loaded Orpheus model.

    Args:
        text: Text to synthesize.
        voice: Voice label selected by the caller. It is accepted for
            interface compatibility but is not used when building the model
            input.

    Returns:
        The NumPy array obtained from ``output.audio``.

    Raises:
        RuntimeError: If the module-level ``model`` or ``tokenizer`` has not
            been loaded yet.
    """
    # Both globals start out as None; calling them would otherwise fail with
    # an opaque "'NoneType' object is not callable".
    if model is None or tokenizer is None:
        raise RuntimeError(
            "Orpheus model is not loaded. Load the model before rendering audio."
        )
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=256)
    # NOTE(review): ``generate`` on a causal LM normally returns token ids,
    # which have no ``.audio`` attribute; presumably a separate audio decoding
    # step is required here - confirm against the Orpheus inference code.
    audio = output.audio.cpu().numpy()
    return audio

def process_audio_segment(line, voice, result_queue):
    """Synthesize one script line and enqueue the resulting audio.

    Args:
        line: Script line to synthesize.
        voice: Voice label forwarded to ``text_to_speech``.
        result_queue: Queue that receives the synthesized audio via ``put``.
    """
    result_queue.put(text_to_speech(line, voice))

# Upper bound on simultaneous synthesis calls. Every worker shares the single
# loaded model, so one thread per script line is not sustainable for long
# scripts.
_MAX_TTS_WORKERS = 4

# Sample rate (Hz) returned alongside the rendered audio.
_PODCAST_SAMPLE_RATE = 24000


def render_podcast(api_key, script, voice1, voice2, num_hosts):
    """Render a script to a single audio track, one segment per non-blank line.

    Args:
        api_key: Unused; kept so existing callers keep working.
        script: Script text. Blank lines are ignored.
        voice1: Voice for every line when ``num_hosts == 1``, otherwise for
            the even-indexed lines (0, 2, ...).
        voice2: Voice for the odd-indexed lines when there are two hosts.
        num_hosts: Number of hosts.

    Returns:
        A ``(sample_rate, samples)`` tuple. If no segment could be
        synthesized, ``samples`` is one second of float32 silence.
    """
    lines = [line for line in (script or "").split('\n') if line.strip()]

    audio_segments = []
    if lines:
        with ThreadPoolExecutor(max_workers=min(_MAX_TTS_WORKERS, len(lines))) as pool:
            futures = [
                pool.submit(
                    text_to_speech,
                    line,
                    voice1 if num_hosts == 1 or i % 2 == 0 else voice2,
                )
                for i, line in enumerate(lines)
            ]
            # Collect in submission order so the audio follows the script even
            # when later lines finish first.
            for index, future in enumerate(futures, start=1):
                try:
                    audio_segments.append(future.result())
                except Exception:
                    # Best effort: a failed line is logged and skipped so the
                    # remaining segments can still be rendered.
                    logger.exception("Speech synthesis failed for script line %d; skipping it.", index)

    if not audio_segments:
        logger.warning("No valid audio segments were generated.")
        return (_PODCAST_SAMPLE_RATE, np.zeros(_PODCAST_SAMPLE_RATE, dtype=np.float32))

    # The segments are returned at the rate they were produced; the former
    # 24000 -> 24000 resample call was a no-op and has been dropped.
    podcast_audio = np.concatenate(audio_segments)

    return (_PODCAST_SAMPLE_RATE, podcast_audio)

# Gradio Interface
# Layout: model loading, Gemini key and content inputs, podcast options,
# script generation, and audio rendering.
with gr.Blocks() as demo:
    gr.Markdown("# AI Podcast Generator")
    
    hf_token_input = gr.Textbox(label="Enter your Hugging Face API Token", type="password")
    load_model_btn = gr.Button("Load Orpheus Model")
    model_status = gr.Markdown("Model not loaded")
    
    api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")
    
    with gr.Row():
        content_input = gr.Textbox(label="Paste your content or upload a document")
        # NOTE(review): this upload component is not connected to any event
        # handler below, so uploaded documents are currently ignored.
        document_upload = gr.File(label="Upload Document")
    
    # A default is selected so the prompt never receives ``None`` as duration.
    duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration", value="1-5 min")
    
    num_hosts = gr.Radio([1, 2], label="Number of podcast hosts", value=2)
    
    with gr.Row():
        voice1_select = gr.Dropdown(label="Select Voice 1", choices=["Voice 1", "Voice 2", "Voice 3"], value="Voice 1")
    
    with gr.Row():
        voice2_select = gr.Dropdown(label="Select Voice 2", choices=["Voice 1", "Voice 2", "Voice 3"], value="Voice 2")
    
    generate_btn = gr.Button("Generate Script")
    script_output = gr.Textbox(label="Generated Script", lines=10)
    
    render_btn = gr.Button("Render Podcast")
    audio_output = gr.Audio(label="Generated Podcast")
    
    def load_model_wrapper(hf_token):
        """Load the model into the module globals and return a status line."""
        global model, tokenizer
        try:
            model, tokenizer = load_model(hf_token)
        except Exception as exc:
            # UI boundary: report the failure in the status field instead of
            # leaving "Model not loaded" with no explanation.
            logger.exception("Failed to load the Orpheus model.")
            return f"Model failed to load: {exc}"
        return "Model loaded successfully"
    
    load_model_btn.click(load_model_wrapper, inputs=[hf_token_input], outputs=[model_status])
    
    def generate_script_wrapper(api_key, content, duration, num_hosts):
        """Forward the form values to ``generate_podcast_script``."""
        return generate_podcast_script(api_key, content, duration, num_hosts)
    
    def render_podcast_wrapper(api_key, script, voice1, voice2, num_hosts):
        """Forward the form values to ``render_podcast`` once a model is loaded."""
        if model is None or tokenizer is None:
            # Tell the user what to do rather than failing inside synthesis.
            raise gr.Error("Load the Orpheus model before rendering the podcast.")
        return render_podcast(api_key, script, voice1, voice2, num_hosts)
    
    generate_btn.click(generate_script_wrapper, inputs=[api_key_input, content_input, duration, num_hosts], outputs=script_output)
    render_btn.click(render_podcast_wrapper, inputs=[api_key_input, script_output, voice1_select, voice2_select, num_hosts], outputs=audio_output)

    def update_second_voice_visibility(num_hosts):
        """Show the second voice selector only when two hosts are selected."""
        return gr.update(visible=num_hosts == 2)

    num_hosts.change(update_second_voice_visibility, inputs=[num_hosts], outputs=[voice2_select])

# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()