import gradio as gr
import google.generativeai as genai
import numpy as np
import re
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download, login
import threading
import queue
import os
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = None
tokenizer = None

def load_model():
    global model, tokenizer
    
    print("Loading Orpheus model...")
    model_name = "canopylabs/orpheus-3b-0.1-ft"

    hf_token = os.environ.get("HUGGINGFACE_TOKEN")
    if not hf_token:
        raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

    login(token=hf_token)

    snapshot_download(
        repo_id=model_name,
        token=hf_token,  # use_auth_token is deprecated in recent huggingface_hub releases
        allow_patterns=[
            "config.json",
            "*.safetensors",
            "model.safetensors.index.json",
        ],
        ignore_patterns=[
            "optimizer.pt",
            "pytorch_model.bin",
            "training_args.bin",
            "scheduler.pt",
            "tokenizer.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "vocab.json",
            "merges.txt",
            "tokenizer.*"
        ]
    )

    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Orpheus model and tokenizer loaded to {device}")

def generate_podcast_script(api_key, content, duration, num_hosts):
    genai.configure(api_key=api_key)
    # Local name on purpose: must not shadow the global Orpheus `model`.
    gemini_model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')

    # The one-host and two-host prompts differ only in a few phrases, so build
    # them from a single template instead of maintaining two near-identical copies.
    if num_hosts == 1:
        speakers = "one person"
        chatter = "thoughts"
        unit = "monologue"
        flow_subject = "content"
        format_spec = ("Format the script as a monologue without speaker labels.\n"
                       "    Separate each paragraph with a blank line.")
    else:
        speakers = "two people"
        chatter = "chit-chat"
        unit = "dialogue"
        flow_subject = "conversation"
        format_spec = ("Format the script as alternating lines of dialogue without speaker labels.\n"
                       "    Separate each line with a blank line.")

    prompt = f"""
    Create a podcast script for {speakers} discussing the following content:
    {content}

    The podcast should last approximately {duration}. Include natural speech patterns,
    humor, and occasional off-topic {chatter}. Use occasional speech fillers like um, ah,
    yes, I see, Ok now. Vary the emotional tone.

    {format_spec}

    Only include the {unit} with proper punctuation and emotion tags enclosed in angle brackets < >.
    For example, use <chuckle> instead of "chuckle".

    Ensure the {flow_subject} flows naturally and stays relevant to the topic.
    Limit the script length to match the requested duration of {duration}.

    To use emotion tags naturally in generative AI speech, incorporate them sparingly at key moments
    to enhance the dialogue's emotional context.

    Always place tags like <laugh> for joy, <sigh> for frustration or relief, <chuckle> for mild amusement,
    <cough> or <sniffle> for discomfort, <groan> for displeasure, <yawn> for tiredness, and <gasp> for surprise.

    For example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>.
    Oh well, at least I finished the project <chuckle>."

    Important: Ensure all emotion tags are properly enclosed in angle brackets < > to distinguish them from regular text
    """

    response = gemini_model.generate_content(prompt)
    # Keep apostrophes and hyphens so contractions such as "can't" survive cleaning.
    clean_text = re.sub(r"[^a-zA-Z0-9\s.,?!<>'\-]", '', response.text)
    return clean_text

def text_to_speech(text, voice):
    global model, tokenizer
    if tokenizer is None or model is None:
        raise ValueError("Model or tokenizer not initialized. Please call load_model() first.")

    # The Orpheus fine-tuned checkpoints expect the speaker name prefixed to the
    # text (e.g. "tara: Hello there"); previously the `voice` argument was ignored.
    prompt = f"{voice}: {text}"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        # NOTE: 256 new tokens covers well under a second of speech at Orpheus's
        # token rate; a real deployment would use a much larger budget.
        output = model.generate(**inputs, max_new_tokens=256)
    # Orpheus emits audio codec tokens, not a mel spectrogram, so name the
    # intermediate accordingly and hand it to the (stub) decoder below.
    audio_tokens = output[0].cpu().numpy()
    audio = tokens_to_audio(audio_tokens)
    return audio

def tokens_to_audio(tokens):
    # Placeholder implementation: a real decoder would turn the generated codec
    # tokens into a 24 kHz waveform. Returns 1 second of silence for now.
    return np.zeros(24000, dtype=np.float32)
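
# A minimal sketch of a real decoder, assuming the SNAC 24 kHz codec that the
# Orpheus model card pairs with this checkpoint (`pip install snac`). The ID
# offset (128266) and the 7-slot codebook layout below are taken from memory of
# the Orpheus reference code and should be verified against the current model
# card. It also assumes `token_ids` has already been cropped to the generated
# audio tokens (everything after the start-of-audio marker, EOS removed).
def snac_tokens_to_audio(token_ids):
    from snac import SNAC  # optional dependency, imported lazily on purpose
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device)
    # Undo the vocabulary offset and the per-slot codebook offset (4096 per slot).
    codes = [(int(t) - 128266) - (i % 7) * 4096 for i, t in enumerate(token_ids)]
    # Group into complete 7-token frames, dropping any trailing partial frame.
    frames = [codes[i:i + 7] for i in range(0, len(codes) - len(codes) % 7, 7)]
    if not frames:
        return np.zeros(24000, dtype=np.float32)
    # Redistribute each frame across SNAC's three codebook layers (1, 2, and 4
    # codes per frame respectively) before decoding to a waveform.
    layer_1 = torch.tensor([[f[0] for f in frames]], device=device)
    layer_2 = torch.tensor([[c for f in frames for c in (f[1], f[4])]], device=device)
    layer_3 = torch.tensor([[c for f in frames for c in (f[2], f[3], f[5], f[6])]], device=device)
    with torch.no_grad():
        waveform = snac_model.decode([layer_1, layer_2, layer_3])
    return waveform.squeeze().cpu().numpy()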

def process_audio_segment(index, line, voice, result_queue):
    try:
        audio = text_to_speech(line, voice)
        result_queue.put((index, audio))
    except Exception as e:
        logger.error(f"Error processing audio segment: {str(e)}")
        result_queue.put((index, None))

def render_podcast(script, voice1, voice2, num_hosts):
    lines = [line for line in script.split('\n') if line.strip()]
    threads = []
    result_queue = queue.Queue()

    for i, line in enumerate(lines):
        voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
        thread = threading.Thread(target=process_audio_segment, args=(i, line, voice, result_queue))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    # Threads finish in arbitrary order, so tag each result with its line index
    # and reassemble in script order rather than queue order.
    results = {}
    while not result_queue.empty():
        index, segment = result_queue.get()
        if segment is not None:
            results[index] = segment
    audio_segments = [results[i] for i in sorted(results)]

    if not audio_segments:
        logger.warning("No valid audio segments were generated.")
        return (24000, np.zeros(24000, dtype=np.float32))

    podcast_audio = np.concatenate(audio_segments)
    return (24000, podcast_audio)
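
# Optional variation (an assumption, not part of the original app): insert a
# short silence between segments so speaker turns don't run together. A caller
# could use this in place of the bare np.concatenate above.
def concatenate_with_pauses(segments, sample_rate=24000, pause_s=0.3):
    pause = np.zeros(int(sample_rate * pause_s), dtype=np.float32)
    pieces = []
    for seg in segments:
        if pieces:
            pieces.append(pause)
        pieces.append(seg.astype(np.float32))
    return np.concatenate(pieces) if pieces else np.zeros(sample_rate, dtype=np.float32)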

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# AI Podcast Generator")
    
    api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")
    
    with gr.Row():
        content_input = gr.Textbox(label="Paste your content or upload a document")
        document_upload = gr.File(label="Upload Document")
    
    duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")
    
    num_hosts = gr.Radio([1, 2], label="Number of podcast hosts", value=2)
    
    voice_options = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
    
    with gr.Row():
        voice1_select = gr.Dropdown(label="Select Voice 1", choices=voice_options, value="tara")
    
    with gr.Row():
        voice2_select = gr.Dropdown(label="Select Voice 2", choices=voice_options, value="leo")
    
    generate_btn = gr.Button("Generate Script")
    script_output = gr.Textbox(label="Generated Script", lines=10)
    
    render_btn = gr.Button("Render Podcast")
    audio_output = gr.Audio(label="Generated Podcast")
    
    generate_btn.click(generate_podcast_script, inputs=[api_key_input, content_input, duration, num_hosts], outputs=script_output)
    render_btn.click(render_podcast, inputs=[script_output, voice1_select, voice2_select, num_hosts], outputs=audio_output)

    def update_second_voice_visibility(num_hosts):
        return gr.update(visible=num_hosts == 2)

    num_hosts.change(update_second_voice_visibility, inputs=[num_hosts], outputs=[voice2_select])
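
    # Hypothetical wiring (an assumption, not part of the original app): the
    # File widget above is otherwise unused, so this sketch copies an uploaded
    # plain-text document into the content box. Binary formats such as PDF or
    # DOCX would need a real parser.
    def load_document(file):
        if file is None:
            return ""
        path = file if isinstance(file, str) else file.name  # Gradio 3 vs 4
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()

    document_upload.change(load_document, inputs=[document_upload], outputs=[content_input])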

if __name__ == "__main__":
    load_model()  # Ensure the model is loaded before launching the interface
    demo.launch()