File size: 7,723 Bytes
74245b5
b8d465b
 
 
97778aa
aa10e55
97778aa
031a7f3
b8d465b
48ade8d
896ca54
310ebea
07cb903
5753bc2
 
b8d465b
 
310ebea
aa10e55
 
310ebea
54be90e
 
21f3e87
54be90e
97778aa
54be90e
5753bc2
b66d512
 
 
 
896ca54
3c8678d
1668d21
 
5753bc2
1668d21
 
3c8678d
 
 
 
5753bc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8d465b
896ca54
97778aa
1668d21
5753bc2
 
 
 
aa10e55
 
 
 
5753bc2
 
aa10e55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5753bc2
 
 
97778aa
896ca54
97778aa
5753bc2
 
 
 
 
 
 
aa10e55
5753bc2
 
 
 
 
 
 
 
 
310ebea
 
 
 
 
5753bc2
 
 
 
b8d465b
 
 
 
 
5753bc2
 
 
 
 
b8d465b
d982b19
 
ca79387
21f3e87
 
b8d465b
 
 
 
 
 
 
21f3e87
5753bc2
21f3e87
b8d465b
21f3e87
 
 
d982b19
21f3e87
 
 
74245b5
a727789
5753bc2
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import gradio as gr
import google.generativeai as genai
import numpy as np
import re
import torch
import torchaudio
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download, login
import logging
import os
import spaces
import warnings

# Configure logging: INFO and above, each record stamped with time,
# logger name and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

# Ignore every UserWarning and RuntimeWarning for the whole process.
for _ignored_category in (UserWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

def get_device():
    """Pick the torch device used for inference.

    Returns:
        ``torch.device("cuda")`` when CUDA is available, otherwise
        ``torch.device("cpu")``.
    """
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(backend)

# Placeholders filled in by load_model(); None means "not loaded yet".
model = tokenizer = None

# Resolve the inference device once at import time and record the choice.
device = get_device()
logger.info(f"Using device: {device}")

@spaces.GPU()
def load_model():
    """Download the Orpheus checkpoint and populate the global model/tokenizer.

    Reads the Hugging Face access token from the ``HUGGINGFACE_TOKEN``
    environment variable, logs in, pre-fetches the model weights, then
    instantiates the model (moved to ``device``) and the tokenizer. Both are
    stored in the module-level ``model`` and ``tokenizer`` globals, and only
    once both have loaded successfully.

    Raises:
        ValueError: if ``HUGGINGFACE_TOKEN`` is not set.
        Exception: any error raised while logging in, downloading or
            instantiating is logged (with traceback) and re-raised unchanged.
    """
    # NOTE(review): on ZeroGPU hardware a @spaces.GPU function may execute in
    # a separate worker process, in which case globals assigned here would not
    # be visible to the main process -- confirm against the deployment target.
    global model, tokenizer

    logger.info("Loading Orpheus model...")
    model_name = "canopylabs/orpheus-3b-0.1-ft"

    hf_token = os.environ.get("HUGGINGFACE_TOKEN")
    if not hf_token:
        raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

    try:
        login(token=hf_token)

        # Pre-fetch only the config and weight shards. ``token`` is the
        # current keyword; ``use_auth_token`` is deprecated in huggingface_hub.
        snapshot_download(
            repo_id=model_name,
            token=hf_token,
            allow_patterns=[
                "config.json",
                "*.safetensors",
                "model.safetensors.index.json",
            ],
            ignore_patterns=[
                "optimizer.pt",
                "pytorch_model.bin",
                "training_args.bin",
                "scheduler.pt",
                "tokenizer.json",
                "tokenizer_config.json",
                "special_tokens_map.json",
                "vocab.json",
                "merges.txt",
                "tokenizer.*",
            ],
        )

        # Full precision on CPU, bfloat16 on any other device.
        dtype = torch.float32 if device.type == 'cpu' else torch.bfloat16
        loaded_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
        loaded_model.to(device)
        loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Publish both globals together so a failure above never leaves a
        # model without its tokenizer.
        model, tokenizer = loaded_model, loaded_tokenizer
        logger.info(f"Orpheus model and tokenizer loaded to {device}")
    except Exception as e:
        logger.exception("Error loading model: %s", e)
        raise

def _read_uploaded_text(uploaded_file):
    """Return the UTF-8 text of an upload given as a file-like object or a path."""
    if hasattr(uploaded_file, "read"):
        raw = uploaded_file.read()
    else:
        # Path-style uploads: either a plain path string or an object whose
        # ``name`` attribute holds the path.
        path = getattr(uploaded_file, "name", uploaded_file)
        with open(path, "rb") as handle:
            raw = handle.read()
    return raw.decode("utf-8") if isinstance(raw, bytes) else raw


def _sanitize_script(raw_script):
    """Strip characters outside letters, digits, whitespace, ``.,?!<>`` and apostrophes."""
    # Fold typographic apostrophes to ASCII first so contractions survive.
    normalized = raw_script.replace("\u2019", "'").replace("\u2018", "'")
    return re.sub(r"[^a-zA-Z0-9\s.,?!<>']", '', normalized)


def generate_podcast_script(api_key, content, uploaded_file, duration, num_hosts):
    """Ask Gemini to write a podcast script about the supplied material.

    Args:
        api_key: Gemini API key passed to ``genai.configure``.
        content: pasted source text; may be empty or ``None``.
        uploaded_file: optional upload, either a file-like object with
            ``read()`` or a filesystem path; its contents are decoded as UTF-8
            and appended to ``content``.
        duration: target length label interpolated into the prompt.
        num_hosts: ``1`` produces a monologue prompt, any other value a
            two-person dialogue prompt.

    Returns:
        The generated script with every character removed except ASCII
        letters, digits, whitespace, ``.,?!<>`` and apostrophes.

    Raises:
        Exception: any failure (file decoding, API error, ...) is logged with
            its traceback and re-raised unchanged.
    """
    try:
        genai.configure(api_key=api_key)
        # Named gemini_model so it does not shadow the module-level TTS ``model``.
        gemini_model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')

        combined_content = content or ""
        if uploaded_file:
            file_content = _read_uploaded_text(uploaded_file)
            if combined_content:
                combined_content = f"{combined_content}\n{file_content}"
            else:
                combined_content = file_content

        prompt = f"""
        Create a podcast script for {'one person' if num_hosts == 1 else 'two people'} discussing:
        {combined_content}
        
        Duration: {duration}. Include natural speech, humor, and occasional off-topic thoughts.
        Use speech fillers like um, ah. Vary emotional tone.
        
        Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
        Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
        
        Use emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
        
        Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
        
        Ensure content flows naturally and stays on topic. Match the script length to {duration}.
        """

        response = gemini_model.generate_content(prompt)
        return _sanitize_script(response.text)
    except Exception as e:
        logger.exception("Error generating podcast script: %s", e)
        raise

@spaces.GPU()
def text_to_speech(text, voice):
    """Turn one line of script into a mono waveform.

    The line is stripped of ``<...>`` emotion tags, tokenized, passed through
    ``model.generate`` and the result is normalised to ``[0, 1]`` and fed to
    Griffin-Lim.

    Args:
        text: one line of the script.
        voice: selected voice name.
            NOTE(review): this argument is currently not used anywhere in the
            function, so both hosts sound the same -- confirm how the model
            expects the voice to be specified.

    Returns:
        ``(24000, samples)`` where ``samples`` is a float32 NumPy array
        clipped to ``[-1, 1]``. When nothing remains after removing the
        emotion tags, ``samples`` is empty.

    Raises:
        Exception: any failure is logged with its traceback and re-raised.
    """
    global model, tokenizer
    try:
        # Remove emotion tags for TTS processing
        clean_text = re.sub(r'<[^>]+>', '', text)
        if not clean_text.strip():
            # Nothing to speak (e.g. the line was only "<laugh>"); skip the model.
            return (24000, np.zeros(0, dtype=np.float32))

        if model is None or tokenizer is None:
            load_model()

        inputs = tokenizer(clean_text, return_tensors="pt").to(device)
        with torch.no_grad():
            output = model.generate(**inputs, max_new_tokens=256)

        # NOTE(review): ``generate`` on a causal LM returns token ids; the code
        # below treats them as spectrogram magnitudes. GriffinLim(n_fft=2048)
        # presumably expects n_fft // 2 + 1 frequency bins, which a 1-D token
        # sequence does not provide -- confirm this path yields usable audio
        # or replace it with the model's own audio-token decoder.
        mel = output[0].cpu().float()

        # Min-max normalise; a constant tensor would otherwise divide by zero
        # and fill the result with NaN.
        lowest = mel.min()
        span = mel.max() - lowest
        if span > 0:
            mel = (mel - lowest) / span
        else:
            mel = torch.zeros_like(mel)

        # Convert to audio using torchaudio's Griffin-Lim transform
        griffin_lim = torchaudio.transforms.GriffinLim(n_fft=2048, n_iter=10)
        audio = griffin_lim(mel.unsqueeze(0))

        # Convert to a NumPy array clipped to the valid float audio range
        audio_np = np.clip(audio.squeeze().numpy(), -1, 1)

        return (24000, audio_np.astype(np.float32))  # Assuming 24kHz sample rate
    except Exception as e:
        logger.exception("Error in text_to_speech: %s", e)
        raise

@spaces.GPU()
def render_podcast(api_key, script, voice1, voice2, num_hosts):
    """Synthesize every non-blank line of ``script`` and join them into one clip.

    Args:
        api_key: unused here; present because the UI passes it to the handler.
        script: the podcast script; ``None`` or empty yields one second of
            silence.
        voice1: voice for every line when ``num_hosts == 1``, otherwise for
            even-numbered non-blank lines (0-based).
        voice2: voice for odd-numbered non-blank lines in two-host mode.
        num_hosts: number of hosts selected in the UI.

    Returns:
        ``(24000, samples)`` with ``samples`` as an int16 NumPy array. If no
        line could be synthesized, ``samples`` is one second of silence.

    Raises:
        Exception: failures outside the per-line synthesis are logged with
            their traceback and re-raised.
    """
    try:
        lines = [line.strip() for line in (script or "").splitlines() if line.strip()]
        audio_segments = []

        for i, line in enumerate(lines):
            voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
            try:
                _, audio = text_to_speech(line, voice)
            except Exception as e:
                # Best effort: one bad line must not abort the whole podcast.
                logger.exception("Error processing audio segment: %s", e)
            else:
                # ravel() guards against 0-d arrays, which np.concatenate rejects.
                audio_segments.append(np.ravel(audio))

        if not audio_segments:
            logger.warning("No valid audio segments were generated.")
            # Same dtype as the normal return path below.
            return (24000, np.zeros(24000, dtype=np.int16))

        podcast_audio = np.concatenate(audio_segments)

        # Ensure the audio is in the correct format for Gradio: 16-bit PCM
        podcast_audio = np.clip(podcast_audio, -1, 1)
        podcast_audio = (podcast_audio * 32767).astype(np.int16)

        return (24000, podcast_audio)
    except Exception as e:
        logger.exception("Error rendering podcast: %s", e)
        raise

# Gradio UI: collect source material and settings, generate a script, then
# render that script to audio.
with gr.Blocks() as demo:
    gr.Markdown("# AI Podcast Generator")

    api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")

    with gr.Row():
        content_input = gr.Textbox(label="Paste your content (optional)")
        document_upload = gr.File(label="Upload Document (optional)")

    # A default is preselected so the prompt never receives ``None`` as the
    # duration when the user leaves the control untouched.
    duration = gr.Radio(
        ["1-5 min", "5-10 min", "10-15 min"],
        label="Estimated podcast duration",
        value="1-5 min",
    )
    num_hosts = gr.Radio([1, 2], label="Number of podcast hosts", value=2)

    voice_options = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
    voice1_select = gr.Dropdown(label="Select Voice 1", choices=voice_options, value="tara")
    voice2_select = gr.Dropdown(label="Select Voice 2", choices=voice_options, value="leo")

    generate_btn = gr.Button("Generate Script")
    script_output = gr.Textbox(label="Generated Script", lines=10)

    render_btn = gr.Button("Render Podcast")
    audio_output = gr.Audio(label="Generated Podcast")

    # Step 1: source material + settings -> script text.
    generate_btn.click(generate_podcast_script,
                       inputs=[api_key_input, content_input, document_upload, duration, num_hosts],
                       outputs=script_output)

    # Step 2: script text (possibly edited by the user) -> audio.
    render_btn.click(render_podcast,
                     inputs=[api_key_input, script_output, voice1_select, voice2_select, num_hosts],
                     outputs=audio_output)

    # The second voice picker is only shown in two-host mode.
    num_hosts.change(lambda x: gr.update(visible=x == 2),
                     inputs=[num_hosts],
                     outputs=[voice2_select])

if __name__ == "__main__":
    # Load the model eagerly, then start the Gradio server.
    try:
        load_model()
        demo.launch()
    except Exception as e:
        # Keep the traceback in the log and exit non-zero so the failure is
        # visible to whatever supervises the process.
        logger.exception("Error launching the application: %s", e)
        raise SystemExit(1) from e