File size: 17,817 Bytes
84fed9b
 
 
f304938
 
 
 
11fdf2c
f304938
11fdf2c
84fed9b
 
c2d6c29
 
26af193
 
f304938
 
 
 
 
84fed9b
 
 
26af193
 
f304938
26af193
3c6d05f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f304938
 
3c6d05f
 
 
 
 
 
 
f304938
26af193
3c6d05f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f304938
 
3c6d05f
 
 
 
 
 
 
 
84fed9b
3c6d05f
f304938
3c6d05f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f304938
 
3c6d05f
 
 
84fed9b
 
 
 
bc010fb
8ea10e8
bc010fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ea10e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc010fb
84fed9b
e3bea0f
26af193
84fed9b
 
 
 
 
11fdf2c
 
84fed9b
11fdf2c
 
84fed9b
 
 
 
 
 
11fdf2c
 
84fed9b
11fdf2c
 
f3b14e5
84fed9b
c2d6c29
644ec15
 
26af193
644ec15
 
 
 
26af193
644ec15
 
 
 
 
 
 
26af193
644ec15
 
 
 
 
 
 
37f0f07
 
644ec15
26af193
37f0f07
 
644ec15
37f0f07
 
26af193
644ec15
84fed9b
644ec15
 
26af193
644ec15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11fdf2c
644ec15
 
 
 
 
 
 
 
 
 
74245b5
644ec15
 
 
 
 
 
 
 
 
 
3c6d05f
644ec15
 
 
 
 
 
 
 
 
 
 
 
 
3c6d05f
644ec15
 
3c6d05f
11fdf2c
84fed9b
 
 
26af193
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
import dash
from dash import dcc, html, Input, Output, State, callback
import dash_bootstrap_components as dbc
import google.generativeai as genai
import numpy as np
import edge_tts
import asyncio
import io
import re
import base64
import logging
from dash.exceptions import PreventUpdate
import pandas as pd
import time
import os
from pydub import AudioSegment

# Set up logging: INFO-level root configuration plus a module-level logger
# used by the callbacks below to report failures.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Dash app with the stock Bootstrap stylesheet (needed by the
# dash_bootstrap_components widgets used in the layout).
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Initialize Gemini AI with environment variable.
# os.environ.get() yields None when GEMINI_API_KEY is unset and no check is
# made here, so a missing key presumably only surfaces on the first Gemini
# request — TODO confirm.
genai.configure(api_key=os.environ.get('GEMINI_API_KEY'))

def _strip_unspeakable(text):
    """Remove symbols a TTS voice might read aloud, keeping real words intact.

    Kept: letters, combining marks and digits of any script, whitespace, the
    sentence punctuation ``. , ? !``, and apostrophes/hyphens that sit between
    two alphanumeric characters (contractions and compound words).
    Everything else (markdown markers, quotes, brackets, emoji, ...) is dropped.
    """
    import unicodedata

    kept = []
    last_index = len(text) - 1
    for index, char in enumerate(text):
        if char.isspace() or char in ".,?!":
            kept.append(char)
        elif unicodedata.category(char)[0] in "LMN":
            # L = letters, M = combining marks, N = numbers. An ASCII-only
            # filter would delete every non-English word.
            kept.append(char)
        elif char in "'\u2019-":
            # Keep only word-internal apostrophes/hyphens so "we'll" does not
            # turn into "well", while stray bullets and rules still vanish.
            if 0 < index < last_index and text[index - 1].isalnum() and text[index + 1].isalnum():
                kept.append(char)
    return "".join(kept)


def generate_podcast_script(content, duration, num_hosts):
    """Ask Gemini for a podcast script about *content* and clean it for TTS.

    Args:
        content: Source text the podcast should discuss; interpolated into
            the prompt verbatim.
        duration: Human-readable target length (e.g. "1-5 min"); interpolated
            into the prompt verbatim.
        num_hosts: 1 requests a monologue; any other value requests a
            two-person dialogue of alternating lines.

    Returns:
        The model's response text with unspeakable symbols removed. Line
        breaks are preserved because the renderer splits the script on them.

    Raises:
        Whatever the Gemini client raises for a failed or blocked request;
        callers are expected to handle it.
    """
    model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')

    if num_hosts == 1:
        prompt = f"""
        Create a podcast script for one person discussing the following content:
        {content}
        
        The podcast should last approximately {duration}. Include natural speech patterns,
        humor, and occasional off-topic thoughts. Use occasional speech fillers like um, ah,
        yes, I see, Ok now. Vary the emotional tone.
        Format the script as a monologue without speaker labels.
        Separate each paragraph with a blank line.
        Do not use any special characters or markdown. Only include the monologue with proper punctuation.
        Ensure the content flows naturally and stays relevant to the topic.
        Limit the script length to match the requested duration of {duration}.
        """
    else:
        prompt = f"""
        Create a podcast script for two people discussing the following content:
        {content}
        
        The podcast should last approximately {duration}. Include natural speech patterns,
        humor, and occasional off-topic chit-chat. Use occasional speech fillers like um, ah,
        yes, I see, Ok now. Vary the emotional tone.
        Format the script as alternating lines of dialogue without speaker labels.
        Separate each line with a blank line.
        Do not use any special characters or markdown. Only include the alternating dialogue lines with proper punctuation.
        Ensure the conversation flows naturally and stays relevant to the topic.
        Limit the script length to match the requested duration of {duration}.
        """

    response = model.generate_content(prompt)
    # Remove any special characters that might be read aloud
    return _strip_unspeakable(response.text)

async def text_to_speech(text, voice):
    """Synthesise *text* with the given edge-tts voice and return the audio bytes.

    Only stream chunks whose ``"type"`` is ``"audio"`` contribute to the
    result; their ``"data"`` payloads are concatenated in arrival order.
    """
    chunk_stream = edge_tts.Communicate(text, voice).stream()
    payloads = [
        chunk["data"]
        async for chunk in chunk_stream
        if chunk["type"] == "audio"
    ]
    return b"".join(payloads)

async def render_podcast(script, voice1, voice2, num_hosts):
    """Render *script* to speech and return it as mono 16-bit PCM.

    Each non-blank line of the script is synthesised separately. With one
    host every line uses ``voice1``; otherwise lines alternate between
    ``voice1`` (even indices) and ``voice2`` (odd indices).

    edge-tts delivers compressed MP3 data, so each clip is decoded to PCM
    before joining. Concatenating the MP3 bytes and viewing them as int16
    samples yields noise, and fails outright when the byte count is odd.

    Args:
        script: Script text, one utterance per line.
        voice1: edge-tts voice for the first (or only) host.
        voice2: edge-tts voice for the second host.
        num_hosts: 1 for a monologue; any other value alternates voices.

    Returns:
        ``(sample_rate, samples)`` where ``samples`` is a 1-D ``np.int16``
        array. One second of silence is returned when nothing was rendered.
    """
    sample_rate = 24000  # edge-tts default output rate; all clips are normalised to it
    lines = [line for line in (script or '').split('\n') if line.strip()]
    voices = (voice1,) if num_hosts == 1 else (voice1, voice2)

    pcm_chunks = []
    for i, line in enumerate(lines):
        encoded = await text_to_speech(line, voices[i % len(voices)])
        if not encoded:
            continue
        clip = AudioSegment.from_file(io.BytesIO(encoded), format="mp3")
        # Force one common layout so the raw frames of all clips can simply
        # be joined end to end.
        clip = clip.set_frame_rate(sample_rate).set_channels(1).set_sample_width(2)
        pcm_chunks.append(clip.raw_data)

    if not pcm_chunks:
        logger.warning("No valid audio segments were generated.")
        return (sample_rate, np.zeros(sample_rate, dtype=np.int16))  # Return silence if no valid audio was generated

    podcast_audio = np.frombuffer(b''.join(pcm_chunks), dtype=np.int16)
    return (sample_rate, podcast_audio)

async def get_voice_list():
    """Fetch the available edge-tts voices grouped by locale.

    Voices are identified by their ``ShortName`` (e.g. ``en-US-AriaNeural``),
    the same format as the default values of the voice dropdowns. The long
    ``Name`` field is used only if an entry has no ``ShortName``.

    Returns:
        Dict mapping a locale code (the voice's ``"Locale"`` field) to the
        list of voice identifiers for that locale, in the order returned by
        ``edge_tts.list_voices()``.
    """
    voices = await edge_tts.list_voices()
    voice_dict = {}
    for voice in voices:
        identifier = voice.get("ShortName") or voice["Name"]
        voice_dict.setdefault(voice["Locale"], []).append(identifier)
    return voice_dict

# Language names dictionary.
# Maps a locale code (matched against the "Locale" field of each edge-tts
# voice) to the human-readable name shown in the language dropdowns. The
# dropdowns use the display names as their values, so the voice-option
# callbacks look the locale code up again by display name.
language_names = {
    'af-ZA': 'Afrikaans (South Africa)', 'am-ET': 'Amharic (Ethiopia)', 'ar-AE': 'Arabic (UAE)', 'ar-BH': 'Arabic (Bahrain)',
    'ar-DZ': 'Arabic (Algeria)', 'ar-EG': 'Arabic (Egypt)', 'ar-IQ': 'Arabic (Iraq)', 'ar-JO': 'Arabic (Jordan)',
    'ar-KW': 'Arabic (Kuwait)', 'ar-LB': 'Arabic (Lebanon)', 'ar-LY': 'Arabic (Libya)', 'ar-MA': 'Arabic (Morocco)',
    'ar-OM': 'Arabic (Oman)', 'ar-QA': 'Arabic (Qatar)', 'ar-SA': 'Arabic (Saudi Arabia)', 'ar-SY': 'Arabic (Syria)',
    'ar-TN': 'Arabic (Tunisia)', 'ar-YE': 'Arabic (Yemen)', 'az-AZ': 'Azerbaijani (Azerbaijan)', 'bg-BG': 'Bulgarian (Bulgaria)',
    'bn-BD': 'Bengali (Bangladesh)', 'bn-IN': 'Bengali (India)', 'bs-BA': 'Bosnian (Bosnia and Herzegovina)', 'ca-ES': 'Catalan (Spain)',
    'cs-CZ': 'Czech (Czech Republic)', 'cy-GB': 'Welsh (United Kingdom)', 'da-DK': 'Danish (Denmark)', 'de-AT': 'German (Austria)',
    'de-CH': 'German (Switzerland)', 'de-DE': 'German (Germany)', 'el-GR': 'Greek (Greece)', 'en-AU': 'English (Australia)',
    'en-CA': 'English (Canada)', 'en-GB': 'English (United Kingdom)', 'en-GH': 'English (Ghana)', 'en-HK': 'English (Hong Kong SAR)',
    'en-IE': 'English (Ireland)', 'en-IN': 'English (India)', 'en-KE': 'English (Kenya)', 'en-NG': 'English (Nigeria)',
    'en-NZ': 'English (New Zealand)', 'en-PH': 'English (Philippines)', 'en-SG': 'English (Singapore)', 'en-TZ': 'English (Tanzania)',
    'en-US': 'English (United States)', 'en-ZA': 'English (South Africa)', 'es-AR': 'Spanish (Argentina)', 'es-BO': 'Spanish (Bolivia)',
    'es-CL': 'Spanish (Chile)', 'es-CO': 'Spanish (Colombia)', 'es-CR': 'Spanish (Costa Rica)', 'es-CU': 'Spanish (Cuba)',
    'es-DO': 'Spanish (Dominican Republic)', 'es-EC': 'Spanish (Ecuador)', 'es-ES': 'Spanish (Spain)', 'es-GQ': 'Spanish (Equatorial Guinea)',
    'es-GT': 'Spanish (Guatemala)', 'es-HN': 'Spanish (Honduras)', 'es-MX': 'Spanish (Mexico)', 'es-NI': 'Spanish (Nicaragua)',
    'es-PA': 'Spanish (Panama)', 'es-PE': 'Spanish (Peru)', 'es-PR': 'Spanish (Puerto Rico)', 'es-PY': 'Spanish (Paraguay)',
    'es-SV': 'Spanish (El Salvador)', 'es-US': 'Spanish (United States)', 'es-UY': 'Spanish (Uruguay)', 'es-VE': 'Spanish (Venezuela)',
    'et-EE': 'Estonian (Estonia)', 'eu-ES': 'Basque (Spain)', 'fa-IR': 'Persian (Iran)', 'fi-FI': 'Finnish (Finland)',
    'fil-PH': 'Filipino (Philippines)', 'fr-BE': 'French (Belgium)', 'fr-CA': 'French (Canada)', 'fr-CH': 'French (Switzerland)',
    'fr-FR': 'French (France)', 'ga-IE': 'Irish (Ireland)', 'gl-ES': 'Galician (Spain)', 'gu-IN': 'Gujarati (India)',
    'he-IL': 'Hebrew (Israel)', 'hi-IN': 'Hindi (India)', 'hr-HR': 'Croatian (Croatia)', 'hu-HU': 'Hungarian (Hungary)',
    'hy-AM': 'Armenian (Armenia)', 'id-ID': 'Indonesian (Indonesia)', 'is-IS': 'Icelandic (Iceland)', 'it-IT': 'Italian (Italy)',
    'ja-JP': 'Japanese (Japan)', 'jv-ID': 'Javanese (Indonesia)', 'ka-GE': 'Georgian (Georgia)', 'kk-KZ': 'Kazakh (Kazakhstan)',
    'km-KH': 'Khmer (Cambodia)', 'kn-IN': 'Kannada (India)', 'ko-KR': 'Korean (Korea)', 'lo-LA': 'Lao (Laos)',
    'lt-LT': 'Lithuanian (Lithuania)', 'lv-LV': 'Latvian (Latvia)', 'mk-MK': 'Macedonian (North Macedonia)', 'ml-IN': 'Malayalam (India)',
    'mn-MN': 'Mongolian (Mongolia)', 'mr-IN': 'Marathi (India)', 'ms-MY': 'Malay (Malaysia)', 'mt-MT': 'Maltese (Malta)',
    'my-MM': 'Burmese (Myanmar)', 'nb-NO': 'Norwegian (Bokmål, Norway)', 'ne-NP': 'Nepali (Nepal)', 'nl-BE': 'Dutch (Belgium)',
    'nl-NL': 'Dutch (Netherlands)', 'pl-PL': 'Polish (Poland)', 'ps-AF': 'Pashto (Afghanistan)', 'pt-BR': 'Portuguese (Brazil)',
    'pt-PT': 'Portuguese (Portugal)', 'ro-RO': 'Romanian (Romania)', 'ru-RU': 'Russian (Russia)', 'si-LK': 'Sinhala (Sri Lanka)',
    'sk-SK': 'Slovak (Slovakia)', 'sl-SI': 'Slovenian (Slovenia)', 'so-SO': 'Somali (Somalia)', 'sq-AL': 'Albanian (Albania)',
    'sr-RS': 'Serbian (Serbia)', 'sv-SE': 'Swedish (Sweden)', 'sw-KE': 'Swahili (Kenya)', 'sw-TZ': 'Swahili (Tanzania)',
    'ta-IN': 'Tamil (India)', 'ta-LK': 'Tamil (Sri Lanka)', 'ta-MY': 'Tamil (Malaysia)', 'ta-SG': 'Tamil (Singapore)',
    'te-IN': 'Telugu (India)', 'th-TH': 'Thai (Thailand)', 'tr-TR': 'Turkish (Turkey)', 'uk-UA': 'Ukrainian (Ukraine)',
    'ur-IN': 'Urdu (India)', 'ur-PK': 'Urdu (Pakistan)', 'uz-UZ': 'Uzbek (Uzbekistan)', 'vi-VN': 'Vietnamese (Vietnam)',
    'wuu-CN': 'Wu Chinese (China)', 'yue-CN': 'Cantonese (China)', 'zh-CN': 'Chinese (Mandarin, Simplified)',
    'zh-HK': 'Chinese (Cantonese, Traditional)', 'zh-TW': 'Chinese (Taiwanese Mandarin)', 'zu-ZA': 'Zulu (South Africa)'
}

# Get voice list (this should be run once at startup).
# Runs at import time, before the layout is built; the result maps a locale
# code to that locale's voices and is read by the voice-option callbacks.
# NOTE(review): presumably needs network access, and an exception here would
# stop the module from importing — confirm that failing fast is intended.
voice_dict = asyncio.run(get_voice_list())

# Layout: a two-column page. The left column collects the source content and
# generation settings; the right column shows the generated script and the
# rendered podcast audio.
app.layout = dbc.Container([
    html.H1("AI Podcast Generator", className="my-4"),
    
    dbc.Row([
        # Left Column (now containing input elements)
        dbc.Col([
            dbc.Card([
                dbc.CardBody([
                    # Source material: typed/pasted text, or filled in from an uploaded file.
                    dbc.Textarea(id="content-input", placeholder="Paste your content or upload a document", rows=5, className="my-3"),
                    dcc.Upload(
                        id='document-upload',
                        children=html.Div(['Drag and Drop or ', html.A('Select a File')]),
                        style={
                            'width': '100%',
                            'height': '60px',
                            'lineHeight': '60px',
                            'borderWidth': '1px',
                            'borderStyle': 'dashed',
                            'borderRadius': '5px',
                            'textAlign': 'center',
                            'margin': '10px 0'
                        }
                    ),
                    # Target length; the selected string is passed to the script prompt as-is.
                    dbc.RadioItems(
                        id="duration",
                        options=[
                            {"label": "1-5 min", "value": "1-5 min"},
                            {"label": "5-10 min", "value": "5-10 min"},
                            {"label": "10-15 min", "value": "10-15 min"}
                        ],
                        value="1-5 min",
                        inline=True,
                        className="my-3"
                    ),
                    # Number of speakers; also toggles visibility of the second language/voice selectors.
                    dbc.RadioItems(
                        id="num-hosts",
                        options=[
                            {"label": "1 host", "value": 1},
                            {"label": "2 hosts", "value": 2}
                        ],
                        value=2,
                        inline=True,
                        className="my-3"
                    ),
                    # Language selectors use the display names from language_names as both label and value.
                    dbc.Select(
                        id="lang1-select", 
                        options=[{"label": lang, "value": lang} for lang in language_names.values()], 
                        value="English (United States)",
                        className="my-2"
                    ),
                    # Voice selectors start without options; a callback fills them from the chosen language.
                    dbc.Select(
                        id="voice1-select", 
                        value="en-US-AriaNeural",
                        className="my-2"
                    ),
                    dbc.Select(
                        id="lang2-select", 
                        options=[{"label": lang, "value": lang} for lang in language_names.values()], 
                        value="English (United States)",
                        className="my-2"
                    ),
                    dbc.Select(
                        id="voice2-select", 
                        value="en-US-BrianNeural",
                        className="my-2"
                    ),
                    dbc.Button("Generate Script", id="generate-btn", color="primary", className="mt-3"),
                ])
            ])
        ], width=7),  # Adjust the width as needed
        
        # Right Column (now containing script output and podcast generation)
        dbc.Col([
            dbc.Card([
                dbc.CardBody([
                    # Script area: editable, and its current value is what gets rendered to audio.
                    dcc.Loading(
                        id="loading-script",
                        type="default",
                        children=[
                            dbc.Progress(id="script-progress", value=0, className="my-3"),
                            dbc.Textarea(id="script-output", rows=20, className="my-3"),
                        ]
                    ),
                    dbc.Button("Generate Podcast", id="generate-podcast-btn", color="success", className="mt-3"),
                    # Audio area: receives an inline player (or an error message) from the render callback.
                    dcc.Loading(
                        id="loading-podcast",
                        type="default",
                        children=[
                            dbc.Progress(id="podcast-progress", value=0, className="my-3"),
                            html.Div(id="audio-output", className="my-3"),
                        ]
                    ),
                    # Target of the render callback's file download.
                    dcc.Download(id="download-audio")
                ])
            ])
        ], width=5),  # Adjust the width as needed
    ]),
], fluid=True)

# Callbacks
@app.callback(
    Output("voice1-select", "options"),
    Input("lang1-select", "value")
)
def update_voice1_options(lang):
    """Populate the first host's voice dropdown for the selected language.

    ``lang`` is a display name from ``language_names``. It is mapped back to
    its locale code, and the voices stored under that code in ``voice_dict``
    become the dropdown options. The literal string "None" yields no
    options, as does a selection with no matching locale or voices.
    """
    if lang == "None":
        return []

    # Reverse lookup: first locale code whose display name equals the selection.
    locale_code = None
    for code, display_name in language_names.items():
        if display_name == lang:
            locale_code = code
            break

    options = []
    for voice_name in voice_dict.get(locale_code, []):
        options.append({"label": voice_name, "value": voice_name})
    return options

@app.callback(
    Output("voice2-select", "options"),
    Input("lang2-select", "value")
)
def update_voice2_options(lang):
    """Populate the second host's voice dropdown for the selected language.

    The selected display name is translated back to a locale code via
    ``language_names``; the voices listed for that code in ``voice_dict`` are
    returned as label/value pairs. The string "None" returns an empty list.
    """
    if lang == "None":
        return []

    matching_codes = (
        code for code, display_name in language_names.items()
        if display_name == lang
    )
    locale_code = next(matching_codes, None)
    available = voice_dict.get(locale_code, [])
    return [dict(label=name, value=name) for name in available]

@app.callback(
    [Output("script-output", "value"),
     Output("script-progress", "value")],
    Input("generate-btn", "n_clicks"),
    [State("content-input", "value"),
     State("duration", "value"),
     State("num-hosts", "value")],
    prevent_initial_call=True
)
def generate_script(n_clicks, content, duration, num_hosts):
    """Generate the podcast script when "Generate Script" is clicked.

    Args:
        n_clicks: Click count of the button; ``None`` means no click yet.
        content: Text from the content box.
        duration: Selected duration label.
        num_hosts: Selected number of hosts.

    Returns:
        ``(script_text, progress)``: the script and 100 on success, or an
        ``"Error: ..."`` message and 0 when the input is empty or script
        generation fails.

    Raises:
        PreventUpdate: If the callback fires without a click.
    """
    if n_clicks is None:
        raise PreventUpdate

    # Without this guard an empty box would be sent to the model as the
    # literal text "None".
    if not content or not content.strip():
        return "Error: Please paste some content or upload a document first.", 0

    # No artificial delay here: a callback can only report progress when it
    # returns, so sleeping beforehand would just make the user wait longer.
    try:
        script = generate_podcast_script(content, duration, num_hosts)
    except Exception as e:  # callback boundary: report the failure in the UI
        logger.exception("Error generating script: %s", e)
        return f"Error: {str(e)}", 0
    return script, 100

@app.callback(
    [Output("audio-output", "children"),
     Output("download-audio", "data"),
     Output("podcast-progress", "value")],
    Input("generate-podcast-btn", "n_clicks"),
    [State("script-output", "value"),
     State("voice1-select", "value"),
     State("voice2-select", "value"),
     State("num-hosts", "value")],
    prevent_initial_call=True
)
def render_and_download_podcast(n_clicks, script, voice1, voice2, num_hosts):
    """Render the script to MP3, show a player and trigger a download.

    Args:
        n_clicks: Click count of "Generate Podcast"; ``None`` means no click.
        script: Current contents of the script text area.
        voice1: Voice selected for the first host.
        voice2: Voice selected for the second host.
        num_hosts: Selected number of hosts.

    Returns:
        ``(player, download, progress)``: an ``html.Audio`` element with the
        MP3 embedded as a base64 data URL, the download payload for
        ``podcast.mp3`` and 100 on success; an error ``html.Div``, ``None``
        and 0 when the script is empty or rendering fails.

    Raises:
        PreventUpdate: If the callback fires without a click.
    """
    if n_clicks is None:
        raise PreventUpdate

    # An empty script would otherwise fail with an opaque AttributeError, or
    # produce a one-second silent file.
    if not script or not script.strip():
        return html.Div("Error: Generate or enter a script before rendering the podcast."), None, 0

    try:
        # Run the async function in a synchronous context
        sample_rate, audio_data = asyncio.run(render_podcast(script, voice1, voice2, num_hosts))

        # Wrap the raw samples so pydub can encode them
        pcm_audio = AudioSegment(
            audio_data.tobytes(),
            frame_rate=sample_rate,
            sample_width=audio_data.dtype.itemsize,
            channels=1
        )

        # Encode to MP3 in memory
        buffer = io.BytesIO()
        pcm_audio.export(buffer, format="mp3")
        mp3_bytes = buffer.getvalue()

        # Create base64 audio for playback
        audio_base64 = base64.b64encode(mp3_bytes).decode('utf-8')
        audio_src = f"data:audio/mp3;base64,{audio_base64}"

        return html.Audio(src=audio_src, controls=True), dcc.send_bytes(mp3_bytes, "podcast.mp3"), 100
    except Exception as e:  # callback boundary: report the failure in the UI
        logger.exception("Error rendering podcast: %s", e)
        return html.Div(f"Error: {str(e)}"), None, 0

@app.callback(
    [Output("lang2-select", "style"),
     Output("voice2-select", "style")],
    Input("num-hosts", "value")
)
def update_second_voice_visibility(num_hosts):
    """Show the second host's language and voice selectors only for two hosts.

    Returns a pair of style dicts (language selector, voice selector) whose
    ``display`` is "block" when ``num_hosts`` equals 2 and "none" otherwise.
    """
    display_mode = "block" if num_hosts == 2 else "none"
    lang_style = {"display": display_mode}
    voice_style = {"display": display_mode}
    return lang_style, voice_style

@app.callback(
    Output("content-input", "value"),
    Input("document-upload", "contents"),
    State("document-upload", "filename"),
    prevent_initial_call=True
)
def update_content(contents, filename):
    """Load an uploaded document into the content text area.

    The file type is taken from the filename's extension (case-insensitive):
    ``.csv`` and ``.xls``/``.xlsx`` are parsed with pandas and rendered via
    ``DataFrame.to_string()``; ``.txt`` and ``.md`` are decoded as UTF-8.

    Args:
        contents: Upload payload as a data URL (``"<header>,<base64 data>"``).
        filename: Name of the uploaded file.

    Returns:
        The extracted text, or a human-readable message when the type is
        unsupported or the file cannot be processed.

    Raises:
        PreventUpdate: If there is no upload payload, so existing text in the
            content box is left untouched.
    """
    if contents is None:
        raise PreventUpdate

    # Dispatch on the real extension. A substring test on the whole name
    # would treat e.g. "mdreport.pdf" as markdown.
    extension = os.path.splitext(filename or '')[1].lower()
    if extension not in ('.csv', '.xls', '.xlsx', '.txt', '.md'):
        return 'Unsupported file type. Please upload a CSV, Excel, text, or markdown file.'

    try:
        # Everything after the first comma is the base64 payload.
        _, _, content_string = contents.partition(',')
        decoded = base64.b64decode(content_string)

        if extension == '.csv':
            df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
            return df.to_string()
        if extension in ('.xls', '.xlsx'):
            df = pd.read_excel(io.BytesIO(decoded))
            return df.to_string()
        # Text or markdown
        return decoded.decode('utf-8')
    except Exception as e:  # callback boundary: report the failure in the UI
        logger.exception("Error processing uploaded file: %s", e)
        return f'There was an error processing this file: {str(e)}'

# Run the app
if __name__ == '__main__':
    # The server listens on all interfaces, and debug mode enables the
    # interactive in-browser debugger, so debug is opt-in through the
    # DASH_DEBUG environment variable instead of always on.
    debug_enabled = os.environ.get('DASH_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    print("Starting the Dash application...")
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)
    print("Dash application has finished running.")