"""Dash web app that turns pasted or uploaded text into an AI-generated podcast.

Workflow:
1. The user pastes content (or uploads a CSV / Excel / text / markdown file).
2. Gemini writes a one- or two-host podcast script for that content.
3. edge-tts voices each line of the script and the result is offered as MP3.
"""

import asyncio
import base64
import io
import logging
import os
import re
import time

import dash
import dash_bootstrap_components as dbc
import edge_tts
import google.generativeai as genai
import numpy as np
import pandas as pd
from dash import dcc, html, Input, Output, State, callback
from dash.exceptions import PreventUpdate
from pydub import AudioSegment

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Initialize Gemini AI with environment variable
genai.configure(api_key=os.environ.get('GEMINI_API_KEY'))

# Sample rate (Hz) used for every rendered podcast.
SAMPLE_RATE = 24000

# Characters that should never reach the TTS engine. Letters of any alphabet,
# digits, whitespace, basic punctuation, apostrophes and hyphens are kept so
# that contractions ("we'll") and non-English scripts survive the clean-up.
_UNSPEAKABLE_CHARS = re.compile(r"[^\w\s.,?!'’-]|_")


def generate_podcast_script(content, duration, num_hosts):
    """Ask Gemini for a podcast script about ``content``.

    Args:
        content: Source text the podcast should discuss.
        duration: Human-readable target length, e.g. ``"1-5 min"``.
        num_hosts: ``1`` for a monologue, anything else for a two-host dialogue.

    Returns:
        The generated script with markdown and other special characters removed.
    """
    model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')

    if num_hosts == 1:
        prompt = f"""
        Create a podcast script for one person discussing the following content:
        {content}

        The podcast should last approximately {duration}. Include natural speech patterns,
        humor, and occasional off-topic thoughts. Use occasional speech fillers like um, ah,
        yes, I see, Ok now. Vary the emotional tone.

        Format the script as a monologue without speaker labels.
        Separate each paragraph with a blank line.
        Do not use any special characters or markdown. Only include the monologue with proper punctuation.

        Ensure the content flows naturally and stays relevant to the topic.
        Limit the script length to match the requested duration of {duration}.
        """
    else:
        prompt = f"""
        Create a podcast script for two people discussing the following content:
        {content}

        The podcast should last approximately {duration}. Include natural speech patterns,
        humor, and occasional off-topic chit-chat. Use occasional speech fillers like um, ah,
        yes, I see, Ok now. Vary the emotional tone.

        Format the script as alternating lines of dialogue without speaker labels.
        Separate each line with a blank line.
        Do not use any special characters or markdown. Only include the alternating dialogue lines with proper punctuation.

        Ensure the conversation flows naturally and stays relevant to the topic.
        Limit the script length to match the requested duration of {duration}.
        """

    response = model.generate_content(prompt)

    # Remove any special characters that might be read aloud
    return _UNSPEAKABLE_CHARS.sub('', response.text)


async def text_to_speech(text, voice):
    """Synthesize ``text`` with the given edge-tts ``voice``.

    Returns:
        The encoded audio bytes streamed back by edge-tts (MP3 by default).
    """
    communicate = edge_tts.Communicate(text, voice)
    audio_chunks = []
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_chunks.append(chunk["data"])
    return b"".join(audio_chunks)


def _decode_clip(encoded_audio):
    """Decode one edge-tts MP3 clip into mono 16-bit PCM at ``SAMPLE_RATE``."""
    clip = AudioSegment.from_file(io.BytesIO(encoded_audio), format="mp3")
    return clip.set_frame_rate(SAMPLE_RATE).set_channels(1).set_sample_width(2)


async def render_podcast(script, voice1, voice2, num_hosts):
    """Voice every non-blank line of ``script`` and join the clips.

    With one host every line uses ``voice1``; otherwise lines alternate between
    ``voice1`` and ``voice2``.

    Returns:
        ``(sample_rate, samples)`` where ``samples`` is a mono ``int16`` array.
        One second of silence is returned when nothing could be rendered.
    """
    lines = [line for line in (script or '').split('\n') if line.strip()]

    clips = []
    for index, line in enumerate(lines):
        use_first_voice = num_hosts == 1 or index % 2 == 0
        encoded = await text_to_speech(line, voice1 if use_first_voice else voice2)
        if encoded:
            # edge-tts delivers compressed MP3 data, so each clip has to be
            # decoded before its samples can be concatenated.
            clips.append(_decode_clip(encoded))

    if not clips:
        logger.warning("No valid audio segments were generated.")
        # Return silence if no valid audio was generated
        return (SAMPLE_RATE, np.zeros(SAMPLE_RATE, dtype=np.int16))

    # Concatenate audio segments
    podcast = clips[0]
    for clip in clips[1:]:
        podcast += clip

    # Convert to numpy array
    samples = np.array(podcast.get_array_of_samples(), dtype=np.int16)
    return (SAMPLE_RATE, samples)


async def get_voice_list():
    """Fetch the edge-tts voice catalogue grouped by locale.

    Returns:
        Dict mapping a locale code (e.g. ``"en-US"``) to the list of voice short
        names (e.g. ``"en-US-AriaNeural"``) available for it. Short names are
        used because that is the format of the default voices in the layout.
    """
    voices = await edge_tts.list_voices()
    voices_by_locale = {}
    for voice in voices:
        voices_by_locale.setdefault(voice["Locale"], []).append(voice["ShortName"])
    return voices_by_locale


# Language names dictionary
language_names = {
    'af-ZA': 'Afrikaans (South Africa)',
    'am-ET': 'Amharic (Ethiopia)',
    'ar-AE': 'Arabic (UAE)',
    'ar-BH': 'Arabic (Bahrain)',
    'ar-DZ': 'Arabic (Algeria)',
    'ar-EG': 'Arabic (Egypt)',
    'ar-IQ': 'Arabic (Iraq)',
    'ar-JO': 'Arabic (Jordan)',
    'ar-KW': 'Arabic (Kuwait)',
    'ar-LB': 'Arabic (Lebanon)',
    'ar-LY': 'Arabic (Libya)',
    'ar-MA': 'Arabic (Morocco)',
    'ar-OM': 'Arabic (Oman)',
    'ar-QA': 'Arabic (Qatar)',
    'ar-SA': 'Arabic (Saudi Arabia)',
    'ar-SY': 'Arabic (Syria)',
    'ar-TN': 'Arabic (Tunisia)',
    'ar-YE': 'Arabic (Yemen)',
    'az-AZ': 'Azerbaijani (Azerbaijan)',
    'bg-BG': 'Bulgarian (Bulgaria)',
    'bn-BD': 'Bengali (Bangladesh)',
    'bn-IN': 'Bengali (India)',
    'bs-BA': 'Bosnian (Bosnia and Herzegovina)',
    'ca-ES': 'Catalan (Spain)',
    'cs-CZ': 'Czech (Czech Republic)',
    'cy-GB': 'Welsh (United Kingdom)',
    'da-DK': 'Danish (Denmark)',
    'de-AT': 'German (Austria)',
    'de-CH': 'German (Switzerland)',
    'de-DE': 'German (Germany)',
    'el-GR': 'Greek (Greece)',
    'en-AU': 'English (Australia)',
    'en-CA': 'English (Canada)',
    'en-GB': 'English (United Kingdom)',
    'en-GH': 'English (Ghana)',
    'en-HK': 'English (Hong Kong SAR)',
    'en-IE': 'English (Ireland)',
    'en-IN': 'English (India)',
    'en-KE': 'English (Kenya)',
    'en-NG': 'English (Nigeria)',
    'en-NZ': 'English (New Zealand)',
    'en-PH': 'English (Philippines)',
    'en-SG': 'English (Singapore)',
    'en-TZ': 'English (Tanzania)',
    'en-US': 'English (United States)',
    'en-ZA': 'English (South Africa)',
    'es-AR': 'Spanish (Argentina)',
    'es-BO': 'Spanish (Bolivia)',
    'es-CL': 'Spanish (Chile)',
    'es-CO': 'Spanish (Colombia)',
    'es-CR': 'Spanish (Costa Rica)',
    'es-CU': 'Spanish (Cuba)',
    'es-DO': 'Spanish (Dominican Republic)',
    'es-EC': 'Spanish (Ecuador)',
    'es-ES': 'Spanish (Spain)',
    'es-GQ': 'Spanish (Equatorial Guinea)',
    'es-GT': 'Spanish (Guatemala)',
    'es-HN': 'Spanish (Honduras)',
    'es-MX': 'Spanish (Mexico)',
    'es-NI': 'Spanish (Nicaragua)',
    'es-PA': 'Spanish (Panama)',
    'es-PE': 'Spanish (Peru)',
    'es-PR': 'Spanish (Puerto Rico)',
    'es-PY': 'Spanish (Paraguay)',
    'es-SV': 'Spanish (El Salvador)',
    'es-US': 'Spanish (United States)',
    'es-UY': 'Spanish (Uruguay)',
    'es-VE': 'Spanish (Venezuela)',
    'et-EE': 'Estonian (Estonia)',
    'eu-ES': 'Basque (Spain)',
    'fa-IR': 'Persian (Iran)',
    'fi-FI': 'Finnish (Finland)',
    'fil-PH': 'Filipino (Philippines)',
    'fr-BE': 'French (Belgium)',
    'fr-CA': 'French (Canada)',
    'fr-CH': 'French (Switzerland)',
    'fr-FR': 'French (France)',
    'ga-IE': 'Irish (Ireland)',
    'gl-ES': 'Galician (Spain)',
    'gu-IN': 'Gujarati (India)',
    'he-IL': 'Hebrew (Israel)',
    'hi-IN': 'Hindi (India)',
    'hr-HR': 'Croatian (Croatia)',
    'hu-HU': 'Hungarian (Hungary)',
    'hy-AM': 'Armenian (Armenia)',
    'id-ID': 'Indonesian (Indonesia)',
    'is-IS': 'Icelandic (Iceland)',
    'it-IT': 'Italian (Italy)',
    'ja-JP': 'Japanese (Japan)',
    'jv-ID': 'Javanese (Indonesia)',
    'ka-GE': 'Georgian (Georgia)',
    'kk-KZ': 'Kazakh (Kazakhstan)',
    'km-KH': 'Khmer (Cambodia)',
    'kn-IN': 'Kannada (India)',
    'ko-KR': 'Korean (Korea)',
    'lo-LA': 'Lao (Laos)',
    'lt-LT': 'Lithuanian (Lithuania)',
    'lv-LV': 'Latvian (Latvia)',
    'mk-MK': 'Macedonian (North Macedonia)',
    'ml-IN': 'Malayalam (India)',
    'mn-MN': 'Mongolian (Mongolia)',
    'mr-IN': 'Marathi (India)',
    'ms-MY': 'Malay (Malaysia)',
    'mt-MT': 'Maltese (Malta)',
    'my-MM': 'Burmese (Myanmar)',
    'nb-NO': 'Norwegian (Bokmål, Norway)',
    'ne-NP': 'Nepali (Nepal)',
    'nl-BE': 'Dutch (Belgium)',
    'nl-NL': 'Dutch (Netherlands)',
    'pl-PL': 'Polish (Poland)',
    'ps-AF': 'Pashto (Afghanistan)',
    'pt-BR': 'Portuguese (Brazil)',
    'pt-PT': 'Portuguese (Portugal)',
    'ro-RO': 'Romanian (Romania)',
    'ru-RU': 'Russian (Russia)',
    'si-LK': 'Sinhala (Sri Lanka)',
    'sk-SK': 'Slovak (Slovakia)',
    'sl-SI': 'Slovenian (Slovenia)',
    'so-SO': 'Somali (Somalia)',
    'sq-AL': 'Albanian (Albania)',
    'sr-RS': 'Serbian (Serbia)',
    'sv-SE': 'Swedish (Sweden)',
    'sw-KE': 'Swahili (Kenya)',
    'sw-TZ': 'Swahili (Tanzania)',
    'ta-IN': 'Tamil (India)',
    'ta-LK': 'Tamil (Sri Lanka)',
    'ta-MY': 'Tamil (Malaysia)',
    'ta-SG': 'Tamil (Singapore)',
    'te-IN': 'Telugu (India)',
    'th-TH': 'Thai (Thailand)',
    'tr-TR': 'Turkish (Turkey)',
    'uk-UA': 'Ukrainian (Ukraine)',
    'ur-IN': 'Urdu (India)',
    'ur-PK': 'Urdu (Pakistan)',
    'uz-UZ': 'Uzbek (Uzbekistan)',
    'vi-VN': 'Vietnamese (Vietnam)',
    'wuu-CN': 'Wu Chinese (China)',
    'yue-CN': 'Cantonese (China)',
    'zh-CN': 'Chinese (Mandarin, Simplified)',
    'zh-HK': 'Chinese (Cantonese, Traditional)',
    'zh-TW': 'Chinese (Taiwanese Mandarin)',
    'zu-ZA': 'Zulu (South Africa)',
}

# Reverse lookup: display name shown in the language dropdowns -> locale code.
_LOCALE_BY_LANGUAGE = {name: locale for locale, name in language_names.items()}

# Dropdown entries shared by both language selectors.
_LANGUAGE_OPTIONS = [{"label": name, "value": name} for name in language_names.values()]

# Get voice list (this should be run once at startup)
voice_dict = asyncio.run(get_voice_list())

# Layout
app.layout = dbc.Container([
    html.H1("AI Podcast Generator", className="my-4"),
    dbc.Row([
        # Left Column (now containing input elements)
        dbc.Col([
            dbc.Card([
                dbc.CardBody([
                    dbc.Textarea(
                        id="content-input",
                        placeholder="Paste your content or upload a document",
                        rows=5,
                        className="my-3"
                    ),
                    dcc.Upload(
                        id='document-upload',
                        children=html.Div(['Drag and Drop or ', html.A('Select a File')]),
                        style={
                            'width': '100%',
                            'height': '60px',
                            'lineHeight': '60px',
                            'borderWidth': '1px',
                            'borderStyle': 'dashed',
                            'borderRadius': '5px',
                            'textAlign': 'center',
                            'margin': '10px 0'
                        }
                    ),
                    dbc.RadioItems(
                        id="duration",
                        options=[
                            {"label": "1-5 min", "value": "1-5 min"},
                            {"label": "5-10 min", "value": "5-10 min"},
                            {"label": "10-15 min", "value": "10-15 min"}
                        ],
                        value="1-5 min",
                        inline=True,
                        className="my-3"
                    ),
                    dbc.RadioItems(
                        id="num-hosts",
                        options=[
                            {"label": "1 host", "value": 1},
                            {"label": "2 hosts", "value": 2}
                        ],
                        value=2,
                        inline=True,
                        className="my-3"
                    ),
                    dbc.Select(
                        id="lang1-select",
                        options=_LANGUAGE_OPTIONS,
                        value="English (United States)",
                        className="my-2"
                    ),
                    dbc.Select(
                        id="voice1-select",
                        value="en-US-AriaNeural",
                        className="my-2"
                    ),
                    dbc.Select(
                        id="lang2-select",
                        options=_LANGUAGE_OPTIONS,
                        value="English (United States)",
                        className="my-2"
                    ),
                    dbc.Select(
                        id="voice2-select",
                        value="en-US-BrianNeural",
                        className="my-2"
                    ),
                    dbc.Button("Generate Script", id="generate-btn", color="primary", className="mt-3"),
                ])
            ])
        ], width=7),  # Adjust the width as needed
        # Right Column (now containing script output and podcast generation)
        dbc.Col([
            dbc.Card([
                dbc.CardBody([
                    dcc.Loading(
                        id="loading-script",
                        type="default",
                        children=[
                            dbc.Progress(id="script-progress", value=0, className="my-3"),
                            dbc.Textarea(id="script-output", rows=20, className="my-3"),
                        ]
                    ),
                    dbc.Button("Generate Podcast", id="generate-podcast-btn", color="success", className="mt-3"),
                    dcc.Loading(
                        id="loading-podcast",
                        type="default",
                        children=[
                            dbc.Progress(id="podcast-progress", value=0, className="my-3"),
                            html.Div(id="audio-output", className="my-3"),
                        ]
                    ),
                    dcc.Download(id="download-audio")
                ])
            ])
        ], width=5),  # Adjust the width as needed
    ]),
], fluid=True)


def _voice_choices(lang, current_voice):
    """Build the voice dropdown state for the language display name ``lang``.

    Returns:
        ``(options, value)``: the dropdown options for the language, and the
        voice to select. ``current_voice`` is kept when it belongs to the
        language; otherwise the first available voice (or ``None``) is chosen so
        a stale voice from another language is never sent to the TTS engine.
    """
    voices = voice_dict.get(_LOCALE_BY_LANGUAGE.get(lang), [])
    options = [{"label": v, "value": v} for v in voices]
    if current_voice in voices:
        return options, current_voice
    return options, (voices[0] if voices else None)


# Callbacks
@app.callback(
    [Output("voice1-select", "options"),
     Output("voice1-select", "value")],
    Input("lang1-select", "value"),
    State("voice1-select", "value")
)
def update_voice1_options(lang, current_voice=None):
    """Refresh the first host's voice dropdown when its language changes."""
    return _voice_choices(lang, current_voice)


@app.callback(
    [Output("voice2-select", "options"),
     Output("voice2-select", "value")],
    Input("lang2-select", "value"),
    State("voice2-select", "value")
)
def update_voice2_options(lang, current_voice=None):
    """Refresh the second host's voice dropdown when its language changes."""
    return _voice_choices(lang, current_voice)


@app.callback(
    [Output("script-output", "value"),
     Output("script-progress", "value")],
    Input("generate-btn", "n_clicks"),
    [State("content-input", "value"),
     State("duration", "value"),
     State("num-hosts", "value")],
    prevent_initial_call=True
)
def generate_script(n_clicks, content, duration, num_hosts):
    """Generate the podcast script for the current inputs.

    Returns:
        ``(script_text, progress)``; on failure the text is an error message and
        the progress is ``0``.
    """
    if n_clicks is None:
        raise PreventUpdate

    # Without this guard the literal text "None" would be sent to the model.
    if not content or not content.strip():
        return "Error: Please paste some content or upload a document first.", 0

    try:
        script = generate_podcast_script(content, duration, num_hosts)
    except Exception as e:
        logger.exception("Error generating script: %s", e)
        return f"Error: {str(e)}", 0
    return script, 100


@app.callback(
    [Output("audio-output", "children"),
     Output("download-audio", "data"),
     Output("podcast-progress", "value")],
    Input("generate-podcast-btn", "n_clicks"),
    [State("script-output", "value"),
     State("voice1-select", "value"),
     State("voice2-select", "value"),
     State("num-hosts", "value")],
    prevent_initial_call=True
)
def render_and_download_podcast(n_clicks, script, voice1, voice2, num_hosts):
    """Render the script to MP3, show an audio player and trigger a download.

    Returns:
        ``(player_or_error, download_payload, progress)``.
    """
    if n_clicks is None:
        raise PreventUpdate

    if not script or not script.strip():
        return html.Div("Error: Please generate or enter a script first."), None, 0

    try:
        # Run the async function in a synchronous context
        sample_rate, audio_data = asyncio.run(render_podcast(script, voice1, voice2, num_hosts))

        # Wrap the raw PCM samples so pydub can encode them
        pcm_audio = AudioSegment(
            audio_data.tobytes(),
            frame_rate=sample_rate,
            sample_width=audio_data.dtype.itemsize,
            channels=1
        )

        # Encode to MP3
        buffer = io.BytesIO()
        pcm_audio.export(buffer, format="mp3")
        mp3_bytes = buffer.getvalue()

        # Create base64 audio for playback
        audio_base64 = base64.b64encode(mp3_bytes).decode('utf-8')
        audio_src = f"data:audio/mp3;base64,{audio_base64}"

        return html.Audio(src=audio_src, controls=True), dcc.send_bytes(mp3_bytes, "podcast.mp3"), 100
    except Exception as e:
        logger.exception("Error rendering podcast: %s", e)
        return html.Div(f"Error: {str(e)}"), None, 0


@app.callback(
    [Output("lang2-select", "style"),
     Output("voice2-select", "style")],
    Input("num-hosts", "value")
)
def update_second_voice_visibility(num_hosts):
    """Show the second host's selectors only when two hosts are requested."""
    display = "block" if num_hosts == 2 else "none"
    return {"display": display}, {"display": display}


@app.callback(
    Output("content-input", "value"),
    Input("document-upload", "contents"),
    State("document-upload", "filename"),
    prevent_initial_call=True
)
def update_content(contents, filename):
    """Load an uploaded document into the content textarea.

    Args:
        contents: Data URL from ``dcc.Upload`` (``"<header>,<base64 payload>"``).
        filename: Name of the uploaded file; its extension selects the parser.

    Returns:
        The extracted text, or a user-facing message when the file type is
        unsupported or the file cannot be parsed.
    """
    if contents is None:
        # Nothing was uploaded: leave whatever the user typed untouched.
        raise PreventUpdate

    _, _, content_string = contents.partition(',')
    # Match on the real extension; a substring test would also accept names
    # such as "cmd.exe" as markdown.
    extension = os.path.splitext(filename or '')[1].lower()

    try:
        decoded = base64.b64decode(content_string)
        if extension == '.csv':
            df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
            return df.to_string()
        if extension.startswith('.xls'):
            df = pd.read_excel(io.BytesIO(decoded))
            return df.to_string()
        if extension in ('.txt', '.md', '.markdown'):
            return decoded.decode('utf-8')
        return 'Unsupported file type. Please upload a CSV, Excel, text, or markdown file.'
    except Exception as e:
        logger.exception("Error processing uploaded file: %s", e)
        return f'There was an error processing this file: {str(e)}'


# Run the app
if __name__ == '__main__':
    print("Starting the Dash application...")
    # The server listens on every interface, so the interactive debugger must
    # be opted into explicitly (set DASH_DEBUG=1) instead of always being on.
    debug_enabled = os.environ.get('DASH_DEBUG', '').strip().lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)
    print("Dash application has finished running.")