bluenevus committed on
Commit
84fed9b
·
verified ·
1 Parent(s): 2fa76ba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -184
app.py CHANGED
@@ -1,211 +1,187 @@
1
- import gradio as gr
 
 
 
 
2
  import google.generativeai as genai
3
  import numpy as np
4
  import edge_tts
5
  import asyncio
6
- import io
7
  import re
 
 
 
8
 
9
  # Set up logging
10
- import logging
11
  logging.basicConfig(level=logging.INFO)
12
  logger = logging.getLogger(__name__)
13
 
 
 
 
14
  # Initialize Gemini AI
15
  genai.configure(api_key='YOUR_GEMINI_API_KEY')
16
 
 
17
  def generate_podcast_script(api_key, content, duration, num_hosts):
18
- genai.configure(api_key=api_key)
19
- model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
20
-
21
- if num_hosts == 1:
22
- prompt = f"""
23
- Create a podcast script for one person discussing the following content:
24
- {content}
25
-
26
- The podcast should last approximately {duration}. Include natural speech patterns,
27
- humor, and occasional off-topic thoughts. Use occasional speech fillers like um, ah,
28
- yes, I see, Ok now. Vary the emotional tone.
29
- Format the script as a monologue without speaker labels.
30
- Separate each paragraph with a blank line.
31
- Do not use any special characters or markdown. Only include the monologue with proper punctuation.
32
- Ensure the content flows naturally and stays relevant to the topic.
33
- Limit the script length to match the requested duration of {duration}.
34
- """
35
- else:
36
- prompt = f"""
37
- Create a podcast script for two people discussing the following content:
38
- {content}
39
-
40
- The podcast should last approximately {duration}. Include natural speech patterns,
41
- humor, and occasional off-topic chit-chat. Use occasional speech fillers like um, ah,
42
- yes, I see, Ok now. Vary the emotional tone.
43
- Format the script as alternating lines of dialogue without speaker labels.
44
- Separate each line with a blank line.
45
- Do not use any special characters or markdown. Only include the alternating dialogue lines with proper punctuation.
46
- Ensure the conversation flows naturally and stays relevant to the topic.
47
- Limit the script length to match the requested duration of {duration}.
48
- """
49
-
50
- response = model.generate_content(prompt)
51
- # Remove any special characters that might be read aloud
52
- clean_text = re.sub(r'[^a-zA-Z0-9\s.,?!]', '', response.text)
53
- return clean_text
54
 
55
  async def text_to_speech(text, voice):
56
- communicate = edge_tts.Communicate(text, voice)
57
- audio = io.BytesIO()
58
- async for chunk in communicate.stream():
59
- if chunk["type"] == "audio":
60
- audio.write(chunk["data"])
61
- audio.seek(0)
62
- return audio.read()
63
 
64
  async def render_podcast(api_key, script, voice1, voice2, num_hosts):
65
- lines = [line for line in script.split('\n') if line.strip()]
66
- audio_segments = []
67
-
68
- if num_hosts == 1:
69
- for line in lines:
70
- audio = await text_to_speech(line, voice1)
71
- audio_segments.append(audio)
72
- else:
73
- for i, line in enumerate(lines):
74
- voice = voice1 if i % 2 == 0 else voice2
75
- audio = await text_to_speech(line, voice)
76
- audio_segments.append(audio)
77
-
78
- if not audio_segments:
79
- logger.warning("No valid audio segments were generated.")
80
- return (24000, np.zeros(24000, dtype=np.int16)) # Return silence if no valid audio was generated
81
-
82
- # Concatenate audio segments
83
- podcast_audio = b''.join(audio_segments)
84
-
85
- # Convert to numpy array
86
- podcast_audio = np.frombuffer(podcast_audio, dtype=np.int16)
87
-
88
- return (24000, podcast_audio) # edge-tts uses 24000 Hz sample rate
89
 
90
  async def get_voice_list():
91
- voices = await edge_tts.list_voices()
92
- voice_dict = {}
93
- for voice in voices:
94
- lang = voice["Locale"]
95
- if lang not in voice_dict:
96
- voice_dict[lang] = []
97
- voice_dict[lang].append(voice["Name"])
98
- return voice_dict
99
-
100
- # Language names dictionary
101
  language_names = {
102
- 'af-ZA': 'Afrikaans (South Africa)', 'am-ET': 'Amharic (Ethiopia)', 'ar-AE': 'Arabic (UAE)', 'ar-BH': 'Arabic (Bahrain)',
103
- 'ar-DZ': 'Arabic (Algeria)', 'ar-EG': 'Arabic (Egypt)', 'ar-IQ': 'Arabic (Iraq)', 'ar-JO': 'Arabic (Jordan)',
104
- 'ar-KW': 'Arabic (Kuwait)', 'ar-LB': 'Arabic (Lebanon)', 'ar-LY': 'Arabic (Libya)', 'ar-MA': 'Arabic (Morocco)',
105
- 'ar-OM': 'Arabic (Oman)', 'ar-QA': 'Arabic (Qatar)', 'ar-SA': 'Arabic (Saudi Arabia)', 'ar-SY': 'Arabic (Syria)',
106
- 'ar-TN': 'Arabic (Tunisia)', 'ar-YE': 'Arabic (Yemen)', 'az-AZ': 'Azerbaijani (Azerbaijan)', 'bg-BG': 'Bulgarian (Bulgaria)',
107
- 'bn-BD': 'Bengali (Bangladesh)', 'bn-IN': 'Bengali (India)', 'bs-BA': 'Bosnian (Bosnia and Herzegovina)', 'ca-ES': 'Catalan (Spain)',
108
- 'cs-CZ': 'Czech (Czech Republic)', 'cy-GB': 'Welsh (United Kingdom)', 'da-DK': 'Danish (Denmark)', 'de-AT': 'German (Austria)',
109
- 'de-CH': 'German (Switzerland)', 'de-DE': 'German (Germany)', 'el-GR': 'Greek (Greece)', 'en-AU': 'English (Australia)',
110
- 'en-CA': 'English (Canada)', 'en-GB': 'English (United Kingdom)', 'en-GH': 'English (Ghana)', 'en-HK': 'English (Hong Kong SAR)',
111
- 'en-IE': 'English (Ireland)', 'en-IN': 'English (India)', 'en-KE': 'English (Kenya)', 'en-NG': 'English (Nigeria)',
112
- 'en-NZ': 'English (New Zealand)', 'en-PH': 'English (Philippines)', 'en-SG': 'English (Singapore)', 'en-TZ': 'English (Tanzania)',
113
- 'en-US': 'English (United States)', 'en-ZA': 'English (South Africa)', 'es-AR': 'Spanish (Argentina)', 'es-BO': 'Spanish (Bolivia)',
114
- 'es-CL': 'Spanish (Chile)', 'es-CO': 'Spanish (Colombia)', 'es-CR': 'Spanish (Costa Rica)', 'es-CU': 'Spanish (Cuba)',
115
- 'es-DO': 'Spanish (Dominican Republic)', 'es-EC': 'Spanish (Ecuador)', 'es-ES': 'Spanish (Spain)', 'es-GQ': 'Spanish (Equatorial Guinea)',
116
- 'es-GT': 'Spanish (Guatemala)', 'es-HN': 'Spanish (Honduras)', 'es-MX': 'Spanish (Mexico)', 'es-NI': 'Spanish (Nicaragua)',
117
- 'es-PA': 'Spanish (Panama)', 'es-PE': 'Spanish (Peru)', 'es-PR': 'Spanish (Puerto Rico)', 'es-PY': 'Spanish (Paraguay)',
118
- 'es-SV': 'Spanish (El Salvador)', 'es-US': 'Spanish (United States)', 'es-UY': 'Spanish (Uruguay)', 'es-VE': 'Spanish (Venezuela)',
119
- 'et-EE': 'Estonian (Estonia)', 'eu-ES': 'Basque (Spain)', 'fa-IR': 'Persian (Iran)', 'fi-FI': 'Finnish (Finland)',
120
- 'fil-PH': 'Filipino (Philippines)', 'fr-BE': 'French (Belgium)', 'fr-CA': 'French (Canada)', 'fr-CH': 'French (Switzerland)',
121
- 'fr-FR': 'French (France)', 'ga-IE': 'Irish (Ireland)', 'gl-ES': 'Galician (Spain)', 'gu-IN': 'Gujarati (India)',
122
- 'he-IL': 'Hebrew (Israel)', 'hi-IN': 'Hindi (India)', 'hr-HR': 'Croatian (Croatia)', 'hu-HU': 'Hungarian (Hungary)',
123
- 'hy-AM': 'Armenian (Armenia)', 'id-ID': 'Indonesian (Indonesia)', 'is-IS': 'Icelandic (Iceland)', 'it-IT': 'Italian (Italy)',
124
- 'ja-JP': 'Japanese (Japan)', 'jv-ID': 'Javanese (Indonesia)', 'ka-GE': 'Georgian (Georgia)', 'kk-KZ': 'Kazakh (Kazakhstan)',
125
- 'km-KH': 'Khmer (Cambodia)', 'kn-IN': 'Kannada (India)', 'ko-KR': 'Korean (Korea)', 'lo-LA': 'Lao (Laos)',
126
- 'lt-LT': 'Lithuanian (Lithuania)', 'lv-LV': 'Latvian (Latvia)', 'mk-MK': 'Macedonian (North Macedonia)', 'ml-IN': 'Malayalam (India)',
127
- 'mn-MN': 'Mongolian (Mongolia)', 'mr-IN': 'Marathi (India)', 'ms-MY': 'Malay (Malaysia)', 'mt-MT': 'Maltese (Malta)',
128
- 'my-MM': 'Burmese (Myanmar)', 'nb-NO': 'Norwegian (Bokmål, Norway)', 'ne-NP': 'Nepali (Nepal)', 'nl-BE': 'Dutch (Belgium)',
129
- 'nl-NL': 'Dutch (Netherlands)', 'pl-PL': 'Polish (Poland)', 'ps-AF': 'Pashto (Afghanistan)', 'pt-BR': 'Portuguese (Brazil)',
130
- 'pt-PT': 'Portuguese (Portugal)', 'ro-RO': 'Romanian (Romania)', 'ru-RU': 'Russian (Russia)', 'si-LK': 'Sinhala (Sri Lanka)',
131
- 'sk-SK': 'Slovak (Slovakia)', 'sl-SI': 'Slovenian (Slovenia)', 'so-SO': 'Somali (Somalia)', 'sq-AL': 'Albanian (Albania)',
132
- 'sr-RS': 'Serbian (Serbia)', 'sv-SE': 'Swedish (Sweden)', 'sw-KE': 'Swahili (Kenya)', 'sw-TZ': 'Swahili (Tanzania)',
133
- 'ta-IN': 'Tamil (India)', 'ta-LK': 'Tamil (Sri Lanka)', 'ta-MY': 'Tamil (Malaysia)', 'ta-SG': 'Tamil (Singapore)',
134
- 'te-IN': 'Telugu (India)', 'th-TH': 'Thai (Thailand)', 'tr-TR': 'Turkish (Turkey)', 'uk-UA': 'Ukrainian (Ukraine)',
135
- 'ur-IN': 'Urdu (India)', 'ur-PK': 'Urdu (Pakistan)', 'uz-UZ': 'Uzbek (Uzbekistan)', 'vi-VN': 'Vietnamese (Vietnam)',
136
- 'wuu-CN': 'Wu Chinese (China)', 'yue-CN': 'Cantonese (China)', 'zh-CN': 'Chinese (Mandarin, Simplified)',
137
- 'zh-HK': 'Chinese (Cantonese, Traditional)', 'zh-TW': 'Chinese (Taiwanese Mandarin)', 'zu-ZA': 'Zulu (South Africa)'
138
  }
139
 
140
- # Gradio Interface
141
- with gr.Blocks() as demo:
142
- gr.Markdown("# AI Podcast Generator")
143
-
144
- api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")
145
-
146
- with gr.Row():
147
- content_input = gr.Textbox(label="Paste your content or upload a document")
148
- document_upload = gr.File(label="Upload Document")
149
-
150
- duration = gr.Radio(
151
- ["1-5 min", "5-10 min", "10-15 min"],
152
- label="Estimated podcast duration",
153
- value="1-5 min" # This sets the default value
154
- )
155
-
156
- num_hosts = gr.Radio([1, 2], label="Number of podcast hosts", value=2)
157
-
158
- voice_dict = asyncio.run(get_voice_list())
159
- languages = list(voice_dict.keys())
160
- languages.insert(0, "None") # Add "None" option for single speaker
161
-
162
- default_voice1 = "Microsoft Server Speech Text to Speech Voice (en-US, AvaNeural)"
163
- default_voice2 = "Microsoft Server Speech Text to Speech Voice (en-US, AndrewNeural)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
- with gr.Row():
166
- lang1_select = gr.Dropdown(label="Select Language 1", choices=[f"{language_names.get(lang, lang)}" for lang in languages], value="English (United States)")
167
- voice1_select = gr.Dropdown(label="Select Voice 1", choices=voice_dict.get('en-US', []), value=default_voice1)
168
-
169
- with gr.Row():
170
- lang2_select = gr.Dropdown(label="Select Language 2", choices=[f"{language_names.get(lang, lang)}" for lang in languages], value="English (United States)")
171
- voice2_select = gr.Dropdown(label="Select Voice 2", choices=voice_dict.get('en-US', []), value=default_voice2)
172
-
173
- generate_btn = gr.Button("Generate Script")
174
- script_output = gr.Textbox(label="Generated Script", lines=10)
175
-
176
- render_btn = gr.Button("Render Podcast")
177
- audio_output = gr.Audio(label="Generated Podcast")
178
-
179
- def update_voices(lang):
180
- if lang == "None":
181
- return gr.Dropdown(choices=[], value=None)
182
- selected_lang = next((key for key, value in language_names.items() if value == lang), None)
183
- voices = voice_dict.get(selected_lang, [])
184
-
185
- if lang == "English (United States)":
186
- if default_voice1 in voices:
187
- return gr.Dropdown(choices=voices, value=default_voice1)
188
- elif default_voice2 in voices:
189
- return gr.Dropdown(choices=voices, value=default_voice2)
190
-
191
- return gr.Dropdown(choices=voices, value=voices[0] if voices else None)
192
-
193
- lang1_select.change(update_voices, inputs=[lang1_select], outputs=[voice1_select])
194
- lang2_select.change(update_voices, inputs=[lang2_select], outputs=[voice2_select])
195
-
196
- def generate_script_wrapper(api_key, content, duration, num_hosts):
197
- return generate_podcast_script(api_key, content, duration, num_hosts)
198
-
199
- async def render_podcast_wrapper(api_key, script, voice1, voice2, num_hosts):
200
- return await render_podcast(api_key, script, voice1, voice2, num_hosts)
201
-
202
- generate_btn.click(generate_script_wrapper, inputs=[api_key_input, content_input, duration, num_hosts], outputs=script_output)
203
- render_btn.click(render_podcast_wrapper, inputs=[api_key_input, script_output, voice1_select, voice2_select, num_hosts], outputs=audio_output)
 
 
 
 
 
 
 
 
 
 
204
 
205
- def update_second_voice_visibility(num_hosts):
206
- return gr.update(visible=num_hosts == 2), gr.update(visible=num_hosts == 2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
- num_hosts.change(update_second_voice_visibility, inputs=[num_hosts], outputs=[lang2_select, voice2_select])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
- if __name__ == "__main__":
211
- demo.launch()
 
 
 
1
import asyncio
import base64
import io
import json
import logging
import re

import dash
import dash_bootstrap_components as dbc
import edge_tts
import google.generativeai as genai
import numpy as np
import pandas as pd
from dash import dcc, html, Input, Output, State, callback
from dash.exceptions import PreventUpdate
14
 
15
  # Set up logging
 
16
  logging.basicConfig(level=logging.INFO)
17
  logger = logging.getLogger(__name__)
18
 
19
+ # Initialize Dash app
20
+ app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
21
+
22
  # Initialize Gemini AI
23
  genai.configure(api_key='YOUR_GEMINI_API_KEY')
24
 
25
+ # Helper functions (keep the existing functions)
26
# Anything outside letters, digits, whitespace, basic punctuation, apostrophes
# and hyphens is dropped so the TTS voice never reads markdown symbols aloud.
# Apostrophes are kept on purpose: stripping them turns "we're" into "were".
_UNSPOKEN_CHARS = re.compile(r"[^\w\s.,?!'’-]|_")


def _build_podcast_prompt(content, duration, num_hosts):
    """Return the Gemini prompt: a monologue for one host, a dialogue otherwise."""
    if num_hosts == 1:
        return f"""
        Create a podcast script for one person discussing the following content:
        {content}

        The podcast should last approximately {duration}. Include natural speech patterns,
        humor, and occasional off-topic thoughts. Use occasional speech fillers like um, ah,
        yes, I see, Ok now. Vary the emotional tone.
        Format the script as a monologue without speaker labels.
        Separate each paragraph with a blank line.
        Do not use any special characters or markdown. Only include the monologue with proper punctuation.
        Ensure the content flows naturally and stays relevant to the topic.
        Limit the script length to match the requested duration of {duration}.
        """
    return f"""
        Create a podcast script for two people discussing the following content:
        {content}

        The podcast should last approximately {duration}. Include natural speech patterns,
        humor, and occasional off-topic chit-chat. Use occasional speech fillers like um, ah,
        yes, I see, Ok now. Vary the emotional tone.
        Format the script as alternating lines of dialogue without speaker labels.
        Separate each line with a blank line.
        Do not use any special characters or markdown. Only include the alternating dialogue lines with proper punctuation.
        Ensure the conversation flows naturally and stays relevant to the topic.
        Limit the script length to match the requested duration of {duration}.
        """


def generate_podcast_script(api_key, content, duration, num_hosts):
    """Ask Gemini for a podcast script about ``content`` and return it sanitised.

    Args:
        api_key: Gemini API key used for this request.
        content: Source material the hosts should discuss.
        duration: Human-readable target length (e.g. ``"1-5 min"``), embedded
            verbatim in the prompt.
        num_hosts: ``1`` requests a monologue; any other value requests a
            two-person dialogue.

    Returns:
        The generated script with characters that should not be spoken removed.
    """
    # NOTE(review): genai.configure sets process-wide state, so concurrent
    # requests with different keys may interfere -- confirm this is acceptable.
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
    response = model.generate_content(
        _build_podcast_prompt(content, duration, num_hosts)
    )
    return _UNSPOKEN_CHARS.sub('', response.text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
async def text_to_speech(text, voice):
    """Synthesise ``text`` with the given edge-tts ``voice``.

    Streams the synthesis result and keeps only the chunks whose ``"type"`` is
    ``"audio"``; other chunk types are ignored.

    Returns:
        The concatenated audio payload as ``bytes`` (empty if no audio chunk
        was received).
    """
    communicate = edge_tts.Communicate(text, voice)
    audio_chunks = [
        chunk["data"]
        async for chunk in communicate.stream()
        if chunk["type"] == "audio"
    ]
    return b"".join(audio_chunks)
 
 
 
 
 
 
31
 
32
async def render_podcast(api_key, script, voice1, voice2, num_hosts):
    """Turn a script into one audio buffer, one TTS call per non-blank line.

    Args:
        api_key: Unused here; kept so the signature matches the callers.
        script: Script text; blank lines are skipped. ``None`` is treated as
            an empty script.
        voice1: Voice for every line (one host) or for even-indexed lines.
        voice2: Voice for odd-indexed lines when ``num_hosts`` is not ``1``.
        num_hosts: ``1`` uses ``voice1`` throughout; any other value alternates.

    Returns:
        ``(24000, samples)`` where ``samples`` is an ``int16`` NumPy array.
        One second of silence is returned when nothing was synthesised.
    """
    spoken_lines = [line for line in (script or '').split('\n') if line.strip()]

    audio_segments = []
    for index, line in enumerate(spoken_lines):
        second_host_turn = num_hosts != 1 and index % 2 == 1
        voice = voice2 if second_host_turn else voice1
        audio_segments.append(await text_to_speech(line, voice))

    if not audio_segments:
        logger.warning("No valid audio segments were generated.")
        return (24000, np.zeros(24000, dtype=np.int16))

    podcast_audio = b''.join(audio_segments)

    # np.frombuffer(dtype=int16) raises ValueError on an odd byte count, so pad
    # with one zero byte rather than failing the whole render.
    # NOTE(review): edge-tts appears to stream compressed (MP3) audio by
    # default, in which case this array is a byte carrier, not PCM -- confirm.
    if len(podcast_audio) % 2:
        podcast_audio += b'\x00'

    return (24000, np.frombuffer(podcast_audio, dtype=np.int16))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
async def get_voice_list():
    """Fetch the edge-tts voices and group their names by locale.

    Returns:
        A dict mapping each voice's ``"Locale"`` value to the list of
        ``"Name"`` values of the voices in that locale, in listing order.
    """
    voices_by_locale = {}
    for voice in await edge_tts.list_voices():
        voices_by_locale.setdefault(voice["Locale"], []).append(voice["Name"])
    return voices_by_locale
37
+
38
+ # Language names dictionary (keep the existing dictionary)
 
 
 
 
 
 
 
39
# Maps an edge-tts locale code to the display name shown in the language
# dropdowns. Display names must stay unique: the callbacks translate a selected
# display name back to its locale by searching the values.
language_names = {
    'af-ZA': 'Afrikaans (South Africa)', 'am-ET': 'Amharic (Ethiopia)', 'ar-AE': 'Arabic (UAE)', 'ar-BH': 'Arabic (Bahrain)',
    'ar-DZ': 'Arabic (Algeria)', 'ar-EG': 'Arabic (Egypt)', 'ar-IQ': 'Arabic (Iraq)', 'ar-JO': 'Arabic (Jordan)',
    'ar-KW': 'Arabic (Kuwait)', 'ar-LB': 'Arabic (Lebanon)', 'ar-LY': 'Arabic (Libya)', 'ar-MA': 'Arabic (Morocco)',
    'ar-OM': 'Arabic (Oman)', 'ar-QA': 'Arabic (Qatar)', 'ar-SA': 'Arabic (Saudi Arabia)', 'ar-SY': 'Arabic (Syria)',
    'ar-TN': 'Arabic (Tunisia)', 'ar-YE': 'Arabic (Yemen)', 'az-AZ': 'Azerbaijani (Azerbaijan)', 'bg-BG': 'Bulgarian (Bulgaria)',
    'bn-BD': 'Bengali (Bangladesh)', 'bn-IN': 'Bengali (India)', 'bs-BA': 'Bosnian (Bosnia and Herzegovina)', 'ca-ES': 'Catalan (Spain)',
    'cs-CZ': 'Czech (Czech Republic)', 'cy-GB': 'Welsh (United Kingdom)', 'da-DK': 'Danish (Denmark)', 'de-AT': 'German (Austria)',
    'de-CH': 'German (Switzerland)', 'de-DE': 'German (Germany)', 'el-GR': 'Greek (Greece)', 'en-AU': 'English (Australia)',
    'en-CA': 'English (Canada)', 'en-GB': 'English (United Kingdom)', 'en-GH': 'English (Ghana)', 'en-HK': 'English (Hong Kong SAR)',
    'en-IE': 'English (Ireland)', 'en-IN': 'English (India)', 'en-KE': 'English (Kenya)', 'en-NG': 'English (Nigeria)',
    'en-NZ': 'English (New Zealand)', 'en-PH': 'English (Philippines)', 'en-SG': 'English (Singapore)', 'en-TZ': 'English (Tanzania)',
    'en-US': 'English (United States)', 'en-ZA': 'English (South Africa)', 'es-AR': 'Spanish (Argentina)', 'es-BO': 'Spanish (Bolivia)',
    'es-CL': 'Spanish (Chile)', 'es-CO': 'Spanish (Colombia)', 'es-CR': 'Spanish (Costa Rica)', 'es-CU': 'Spanish (Cuba)',
    'es-DO': 'Spanish (Dominican Republic)', 'es-EC': 'Spanish (Ecuador)', 'es-ES': 'Spanish (Spain)', 'es-GQ': 'Spanish (Equatorial Guinea)',
    'es-GT': 'Spanish (Guatemala)', 'es-HN': 'Spanish (Honduras)', 'es-MX': 'Spanish (Mexico)', 'es-NI': 'Spanish (Nicaragua)',
    'es-PA': 'Spanish (Panama)', 'es-PE': 'Spanish (Peru)', 'es-PR': 'Spanish (Puerto Rico)', 'es-PY': 'Spanish (Paraguay)',
    'es-SV': 'Spanish (El Salvador)', 'es-US': 'Spanish (United States)', 'es-UY': 'Spanish (Uruguay)', 'es-VE': 'Spanish (Venezuela)',
    'et-EE': 'Estonian (Estonia)', 'eu-ES': 'Basque (Spain)', 'fa-IR': 'Persian (Iran)', 'fi-FI': 'Finnish (Finland)',
    'fil-PH': 'Filipino (Philippines)', 'fr-BE': 'French (Belgium)', 'fr-CA': 'French (Canada)', 'fr-CH': 'French (Switzerland)',
    'fr-FR': 'French (France)', 'ga-IE': 'Irish (Ireland)', 'gl-ES': 'Galician (Spain)', 'gu-IN': 'Gujarati (India)',
    'he-IL': 'Hebrew (Israel)', 'hi-IN': 'Hindi (India)', 'hr-HR': 'Croatian (Croatia)', 'hu-HU': 'Hungarian (Hungary)',
    'hy-AM': 'Armenian (Armenia)', 'id-ID': 'Indonesian (Indonesia)', 'is-IS': 'Icelandic (Iceland)', 'it-IT': 'Italian (Italy)',
    'ja-JP': 'Japanese (Japan)', 'jv-ID': 'Javanese (Indonesia)', 'ka-GE': 'Georgian (Georgia)', 'kk-KZ': 'Kazakh (Kazakhstan)',
    'km-KH': 'Khmer (Cambodia)', 'kn-IN': 'Kannada (India)', 'ko-KR': 'Korean (Korea)', 'lo-LA': 'Lao (Laos)',
    'lt-LT': 'Lithuanian (Lithuania)', 'lv-LV': 'Latvian (Latvia)', 'mk-MK': 'Macedonian (North Macedonia)', 'ml-IN': 'Malayalam (India)',
    'mn-MN': 'Mongolian (Mongolia)', 'mr-IN': 'Marathi (India)', 'ms-MY': 'Malay (Malaysia)', 'mt-MT': 'Maltese (Malta)',
    'my-MM': 'Burmese (Myanmar)', 'nb-NO': 'Norwegian (Bokmål, Norway)', 'ne-NP': 'Nepali (Nepal)', 'nl-BE': 'Dutch (Belgium)',
    'nl-NL': 'Dutch (Netherlands)', 'pl-PL': 'Polish (Poland)', 'ps-AF': 'Pashto (Afghanistan)', 'pt-BR': 'Portuguese (Brazil)',
    'pt-PT': 'Portuguese (Portugal)', 'ro-RO': 'Romanian (Romania)', 'ru-RU': 'Russian (Russia)', 'si-LK': 'Sinhala (Sri Lanka)',
    'sk-SK': 'Slovak (Slovakia)', 'sl-SI': 'Slovenian (Slovenia)', 'so-SO': 'Somali (Somalia)', 'sq-AL': 'Albanian (Albania)',
    'sr-RS': 'Serbian (Serbia)', 'sv-SE': 'Swedish (Sweden)', 'sw-KE': 'Swahili (Kenya)', 'sw-TZ': 'Swahili (Tanzania)',
    'ta-IN': 'Tamil (India)', 'ta-LK': 'Tamil (Sri Lanka)', 'ta-MY': 'Tamil (Malaysia)', 'ta-SG': 'Tamil (Singapore)',
    'te-IN': 'Telugu (India)', 'th-TH': 'Thai (Thailand)', 'tr-TR': 'Turkish (Turkey)', 'uk-UA': 'Ukrainian (Ukraine)',
    'ur-IN': 'Urdu (India)', 'ur-PK': 'Urdu (Pakistan)', 'uz-UZ': 'Uzbek (Uzbekistan)', 'vi-VN': 'Vietnamese (Vietnam)',
    'wuu-CN': 'Wu Chinese (China)', 'yue-CN': 'Cantonese (China)', 'zh-CN': 'Chinese (Mandarin, Simplified)',
    'zh-HK': 'Chinese (Cantonese, Traditional)', 'zh-TW': 'Chinese (Taiwanese Mandarin)', 'zu-ZA': 'Zulu (South Africa)',
}
42
 
43
+ # Layout
44
def _language_options():
    """Return one dropdown option per display name in ``language_names``."""
    return [{'label': name, 'value': name} for name in language_names.values()]


# Dashed drop-zone styling for the document upload area.
_UPLOAD_STYLE = {
    'width': '100%',
    'height': '60px',
    'lineHeight': '60px',
    'borderWidth': '1px',
    'borderStyle': 'dashed',
    'borderRadius': '5px',
    'textAlign': 'center',
    'margin': '10px 0',
}

_DURATION_CHOICES = ('1-5 min', '5-10 min', '10-15 min')

# Controls in display order: credentials, content, options, voices, actions.
_form_controls = [
    dbc.Input(
        id="api-key-input",
        type="password",
        placeholder="Enter your Gemini API Key",
    ),
    dbc.Textarea(
        id="content-input",
        placeholder="Paste your content or upload a document",
        rows=5,
    ),
    dcc.Upload(
        id='document-upload',
        children=html.Div(['Drag and Drop or ', html.A('Select a File')]),
        style=_UPLOAD_STYLE,
    ),
    dcc.RadioItems(
        id="duration",
        options=[{'label': choice, 'value': choice} for choice in _DURATION_CHOICES],
        value='1-5 min',
        inline=True,
    ),
    dcc.RadioItems(
        id="num-hosts",
        options=[
            {'label': '1 host', 'value': 1},
            {'label': '2 hosts', 'value': 2},
        ],
        value=2,
        inline=True,
    ),
    dcc.Dropdown(
        id="lang1-select",
        options=_language_options(),
        value="English (United States)",
    ),
    dcc.Dropdown(id="voice1-select"),
    # Wrapped in its own container so a callback can hide both pickers at once.
    html.Div(
        [
            dcc.Dropdown(
                id="lang2-select",
                options=_language_options(),
                value="English (United States)",
            ),
            dcc.Dropdown(id="voice2-select"),
        ],
        id="second-voice-container",
    ),
    dbc.Button("Generate Script", id="generate-btn", color="primary", className="mt-3"),
    dbc.Textarea(id="script-output", rows=10, className="mt-3"),
    dbc.Button("Render Podcast", id="render-btn", color="success", className="mt-3"),
    html.Div(id="audio-output"),
]

# Page layout: a title followed by a single card holding every control.
app.layout = dbc.Container(
    [
        html.H1("AI Podcast Generator", className="my-4"),
        dbc.Card([dbc.CardBody(_form_controls)], className="mt-4"),
    ],
    fluid=True,
)
97
 
98
+ # Callbacks
99
# Voice list grouped by locale; fetched on first use and reused afterwards so
# changing a language dropdown does not trigger a new network request each time.
_voices_by_locale = None


def _voice_options_for(language_label):
    """Return dropdown options for the voices of the language with this display name.

    An unknown or cleared selection yields an empty option list.
    """
    global _voices_by_locale
    locale = next(
        (code for code, label in language_names.items() if label == language_label),
        None,
    )
    if locale is None:
        return []
    if _voices_by_locale is None:
        _voices_by_locale = asyncio.run(get_voice_list())
    return [
        {'label': voice, 'value': voice}
        for voice in _voices_by_locale.get(locale, [])
    ]


@app.callback(
    Output("voice1-select", "options"),
    Input("lang1-select", "value"),
)
def update_voice1_options(lang):
    """Refresh the first voice dropdown when the first language changes."""
    return _voice_options_for(lang)


@app.callback(
    Output("voice2-select", "options"),
    Input("lang2-select", "value"),
)
def update_voice2_options(lang):
    """Refresh the second voice dropdown when the second language changes."""
    return _voice_options_for(lang)
116
+
117
@app.callback(
    Output("second-voice-container", "style"),
    Input("num-hosts", "value"),
)
def update_second_voice_visibility(num_hosts):
    """Show the second language/voice pickers only when two hosts are selected."""
    if num_hosts == 2:
        return {'display': 'block'}
    return {'display': 'none'}
123
+
124
@app.callback(
    Output("content-input", "value"),
    Input("document-upload", "contents"),
    State("document-upload", "filename"),
)
def update_content(contents, filename):
    """Fill the content box with the text of an uploaded document.

    Args:
        contents: Upload payload of the form ``"<header>,<base64 data>"``, or
            ``None`` when nothing has been uploaded.
        filename: Name of the uploaded file; its extension selects the parser.

    Returns:
        The extracted text, ``''`` when there is no upload, or a short message
        when the file type is unsupported or parsing fails.
    """
    if contents is None:
        return ''

    # Match on the real extension: a substring test would treat a file such as
    # "csv_notes.txt" as CSV.
    name = (filename or '').lower()
    try:
        _header, separator, payload = contents.partition(',')
        if not separator:
            raise ValueError('malformed upload data')
        decoded = base64.b64decode(payload)

        if name.endswith('.csv'):
            return pd.read_csv(io.StringIO(decoded.decode('utf-8'))).to_string()
        if name.endswith(('.xls', '.xlsx')):
            return pd.read_excel(io.BytesIO(decoded)).to_string()
        if name.endswith('.txt'):
            return decoded.decode('utf-8')
        return 'Unsupported file type'
    except Exception as e:
        # Callback boundary: report the problem in the UI, keep the traceback in the log.
        logger.exception("Failed to process uploaded file %r", filename)
        return f'Error processing file: {str(e)}'
147
 
148
+ @app.callback(
149
+ Output("script-output", "value"),
150
+ Input("generate-btn", "n_clicks"),
151
+ State("api-key-input", "value"),
152
+ State("content-input", "value"),
153
+ State("duration", "value"),
154
+ State("num-hosts", "value")
155
+ )
156
+ def generate_script(n_clicks, api_key, content, duration, num_hosts):
157
+ if n_clicks is None:
158
+ raise PreventUpdate
159
+ try:
160
+ script = generate_podcast_script(api_key, content, duration, num_hosts)
161
+ return script
162
+ except Exception as e:
163
+ return f"Error generating script: {str(e)}"
164
 
165
+ @app.callback(
166
+ Output("audio-output", "children"),
167
+ Input("render-btn", "n_clicks"),
168
+ State("api-key-input", "value"),
169
+ State("script-output", "value"),
170
+ State("voice1-select", "value"),
171
+ State("voice2-select", "value"),
172
+ State("num-hosts", "value")
173
+ )
174
+ def render_podcast_audio(n_clicks, api_key, script, voice1, voice2, num_hosts):
175
+ if n_clicks is None:
176
+ raise PreventUpdate
177
+ try:
178
+ sample_rate, audio_data = asyncio.run(render_podcast(api_key, script, voice1, voice2, num_hosts))
179
+ audio_base64 = base64.b64encode(audio_data.tobytes()).decode('utf-8')
180
+ return html.Audio(src=f"data:audio/wav;base64,{audio_base64}", controls=True)
181
+ except Exception as e:
182
+ return html.Div(f"Error rendering podcast: {str(e)}", style={'color': 'red'})
183
 
184
if __name__ == '__main__':
    import os

    # The server listens on every interface, and debug mode exposes an
    # interactive debugger to anyone who can reach it. Debug is therefore
    # opt-in: set DASH_DEBUG=1 (or "true"/"yes") for local development.
    debug_enabled = os.environ.get("DASH_DEBUG", "").strip().lower() in {"1", "true", "yes"}

    print("Starting the Dash application...")
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)
    print("Dash application has finished running.")