bluenevus committed on
Commit
b8d465b
·
verified ·
1 Parent(s): 1e7b36b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -44
app.py CHANGED
@@ -1,52 +1,165 @@
1
  import gradio as gr
2
- from gtts import gTTS
 
 
 
3
  import io
4
- import os
5
- from pydub import AudioSegment
6
 
7
def text_to_speech(text):
    """Convert multi-line text into a single MP3 stream using gTTS.

    Each non-blank line is synthesized separately and the clips are
    concatenated in order. Consecutive spoken lines alternate between two
    frame-rate settings (44100 Hz, then 22050 Hz).

    Args:
        text: Input text. It is split on newlines; blank or
            whitespace-only lines are skipped.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 holding the exported MP3.
        When ``text`` contains no spoken lines the buffer is empty.
    """
    # Number only the lines that are actually spoken, so blank separator
    # lines do not disturb the even/odd speaker alternation.
    spoken_lines = [line for line in text.split('\n') if line.strip()]

    audio_segments = []
    for i, line in enumerate(spoken_lines):
        tts = gTTS(text=line, lang='en', slow=False)

        # Round-trip through a temporary file; always remove it, even when
        # synthesis or decoding fails.
        temp_filename = f"temp_{i}.mp3"
        try:
            tts.save(temp_filename)
            audio_segment = AudioSegment.from_mp3(temp_filename)
        finally:
            if os.path.exists(temp_filename):
                os.remove(temp_filename)

        # Alternate between two "voices".
        # NOTE(review): set_frame_rate appears to resample rather than
        # shift pitch, so the two speakers may sound identical - confirm
        # against the pydub documentation.
        if i % 2 == 0:
            audio_segment = audio_segment.set_frame_rate(44100)  # speaker 1
        else:
            audio_segment = audio_segment.set_frame_rate(22050)  # speaker 2

        audio_segments.append(audio_segment)

    buffer = io.BytesIO()
    if not audio_segments:
        # sum([]) is the int 0, which has no export(); return an empty
        # stream instead of raising AttributeError.
        return buffer

    # Concatenate all audio segments and export them to the byte stream.
    final_audio = sum(audio_segments)
    final_audio.export(buffer, format="mp3")
    buffer.seek(0)

    return buffer
42
-
43
# Single-function Gradio UI: a multi-line text box in, an audio player out.
text_input = gr.Textbox(lines=10, placeholder="Enter your text here...")
# NOTE(review): confirm that "binary" is a value gr.Audio accepts for `type`
# in the installed Gradio version.
audio_player = gr.Audio(type="binary")

iface = gr.Interface(
    fn=text_to_speech,
    inputs=text_input,
    outputs=audio_player,
    title="Two-Speaker Text-to-Speech Converter",
    description="Convert text to speech with alternating voices for even and odd lines.",
)

# Start the web server only when executed as a script.
if __name__ == "__main__":
    iface.launch()
 
1
  import gradio as gr
2
+ import google.generativeai as genai
3
+ import numpy as np
4
+ import edge_tts
5
+ import asyncio
6
  import io
7
+ import re
 
8
 
9
# Set up logging: INFO and above, using the root logger's default handler.
import logging
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Gemini AI.
# Prefer a key supplied through the GEMINI_API_KEY environment variable and
# fall back to the original placeholder when it is unset, so no real key has
# to be written into the source. generate_podcast_script() calls
# genai.configure() again with the key it is given.
genai.configure(api_key=os.environ.get('GEMINI_API_KEY', 'YOUR_GEMINI_API_KEY'))
16
+
17
def generate_podcast_script(api_key, content, duration):
    """Ask Gemini for a two-person podcast script about ``content``.

    Args:
        api_key: Gemini API key; passed to ``genai.configure`` before the
            request is made.
        content: Source material the two speakers should discuss.
        duration: Human-readable target length (e.g. ``"1-5 min"``); it is
            interpolated into the prompt verbatim.

    Returns:
        The model's reply with every character removed that is not a
        letter, digit, whitespace, apostrophe, or one of ``. , ? !``.
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')

    prompt = f"""
    Create a podcast script for two people discussing the following content:
    {content}

    The podcast should last approximately {duration}. Include natural speech patterns,
    humor, and occasional off-topic chit-chat. Use speech fillers like um, ah,
    yes, I see, Ok now. Vary the emotional tone.
    Format the script as alternating lines of dialogue without speaker labels.
    Do not use any special characters, markdown, or formatting. Only include the alternating dialogue lines.
    Ensure the conversation flows naturally and stays relevant to the topic.
    Limit the script length to match the requested duration of {duration}.
    """
    response = model.generate_content(prompt)

    # Remove symbols that might be read aloud (markdown markers, brackets,
    # underscores, ...). \w is Unicode-aware for str patterns, so letters of
    # non-Latin scripts survive; apostrophes are kept so contractions such
    # as "don't" are not mangled into "dont".
    clean_text = re.sub(r"[^\w\s.,?!'\u2019]|_", '', response.text)
    return clean_text
37
+
38
async def text_to_speech(text, voice):
    """Synthesize ``text`` with the given edge-tts ``voice``.

    Args:
        text: The text to speak.
        voice: Voice name passed to ``edge_tts.Communicate``.

    Returns:
        The concatenated ``data`` payloads of every streamed chunk whose
        ``type`` is ``"audio"``, as a single ``bytes`` object.
    """
    stream = edge_tts.Communicate(text, voice).stream()
    # Chunks of any other type are ignored.
    payloads = [chunk["data"] async for chunk in stream if chunk["type"] == "audio"]
    return b"".join(payloads)
46
 
47
async def render_podcast(api_key, script, voice1, voice2):
    """Render a dialogue script to audio, alternating between two voices.

    Args:
        api_key: Unused here; kept so the signature matches the UI wiring.
        script: Dialogue text, one utterance per line. Blank lines are
            ignored.
        voice1: Voice for the 1st, 3rd, 5th, ... spoken line.
        voice2: Voice for the 2nd, 4th, 6th, ... spoken line.

    Returns:
        A ``(sample_rate, samples)`` tuple with ``sample_rate`` 24000 and
        ``samples`` a NumPy int16 array. One second of silence is returned
        when the script has no spoken lines.
    """
    # Index only the spoken lines. Enumerating the raw lines would let blank
    # separator lines consume indices, so a script with an empty line
    # between utterances would be read entirely in one voice.
    spoken_lines = [line for line in (script or '').split('\n') if line.strip()]

    audio_segments = []
    for i, line in enumerate(spoken_lines):
        voice = voice1 if i % 2 == 0 else voice2
        audio_segments.append(await text_to_speech(line, voice))

    if not audio_segments:
        logger.warning("No valid audio segments were generated.")
        # Return silence if no valid audio was generated.
        return (24000, np.zeros(24000, dtype=np.int16))

    # Concatenate audio segments
    podcast_audio = b''.join(audio_segments)

    # NOTE(review): the bytes are reinterpreted as raw 16-bit PCM. If
    # edge-tts streams a compressed format (its documentation should be
    # checked), they must be decoded first; np.frombuffer also raises
    # ValueError when the byte count is odd.
    podcast_audio = np.frombuffer(podcast_audio, dtype=np.int16)

    return (24000, podcast_audio)
68
+
69
async def get_voice_list():
    """Fetch the edge-tts voices and group their names by locale.

    Returns:
        A dict mapping each voice's ``"Locale"`` value to the list of
        ``"Name"`` values for that locale, in the order edge-tts lists them.
    """
    by_locale = {}
    for entry in await edge_tts.list_voices():
        # setdefault creates the list the first time a locale is seen.
        by_locale.setdefault(entry["Locale"], []).append(entry["Name"])
    return by_locale
78
+
79
# Locale code -> human-readable "Language (Region)" label shown in the
# language drop-downs. Keys use the same format as the voice "Locale" field.
language_names = {
    'af-ZA': 'Afrikaans (South Africa)',
    'am-ET': 'Amharic (Ethiopia)',
    'ar-AE': 'Arabic (UAE)',
    'ar-BH': 'Arabic (Bahrain)',
    'ar-DZ': 'Arabic (Algeria)',
    'ar-EG': 'Arabic (Egypt)',
    'ar-IQ': 'Arabic (Iraq)',
    'ar-JO': 'Arabic (Jordan)',
    'ar-KW': 'Arabic (Kuwait)',
    'ar-LB': 'Arabic (Lebanon)',
    'ar-LY': 'Arabic (Libya)',
    'ar-MA': 'Arabic (Morocco)',
    'ar-OM': 'Arabic (Oman)',
    'ar-QA': 'Arabic (Qatar)',
    'ar-SA': 'Arabic (Saudi Arabia)',
    'ar-SY': 'Arabic (Syria)',
    'ar-TN': 'Arabic (Tunisia)',
    'ar-YE': 'Arabic (Yemen)',
    'az-AZ': 'Azerbaijani (Azerbaijan)',
    'bg-BG': 'Bulgarian (Bulgaria)',
    'bn-BD': 'Bengali (Bangladesh)',
    'bn-IN': 'Bengali (India)',
    'bs-BA': 'Bosnian (Bosnia and Herzegovina)',
    'ca-ES': 'Catalan (Spain)',
    'cs-CZ': 'Czech (Czech Republic)',
    'cy-GB': 'Welsh (United Kingdom)',
    'da-DK': 'Danish (Denmark)',
    'de-AT': 'German (Austria)',
    'de-CH': 'German (Switzerland)',
    'de-DE': 'German (Germany)',
    'el-GR': 'Greek (Greece)',
    'en-AU': 'English (Australia)',
    'en-CA': 'English (Canada)',
    'en-GB': 'English (United Kingdom)',
    'en-GH': 'English (Ghana)',
    'en-HK': 'English (Hong Kong SAR)',
    'en-IE': 'English (Ireland)',
    'en-IN': 'English (India)',
    'en-KE': 'English (Kenya)',
    'en-NG': 'English (Nigeria)',
    'en-NZ': 'English (New Zealand)',
    'en-PH': 'English (Philippines)',
    'en-SG': 'English (Singapore)',
    'en-TZ': 'English (Tanzania)',
    'en-US': 'English (United States)',
    'en-ZA': 'English (South Africa)',
    'es-AR': 'Spanish (Argentina)',
    'es-BO': 'Spanish (Bolivia)',
    'es-CL': 'Spanish (Chile)',
    'es-CO': 'Spanish (Colombia)',
    'es-CR': 'Spanish (Costa Rica)',
    'es-CU': 'Spanish (Cuba)',
    'es-DO': 'Spanish (Dominican Republic)',
    'es-EC': 'Spanish (Ecuador)',
    'es-ES': 'Spanish (Spain)',
    'es-GQ': 'Spanish (Equatorial Guinea)',
    'es-GT': 'Spanish (Guatemala)',
    'es-HN': 'Spanish (Honduras)',
    'es-MX': 'Spanish (Mexico)',
    'es-NI': 'Spanish (Nicaragua)',
    'es-PA': 'Spanish (Panama)',
    'es-PE': 'Spanish (Peru)',
    'es-PR': 'Spanish (Puerto Rico)',
    'es-PY': 'Spanish (Paraguay)',
    'es-SV': 'Spanish (El Salvador)',
    'es-US': 'Spanish (United States)',
    'es-UY': 'Spanish (Uruguay)',
    'es-VE': 'Spanish (Venezuela)',
    'et-EE': 'Estonian (Estonia)',
    'eu-ES': 'Basque (Spain)',
    'fa-IR': 'Persian (Iran)',
    'fi-FI': 'Finnish (Finland)',
    'fil-PH': 'Filipino (Philippines)',
    'fr-BE': 'French (Belgium)',
    'fr-CA': 'French (Canada)',
    'fr-CH': 'French (Switzerland)',
    'fr-FR': 'French (France)',
    'ga-IE': 'Irish (Ireland)',
    'gl-ES': 'Galician (Spain)',
    'gu-IN': 'Gujarati (India)',
    'he-IL': 'Hebrew (Israel)',
    'hi-IN': 'Hindi (India)',
    'hr-HR': 'Croatian (Croatia)',
    'hu-HU': 'Hungarian (Hungary)',
    'hy-AM': 'Armenian (Armenia)',
    'id-ID': 'Indonesian (Indonesia)',
    'is-IS': 'Icelandic (Iceland)',
    'it-IT': 'Italian (Italy)',
    'ja-JP': 'Japanese (Japan)',
    'jv-ID': 'Javanese (Indonesia)',
    'ka-GE': 'Georgian (Georgia)',
    'kk-KZ': 'Kazakh (Kazakhstan)',
    'km-KH': 'Khmer (Cambodia)',
    'kn-IN': 'Kannada (India)',
    'ko-KR': 'Korean (Korea)',
    'lo-LA': 'Lao (Laos)',
    'lt-LT': 'Lithuanian (Lithuania)',
    'lv-LV': 'Latvian (Latvia)',
    'mk-MK': 'Macedonian (North Macedonia)',
    'ml-IN': 'Malayalam (India)',
    'mn-MN': 'Mongolian (Mongolia)',
    'mr-IN': 'Marathi (India)',
    'ms-MY': 'Malay (Malaysia)',
    'mt-MT': 'Maltese (Malta)',
    'my-MM': 'Burmese (Myanmar)',
    'nb-NO': 'Norwegian (Bokmål, Norway)',
    'ne-NP': 'Nepali (Nepal)',
    'nl-BE': 'Dutch (Belgium)',
    'nl-NL': 'Dutch (Netherlands)',
    'pl-PL': 'Polish (Poland)',
    'ps-AF': 'Pashto (Afghanistan)',
    'pt-BR': 'Portuguese (Brazil)',
    'pt-PT': 'Portuguese (Portugal)',
    'ro-RO': 'Romanian (Romania)',
    'ru-RU': 'Russian (Russia)',
    'si-LK': 'Sinhala (Sri Lanka)',
    'sk-SK': 'Slovak (Slovakia)',
    'sl-SI': 'Slovenian (Slovenia)',
    'so-SO': 'Somali (Somalia)',
    'sq-AL': 'Albanian (Albania)',
    'sr-RS': 'Serbian (Serbia)',
    'sv-SE': 'Swedish (Sweden)',
    'sw-KE': 'Swahili (Kenya)',
    'sw-TZ': 'Swahili (Tanzania)',
    'ta-IN': 'Tamil (India)',
    'ta-LK': 'Tamil (Sri Lanka)',
    'ta-MY': 'Tamil (Malaysia)',
    'ta-SG': 'Tamil (Singapore)',
    'te-IN': 'Telugu (India)',
    'th-TH': 'Thai (Thailand)',
    'tr-TR': 'Turkish (Turkey)',
    'uk-UA': 'Ukrainian (Ukraine)',
    'ur-IN': 'Urdu (India)',
    'ur-PK': 'Urdu (Pakistan)',
    'uz-UZ': 'Uzbek (Uzbekistan)',
    'vi-VN': 'Vietnamese (Vietnam)',
    'wuu-CN': 'Wu Chinese (China)',
    'yue-CN': 'Cantonese (China)',
    'zh-CN': 'Chinese (Mandarin, Simplified)',
    'zh-HK': 'Chinese (Cantonese, Traditional)',
    'zh-TW': 'Chinese (Taiwanese Mandarin)',
    'zu-ZA': 'Zulu (South Africa)',
}
118
+
119
# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# AI Podcast Generator")

    api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")

    with gr.Row():
        content_input = gr.Textbox(label="Paste your content or upload a document")
        # NOTE(review): this upload widget is not passed to any handler in
        # this block, so uploaded documents are currently ignored.
        document_upload = gr.File(label="Upload Document")

    duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")

    # Fetched once while the UI is built: locale -> list of voice names.
    voice_dict = asyncio.run(get_voice_list())
    languages = list(voice_dict.keys())

    # Drop-down label -> locale, built from the locales that were actually
    # returned. Locales missing from language_names are displayed as their
    # raw code, and this map lets those labels resolve too (a reverse search
    # of language_names alone would miss them and yield no voices).
    label_to_locale = {language_names.get(lang, lang): lang for lang in languages}

    with gr.Row():
        lang1_select = gr.Dropdown(label="Select Language 1", choices=[f"{language_names.get(lang, lang)}" for lang in languages])
        voice1_select = gr.Dropdown(label="Select Voice 1")

    with gr.Row():
        lang2_select = gr.Dropdown(label="Select Language 2", choices=[f"{language_names.get(lang, lang)}" for lang in languages])
        voice2_select = gr.Dropdown(label="Select Voice 2")

    generate_btn = gr.Button("Generate Script")
    script_output = gr.Textbox(label="Generated Script", lines=10)

    render_btn = gr.Button("Render Podcast")
    audio_output = gr.Audio(label="Generated Podcast")

    def update_voices(lang):
        """Return a voice drop-down update listing the voices for label ``lang``."""
        selected_lang = label_to_locale.get(lang)
        return gr.Dropdown(choices=voice_dict.get(selected_lang, []))

    lang1_select.change(update_voices, inputs=[lang1_select], outputs=[voice1_select])
    lang2_select.change(update_voices, inputs=[lang2_select], outputs=[voice2_select])

    def generate_script_wrapper(api_key, content, duration):
        """Validate the form inputs, then request a script from Gemini."""
        if not content or not content.strip():
            raise gr.Error("Please paste some content to discuss.")
        if not duration:
            # Otherwise the literal text "None" would end up in the prompt.
            raise gr.Error("Please choose an estimated podcast duration.")
        return generate_podcast_script(api_key, content, duration)

    async def render_podcast_wrapper(api_key, script, voice1, voice2):
        """Validate the voice selection, then synthesize the script."""
        if not script or not script.strip():
            raise gr.Error("Please generate or enter a script first.")
        if not voice1 or not voice2:
            raise gr.Error("Please select a voice for both speakers.")
        return await render_podcast(api_key, script, voice1, voice2)

    generate_btn.click(generate_script_wrapper, inputs=[api_key_input, content_input, duration], outputs=script_output)
    render_btn.click(render_podcast_wrapper, inputs=[api_key_input, script_output, voice1_select, voice2_select], outputs=audio_output)

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()