deeme committed on
Commit
54dd079
·
verified ·
1 Parent(s): 6543342

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +462 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import tempfile
4
+ import logging
5
+ from podcastfy.client import generate_podcast
6
+ from dotenv import load_dotenv
7
+
8
+ # Configure the root logger at DEBUG level and create this module's logger
9
+ logging.basicConfig(level=logging.DEBUG)
10
+ logger = logging.getLogger(__name__)
11
+
12
+ # Load variables from a .env file (if one exists) so API keys can be read via os.getenv
13
+ load_dotenv()
14
+
15
def get_api_key(key_name, ui_value):
    """Return the API key entered in the UI, falling back to the environment.

    Args:
        key_name: Name of the environment variable to read when the UI
            field is blank (for example ``"OPENAI_API_KEY"``).
        ui_value: Value taken from the UI field; may be ``None`` or empty.

    Returns:
        The UI value with surrounding whitespace removed when it is
        non-blank, otherwise ``os.getenv(key_name)`` (``None`` when the
        variable is not set).
    """
    # Keys are usually pasted; a stray space or newline would otherwise be
    # sent to the provider, and a whitespace-only field would wrongly
    # suppress the environment fallback.
    if isinstance(ui_value, str):
        ui_value = ui_value.strip()
    return ui_value if ui_value else os.getenv(key_name)
17
+
18
def process_inputs(
    text_input,
    urls_input,
    pdf_files,
    image_files,
    gemini_key,
    openai_key,
    openai_base_url,  # newly added parameter
    elevenlabs_key,
    word_count,
    conversation_style,
    roles_person1,
    roles_person2,
    dialogue_structure,
    podcast_name,
    podcast_tagline,
    output_language,
    tts_model,
    creativity_level,
    user_instructions,
    api_key_label,
    llm_model_name,
    longform,
):
    """Collect the UI inputs, call ``generate_podcast`` and return its result.

    Uploaded PDFs and images are written to temporary directories so they
    can be handed over as file paths; those directories are always removed
    before this function returns, on success and on failure alike.

    Args:
        text_input: Free text to turn into a podcast (may be empty).
        urls_input: Newline-separated URLs (may be empty or ``None``).
        pdf_files: Iterable of PDF contents as bytes, or ``None``.
        image_files: Iterable of image contents, each either bytes or a
            tuple/list whose second item is the bytes, or ``None``.
        gemini_key: Gemini key from the UI; blank falls back to the
            ``GEMINI_API_KEY`` environment variable.
        openai_key: OpenAI key from the UI; blank falls back to
            ``OPENAI_API_KEY``. Only required when ``tts_model == "openai"``.
        openai_base_url: Optional value for ``OPENAI_API_BASE``; only applied
            when ``tts_model == "openai"``.
        elevenlabs_key: ElevenLabs key from the UI; blank falls back to
            ``ELEVENLABS_API_KEY``. Only required when
            ``tts_model == "elevenlabs"``.
        word_count, roles_person1, roles_person2, podcast_name,
        podcast_tagline, output_language, user_instructions, api_key_label,
        llm_model_name, longform: Passed through unchanged in the
            conversation config.
        conversation_style: Comma-separated string, split into a list.
        dialogue_structure: Comma-separated string, split into a list.
        tts_model: TTS backend name forwarded to ``generate_podcast``.
        creativity_level: Forwarded under the ``"creativity"`` config key.

    Returns:
        Whatever ``generate_podcast`` returns on success. On any failure the
        exception is logged and its message is returned as a string.
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    # Bound before the try block so the cleanup in ``finally`` can always see
    # it, even when validation fails before any directory is created.
    temp_dirs = []

    try:
        logger.info("Starting podcast generation process")

        # API key handling
        # NOTE(review): os.environ is process-wide, so keys written here stay
        # set for later requests handled by the same process.
        logger.debug("Setting API keys")
        gemini_value = get_api_key("GEMINI_API_KEY", gemini_key)
        if gemini_value:
            # Assigning None to os.environ raises TypeError, so only set the
            # variable when a key is actually available.
            os.environ["GEMINI_API_KEY"] = gemini_value

        if tts_model == "openai":
            logger.debug("Setting OpenAI API key")
            openai_value = get_api_key("OPENAI_API_KEY", openai_key)
            if not openai_value:
                raise ValueError("OpenAI API key is required when using OpenAI TTS model")
            os.environ["OPENAI_API_KEY"] = openai_value
            if openai_base_url:
                os.environ["OPENAI_API_BASE"] = openai_base_url

        if tts_model == "elevenlabs":
            logger.debug("Setting ElevenLabs API key")
            elevenlabs_value = get_api_key("ELEVENLABS_API_KEY", elevenlabs_key)
            if not elevenlabs_value:
                raise ValueError("ElevenLabs API key is required when using ElevenLabs TTS model")
            os.environ["ELEVENLABS_API_KEY"] = elevenlabs_value

        # Process URLs: one per line, blank lines ignored
        urls = [line.strip() for line in (urls_input or "").splitlines() if line.strip()]
        logger.debug(f"Processed URLs: {urls}")

        # Handle PDF files: saved to disk and passed along with the URLs
        if pdf_files:
            logger.info(f"Processing {len(pdf_files)} PDF files")
            pdf_temp_dir = tempfile.mkdtemp()
            temp_dirs.append(pdf_temp_dir)

            for i, pdf_file in enumerate(pdf_files):
                pdf_path = os.path.join(pdf_temp_dir, f"input_pdf_{i}.pdf")
                with open(pdf_path, 'wb') as f:
                    f.write(pdf_file)
                urls.append(pdf_path)
                logger.debug(f"Saved PDF {i} to {pdf_path}")

        # Handle image files
        image_paths = []
        if image_files:
            logger.info(f"Processing {len(image_files)} image files")
            img_temp_dir = tempfile.mkdtemp()
            temp_dirs.append(img_temp_dir)

            for i, img_file in enumerate(image_files):
                # Use the upload's original name when the object exposes one;
                # otherwise assume a JPEG.
                original_name = getattr(img_file, 'orig_name', f"image_{i}.jpg")
                # splitext never returns path separators, unlike split('.'),
                # so an odd original name cannot leak into the temp path.
                extension = os.path.splitext(original_name)[1].lstrip('.') or "jpg"

                logger.debug(f"Processing image file {i}: {original_name}")
                img_path = os.path.join(img_temp_dir, f"input_image_{i}.{extension}")

                try:
                    with open(img_path, 'wb') as f:
                        if isinstance(img_file, (tuple, list)):
                            f.write(img_file[1])  # second item holds the bytes
                        else:
                            f.write(img_file)  # already raw bytes
                    image_paths.append(img_path)
                    logger.debug(f"Saved image {i} to {img_path}")
                except Exception as e:
                    logger.error(f"Error saving image {i}: {str(e)}")
                    raise

        # Prepare conversation config
        logger.debug("Preparing conversation config")
        conversation_config = {
            "word_count": word_count,
            "conversation_style": conversation_style.split(','),
            "roles_person1": roles_person1,
            "roles_person2": roles_person2,
            "dialogue_structure": dialogue_structure.split(','),
            "podcast_name": podcast_name,
            "podcast_tagline": podcast_tagline,
            "output_language": output_language,
            "creativity": creativity_level,
            "user_instructions": user_instructions,
            "api_key_label": api_key_label,
            "llm_model_name": llm_model_name,
            "longform": longform,
        }

        # Generate podcast
        logger.info("Calling generate_podcast function")
        logger.debug(f"URLs: {urls}")
        logger.debug(f"Image paths: {image_paths}")
        logger.debug(f"Text input present: {'Yes' if text_input else 'No'}")

        audio_file = generate_podcast(
            urls=urls if urls else None,
            text=text_input if text_input else None,
            image_paths=image_paths if image_paths else None,
            tts_model=tts_model,
            conversation_config=conversation_config
        )

        logger.info("Podcast generation completed")
        return audio_file

    except Exception as e:
        logger.error(f"Error in process_inputs: {str(e)}", exc_info=True)
        # NOTE(review): the message is returned rather than raised to keep the
        # existing contract; the click handler routes this value to the audio
        # output - confirm that is the intended way to surface errors.
        return str(e)

    finally:
        # Single cleanup path for success and failure. rmtree removes the
        # directory together with the files written into it, and
        # ignore_errors keeps a cleanup problem from masking the real result.
        logger.debug("Cleaning up temporary files")
        for dir_path in temp_dirs:
            shutil.rmtree(dir_path, ignore_errors=True)
            logger.debug(f"Removed temp directory: {dir_path}")
171
+
172
+ # Build the Gradio Blocks UI: Base theme (blue/slate hues) plus CSS that moves accordion arrows to the left
173
+ with gr.Blocks(
174
+ title="AI播客plus",
175
+ theme=gr.themes.Base(
176
+ primary_hue="blue",
177
+ secondary_hue="slate",
178
+ neutral_hue="slate"
179
+ ),
180
+ css="""
181
+ /* Move toggle arrow to left side */
182
+ .gr-accordion {
183
+ --accordion-arrow-size: 1.5em;
184
+ }
185
+ .gr-accordion > .label-wrap {
186
+ flex-direction: row !important;
187
+ justify-content: flex-start !important;
188
+ gap: 1em;
189
+ }
190
+ .gr-accordion > .label-wrap > .icon {
191
+ order: -1;
192
+ }
193
+ """
194
+ ) as demo:
195
+ with gr.Tab("默认环境变量已设置 Gemini、OpenAI API Key "):
196
+ # API keys section: optional overrides; blank fields fall back to environment variables in process_inputs
197
+ with gr.Row():
198
+ gr.Markdown(
199
+ """
200
+ <h2 style='color: #2196F3; margin-bottom: 10px; padding: 10px 0;'>
201
+ 🔑 API Keys
202
+ </h2>
203
+ """,
204
+ elem_classes=["section-header"]
205
+ )
206
+ theme_btn = gr.Button("🌓", scale=0, min_width=0)
207
+ with gr.Accordion("配置 API Keys", open=False):
208
+ gemini_key = gr.Textbox(
209
+ label="Gemini API Key",
210
+ type="password",
211
+ value="",
212
+ info="必须的"
213
+ )
214
+ openai_key = gr.Textbox(
215
+ label="OpenAI API Key",
216
+ type="password",
217
+ value="",
218
+ info="只有在使用OpenAI文本转语音模型的情况下才需要此项"
219
+ )
220
+ openai_base_url = gr.Textbox(
221
+ label="OpenAI Base URL",
222
+ value="",
223
+ info="可选,留空使用默认URL:https://api.openai.com/v1"
224
+ )
225
+ elevenlabs_key = gr.Textbox(
226
+ label="ElevenLabs API Key",
227
+ type="password",
228
+ value="",
229
+ info="建议使用ElevenLabs TTS模型,仅在使用该模型时才需要此项"
230
+ )
231
+
232
+ # Content input section: free text, URLs (one per line), PDF uploads and image uploads
233
+ gr.Markdown(
234
+ """
235
+ <h2 style='color: #2196F3; margin-bottom: 10px; padding: 10px 0;'>
236
+ 📝 输入内容
237
+ </h2>
238
+ """,
239
+ elem_classes=["section-header"]
240
+ )
241
+ with gr.Accordion("设置输入内容", open=False):
242
+ with gr.Group():
243
+ text_input = gr.Textbox(
244
+ label="文本输入",
245
+ placeholder="在此输入或粘贴文字...",
246
+ lines=3
247
+ )
248
+ urls_input = gr.Textbox(
249
+ label="URLs",
250
+ placeholder="请逐行输入网址,支持网站和YouTube视频链接.",
251
+ lines=3
252
+ )
253
+
254
+ # Place PDF and image uploads side by side
255
+ with gr.Row():
256
+ with gr.Column():
257
+ pdf_files = gr.Files( # gr.Files so that several PDFs can be uploaded in one go
258
+ label="上传 PDFs", # label text reads "Upload PDFs"
259
+ file_types=[".pdf"],
260
+ type="binary"
261
+ )
262
+ gr.Markdown("*上传一个或多个PDF文件来创建播客*", elem_classes=["file-info"])
263
+
264
+ with gr.Column():
265
+ image_files = gr.Files(
266
+ label="上传图片",
267
+ file_types=["image"],
268
+ type="binary"
269
+ )
270
+ gr.Markdown("*上传一个或多个图片文件来创建播客*", elem_classes=["file-info"])
271
+
272
+ # Customization section: basic settings, roles, podcast identity, LLM choice, voice and advanced options
273
+ gr.Markdown(
274
+ """
275
+ <h2 style='color: #2196F3; margin-bottom: 10px; padding: 10px 0;'>
276
+ ⚙️ 自定义选项
277
+ </h2>
278
+ """,
279
+ elem_classes=["section-header"]
280
+ )
281
+ with gr.Accordion("自定义选项", open=False):
282
+ # Basic settings: target word count and comma-separated conversation styles
283
+ gr.Markdown(
284
+ """
285
+ <h3 style='color: #1976D2; margin: 15px 0 10px 0;'>
286
+ 📊 基本设置
287
+ </h3>
288
+ """,
289
+ )
290
+ word_count = gr.Slider(
291
+ minimum=500,
292
+ maximum=5000,
293
+ value=2000,
294
+ step=100,
295
+ label="字数统计",
296
+ info="目标字数(用于生成内容)"
297
+ )
298
+
299
+ conversation_style = gr.Textbox(
300
+ label="对话风格",
301
+ value="生动活泼,节奏明快,热情洋溢",
302
+ info="用于对话的风格列表(以逗号分隔)"
303
+ )
304
+
305
+ # Roles and structure: speaker roles and comma-separated dialogue sections
306
+ gr.Markdown(
307
+ """
308
+ <h3 style='color: #1976D2; margin: 15px 0 10px 0;'>
309
+ 👥 角色设定与结构安排
310
+ </h3>
311
+ """,
312
+ )
313
+ roles_person1 = gr.Textbox(
314
+ label="第一位发言者的角色",
315
+ value="主要负责总结的人",
316
+ info="在对话中,第一个说话人扮演的角色"
317
+ )
318
+
319
+ roles_person2 = gr.Textbox(
320
+ label="第二位发言者的角色",
321
+ value="提问者/释疑者",
322
+ info="在对话中,第二个说话人所扮演的角色或承担的任务"
323
+ )
324
+
325
+ dialogue_structure = gr.Textbox(
326
+ label="对话结构",
327
+ value="引言,主要内容的概括,总结",
328
+ info="对话结构的各个部分(用逗号隔开)"
329
+ )
330
+
331
+ # Podcast identity: name, tagline, output language, plus LLM key label, model name and long-form switch
332
+ gr.Markdown(
333
+ """
334
+ <h3 style='color: #1976D2; margin: 15px 0 10px 0;'>
335
+ 🎙️ 播客特色
336
+ </h3>
337
+ """,
338
+ )
339
+ podcast_name = gr.Textbox(
340
+ label="播客名",
341
+ value="猛然间",
342
+ info="播客的名字"
343
+ )
344
+
345
+ podcast_tagline = gr.Textbox(
346
+ label="播客宣传语",
347
+ value="猛然回首,太匆匆",
348
+ info="播客的宣传语或副标题"
349
+ )
350
+
351
+ output_language = gr.Textbox(
352
+ label="输出语言",
353
+ value="Chinese",
354
+ info="播客使用的语言"
355
+ )
356
+
357
+ api_key_label = gr.Textbox(
358
+ label="自定义基于云的 LLM",
359
+ value="GEMINI_API_KEY",
360
+ info="可选,默认使用 Gemini,如使用 OPENAI,上面填入 'OPENAI_API_KEY' 并保证设置好环境变量且设置好下面的模型"
361
+ )
362
+
363
+ llm_model_name = gr.Textbox(
364
+ label="设置好对应自定义基于云的 LLM 模型",
365
+ value="gemini-1.5-pro-latest",
366
+ info="可选,配合上面的参数,默认是 Gemini 的 gemini-1.5-pro-latest,默认 OPENAI 可支持模型 api.168369.xyz/v1/models 获取"
367
+ )
368
+
369
+ longform = gr.Checkbox(
370
+ label="长篇模式",
371
+ value=False,
372
+ info="启用长篇内容生成模式"
373
+ )
374
+
375
+ # Voice settings: choice of text-to-speech backend (process_inputs checks keys for openai/elevenlabs)
376
+ gr.Markdown(
377
+ """
378
+ <h3 style='color: #1976D2; margin: 15px 0 10px 0;'>
379
+ 🗣️ 语音设置
380
+ </h3>
381
+ """,
382
+ )
383
+ tts_model = gr.Radio(
384
+ choices=["openai", "elevenlabs", "edge"],
385
+ value="openai",
386
+ label="文本转语音模型",
387
+ info="选择语音合成模型 (edge 免费但音质较差, 其他模型音质更好但需申请 API keys)"
388
+ )
389
+
390
+ # Advanced settings: creativity slider (0 to 1) and free-form user instructions
391
+ gr.Markdown(
392
+ """
393
+ <h3 style='color: #1976D2; margin: 15px 0 10px 0;'>
394
+ 🔧 高级选项
395
+ </h3>
396
+ """,
397
+ )
398
+ creativity_level = gr.Slider(
399
+ minimum=0,
400
+ maximum=1,
401
+ value=0.7,
402
+ step=0.1,
403
+ label="创意等级",
404
+ info="调节生成对话的创意程度(0 为注重事实,1 为更具创意)"
405
+ )
406
+
407
+ user_instructions = gr.Textbox(
408
+ label="个性化指令",
409
+ value="",
410
+ lines=2,
411
+ placeholder="在此处添加你希望AI遵循的具体指令,以控制对话的走向和内容...",
412
+ info="一些额外的指令,用来帮助AI更好地理解你想要聊天的内容和方向"
413
+ )
414
+
415
+ # Output section: generate button and the audio player that receives the result
416
+ gr.Markdown(
417
+ """
418
+ <h2 style='color: #2196F3; margin-bottom: 10px; padding: 10px 0;'>
419
+ 🎵 生成结果
420
+ </h2>
421
+ """,
422
+ elem_classes=["section-header"]
423
+ )
424
+ with gr.Group():
425
+ generate_btn = gr.Button("🎙️ 生成播客", variant="primary")
426
+ audio_output = gr.Audio(
427
+ type="filepath",
428
+ label="生成的播客"
429
+ )
430
+
431
+ # Run process_inputs on click; the inputs list must stay in the same order as its parameters
432
+ generate_btn.click(
433
+ process_inputs,
434
+ inputs=[
435
+ text_input, urls_input, pdf_files, image_files,
436
+ gemini_key, openai_key, openai_base_url,
437
+ elevenlabs_key,
438
+ word_count, conversation_style,
439
+ roles_person1, roles_person2,
440
+ dialogue_structure, podcast_name,
441
+ podcast_tagline, output_language, tts_model,
442
+ creativity_level, user_instructions,
443
+ api_key_label, llm_model_name, longform
444
+ ],
445
+ outputs=audio_output
446
+ )
447
+
448
+ # Theme toggle: no Python callback, the JS snippet flips the 'dark' class on <body>
449
+ theme_btn.click(
450
+ None,
451
+ None,
452
+ None,
453
+ js="""
454
+ function() {
455
+ document.querySelector('body').classList.toggle('dark');
456
+ return [];
457
+ }
458
+ """
459
+ )
460
+
461
+ if __name__ == "__main__":
462
+ demo.queue().launch(share=True)  # enable request queuing, then start the server; share=True requests a public share link
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio-client
2
+ gradio
3
+ podcastfy
4
+ python-dotenv