# pod / app.py
# Hugging Face file-viewer residue, kept as comments so the module parses:
#   deeme's picture | Upload 2 files | 54dd079 verified | raw | history blame | 17.1 kB
import gradio as gr
import os
import tempfile
import logging
from podcastfy.client import generate_podcast
from dotenv import load_dotenv
# Configure logging
# The root logger is configured at DEBUG level, so the logger.debug(...) calls
# in this module are emitted.
logging.basicConfig(level=logging.DEBUG)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Load environment variables
# NOTE(review): load_dotenv comes from python-dotenv; presumably it reads a
# local .env file so the os.getenv() lookups below can see keys defined
# there — confirm against the deployment setup.
load_dotenv()
def get_api_key(key_name, ui_value):
    """Return the API key entered in the UI, falling back to the environment.

    Args:
        key_name: Name of the environment variable used as the fallback.
        ui_value: Value taken from the UI field; may be ``None`` or empty.

    Returns:
        The UI value with surrounding whitespace removed when it is non-blank,
        otherwise ``os.getenv(key_name)`` (``None`` if the variable is unset).
    """
    # A blank-but-not-empty field (e.g. a stray space or newline from a paste)
    # must not mask the environment fallback, so strip before deciding.
    cleaned = ui_value.strip() if isinstance(ui_value, str) else ui_value
    return cleaned or os.getenv(key_name)
def _split_csv(value):
    """Split a comma-separated UI string into trimmed, non-empty items.

    Both the ASCII comma and the full-width comma (",") act as separators,
    because the UI's default values are written with the full-width form.
    """
    if not value:
        return []
    normalized = value.replace(",", ",")
    return [part.strip() for part in normalized.split(",") if part.strip()]


def _stage_pdfs(pdf_files, temp_dirs):
    """Write uploaded PDF payloads into a new temp directory; return the paths.

    The created directory is appended to ``temp_dirs`` so the caller can
    remove it afterwards.
    """
    if not pdf_files:
        return []
    logger.info(f"Processing {len(pdf_files)} PDF files")
    pdf_temp_dir = tempfile.mkdtemp()
    # Registered immediately so the directory is cleaned up even if a write fails.
    temp_dirs.append(pdf_temp_dir)
    pdf_paths = []
    for i, pdf_file in enumerate(pdf_files):
        pdf_path = os.path.join(pdf_temp_dir, f"input_pdf_{i}.pdf")
        with open(pdf_path, 'wb') as f:
            f.write(pdf_file)
        pdf_paths.append(pdf_path)
        logger.debug(f"Saved PDF {i} to {pdf_path}")
    return pdf_paths


def _stage_images(image_files, temp_dirs):
    """Write uploaded image payloads into a new temp directory; return the paths.

    The created directory is appended to ``temp_dirs`` so the caller can
    remove it afterwards.
    """
    if not image_files:
        return []
    logger.info(f"Processing {len(image_files)} image files")
    img_temp_dir = tempfile.mkdtemp()
    temp_dirs.append(img_temp_dir)
    image_paths = []
    for i, img_file in enumerate(image_files):
        # Use the upload's original name when the object exposes one;
        # plain bytes payloads fall back to a generic .jpg name.
        original_name = getattr(img_file, 'orig_name', None) or f"image_{i}.jpg"
        # splitext yields "" for a name without a dot, where split('.')[-1]
        # would have returned the whole name as the "extension".
        extension = os.path.splitext(original_name)[1] or ".jpg"
        logger.debug(f"Processing image file {i}: {original_name}")
        img_path = os.path.join(img_temp_dir, f"input_image_{i}{extension}")
        try:
            # A (name, bytes) pair carries the content in its second slot;
            # anything else is written as-is.
            payload = img_file[1] if isinstance(img_file, (tuple, list)) else img_file
            with open(img_path, 'wb') as f:
                f.write(payload)
        except Exception as e:
            logger.error(f"Error saving image {i}: {str(e)}")
            raise
        image_paths.append(img_path)
        logger.debug(f"Saved image {i} to {img_path}")
    return image_paths


def _remove_temp_dirs(temp_dirs):
    """Delete every staged temp directory together with its contents."""
    # Local import keeps this block self-contained with respect to the
    # file's import section.
    import shutil

    for dir_path in temp_dirs:
        # rmtree also removes leftover files, which os.rmdir would refuse to do.
        shutil.rmtree(dir_path, ignore_errors=True)
        logger.debug(f"Removed temp directory: {dir_path}")


def process_inputs(
    text_input,
    urls_input,
    pdf_files,
    image_files,
    gemini_key,
    openai_key,
    openai_base_url,  # newly added parameter
    elevenlabs_key,
    word_count,
    conversation_style,
    roles_person1,
    roles_person2,
    dialogue_structure,
    podcast_name,
    podcast_tagline,
    output_language,
    tts_model,
    creativity_level,
    user_instructions,
    api_key_label,
    llm_model_name,
    longform,
):
    """Collect the UI inputs, call ``generate_podcast`` and return its result.

    API keys are exported through ``os.environ``; uploaded PDFs and images are
    written to temporary directories whose paths are handed to
    ``generate_podcast``. The temporary directories are always removed before
    returning.

    Args:
        text_input: Free text to turn into a podcast (may be empty).
        urls_input: Newline-separated URLs (may be empty).
        pdf_files: Uploaded PDF payloads, or ``None``.
        image_files: Uploaded image payloads, or ``None``.
        gemini_key, openai_key, elevenlabs_key: Keys typed in the UI; each
            falls back to the matching environment variable.
        openai_base_url: Optional value exported as ``OPENAI_API_BASE``.
        tts_model: ``"openai"`` and ``"elevenlabs"`` require their key.
        Remaining arguments are forwarded inside ``conversation_config``.

    Returns:
        Whatever ``generate_podcast`` returns on success; on any failure the
        error message as a string.
    """
    # Created before the try block so the cleanup in ``finally`` can always
    # see it, including when key validation fails on the very first lines.
    temp_dirs = []
    try:
        logger.info("Starting podcast generation process")
        # API key handling
        logger.debug("Setting API keys")
        gemini_api_key = get_api_key("GEMINI_API_KEY", gemini_key)
        # os.environ only accepts strings; skip the export when no key exists
        # (e.g. when a different LLM is selected through api_key_label).
        if gemini_api_key:
            os.environ["GEMINI_API_KEY"] = gemini_api_key
        if tts_model == "openai":
            logger.debug("Setting OpenAI API key")
            openai_api_key = get_api_key("OPENAI_API_KEY", openai_key)
            if not openai_api_key:
                raise ValueError("OpenAI API key is required when using OpenAI TTS model")
            os.environ["OPENAI_API_KEY"] = openai_api_key
            # NOTE(review): the source this was recovered from had lost its
            # indentation; the base-URL export is assumed to belong to the
            # OpenAI TTS branch — confirm.
            if openai_base_url:
                os.environ["OPENAI_API_BASE"] = openai_base_url
        if tts_model == "elevenlabs":
            logger.debug("Setting ElevenLabs API key")
            elevenlabs_api_key = get_api_key("ELEVENLABS_API_KEY", elevenlabs_key)
            if not elevenlabs_api_key:
                raise ValueError("ElevenLabs API key is required when using ElevenLabs TTS model")
            os.environ["ELEVENLABS_API_KEY"] = elevenlabs_api_key
        # Process URLs
        urls = [url.strip() for url in (urls_input or "").splitlines() if url.strip()]
        logger.debug(f"Processed URLs: {urls}")
        # Staged PDFs are passed to generate_podcast alongside the URLs.
        urls.extend(_stage_pdfs(pdf_files, temp_dirs))
        image_paths = _stage_images(image_files, temp_dirs)
        # Prepare conversation config
        logger.debug("Preparing conversation config")
        conversation_config = {
            "word_count": word_count,
            "conversation_style": _split_csv(conversation_style),
            "roles_person1": roles_person1,
            "roles_person2": roles_person2,
            "dialogue_structure": _split_csv(dialogue_structure),
            "podcast_name": podcast_name,
            "podcast_tagline": podcast_tagline,
            "output_language": output_language,
            "creativity": creativity_level,
            "user_instructions": user_instructions,
            "api_key_label": api_key_label,
            "llm_model_name": llm_model_name,
            "longform": longform,
        }
        # Generate podcast
        logger.info("Calling generate_podcast function")
        logger.debug(f"URLs: {urls}")
        logger.debug(f"Image paths: {image_paths}")
        logger.debug(f"Text input present: {'Yes' if text_input else 'No'}")
        audio_file = generate_podcast(
            urls=urls if urls else None,
            text=text_input if text_input else None,
            image_paths=image_paths if image_paths else None,
            tts_model=tts_model,
            conversation_config=conversation_config,
        )
        logger.info("Podcast generation completed")
        return audio_file
    except Exception as e:
        logger.error(f"Error in process_inputs: {str(e)}", exc_info=True)
        # NOTE(review): the message string is returned to the caller, which
        # wires this function to an audio output — kept for compatibility.
        return str(e)
    finally:
        logger.debug("Cleaning up temporary files")
        _remove_temp_dirs(temp_dirs)
# Build the Gradio interface.

# Page stylesheet: moves the accordion toggle arrow to the left of the label.
_ACCORDION_CSS = """
/* Move toggle arrow to left side */
.gr-accordion {
--accordion-arrow-size: 1.5em;
}
.gr-accordion > .label-wrap {
flex-direction: row !important;
justify-content: flex-start !important;
gap: 1em;
}
.gr-accordion > .label-wrap > .icon {
order: -1;
}
"""

# Client-side handler for the theme button: toggles the 'dark' class on <body>.
_TOGGLE_DARK_JS = """
function() {
document.querySelector('body').classList.toggle('dark');
return [];
}
"""

# Colour theme passed to gr.Blocks.
_APP_THEME = gr.themes.Base(
    primary_hue="blue",
    secondary_hue="slate",
    neutral_hue="slate",
)


def _section_title(text):
    """Render a top-level (h2) section heading inside the current layout."""
    markup = (
        "\n<h2 style='color: #2196F3; margin-bottom: 10px; padding: 10px 0;'>\n"
        + text
        + "\n</h2>\n"
    )
    return gr.Markdown(markup, elem_classes=["section-header"])


def _subsection_title(text):
    """Render a second-level (h3) heading inside the current layout."""
    markup = (
        "\n<h3 style='color: #1976D2; margin: 15px 0 10px 0;'>\n"
        + text
        + "\n</h3>\n"
    )
    return gr.Markdown(markup)


with gr.Blocks(title="AI播客plus", theme=_APP_THEME, css=_ACCORDION_CSS) as demo:
    with gr.Tab("默认环境变量已设置 Gemini、OpenAI API Key "):
        # --- API keys -------------------------------------------------------
        with gr.Row():
            _section_title("🔑 API Keys")
            theme_btn = gr.Button("🌓", scale=0, min_width=0)

        with gr.Accordion("配置 API Keys", open=False):
            gemini_key = gr.Textbox(
                label="Gemini API Key",
                type="password",
                value="",
                info="必须的",
            )
            openai_key = gr.Textbox(
                label="OpenAI API Key",
                type="password",
                value="",
                info="只有在使用OpenAI文本转语音模型的情况下才需要此项",
            )
            openai_base_url = gr.Textbox(
                label="OpenAI Base URL",
                value="",
                info="可选,留空使用默认URL:https://api.openai.com/v1",
            )
            elevenlabs_key = gr.Textbox(
                label="ElevenLabs API Key",
                type="password",
                value="",
                info="建议使用ElevenLabs TTS模型,仅在使用该模型时才需要此项",
            )

        # --- Content input --------------------------------------------------
        _section_title("📝 输入内容")

        with gr.Accordion("设置输入内容", open=False):
            with gr.Group():
                text_input = gr.Textbox(
                    label="文本输入",
                    placeholder="在此输入或粘贴文字...",
                    lines=3,
                )
                urls_input = gr.Textbox(
                    label="URLs",
                    placeholder="请逐行输入网址,支持网站和YouTube视频链接.",
                    lines=3,
                )
                # PDF and image uploads sit side by side.
                with gr.Row():
                    with gr.Column():
                        pdf_files = gr.Files(
                            label="上传 PDFs",
                            file_types=[".pdf"],
                            type="binary",
                        )
                        gr.Markdown(
                            "*上传一个或多个PDF文件来创建播客*",
                            elem_classes=["file-info"],
                        )
                    with gr.Column():
                        image_files = gr.Files(
                            label="上传图片",
                            file_types=["image"],
                            type="binary",
                        )
                        gr.Markdown(
                            "*上传一个或多个图片文件来创建播客*",
                            elem_classes=["file-info"],
                        )

        # --- Customization --------------------------------------------------
        _section_title("⚙️ 自定义选项")

        with gr.Accordion("自定义选项", open=False):
            # Basic settings
            _subsection_title("📊 基本设置")
            word_count = gr.Slider(
                minimum=500,
                maximum=5000,
                value=2000,
                step=100,
                label="字数统计",
                info="目标字数(用于生成内容)",
            )
            conversation_style = gr.Textbox(
                label="对话风格",
                value="生动活泼,节奏明快,热情洋溢",
                info="用于对话的风格列表(以逗号分隔)",
            )

            # Roles and structure
            _subsection_title("👥 角色设定与结构安排")
            roles_person1 = gr.Textbox(
                label="第一位发言者的角色",
                value="主要负责总结的人",
                info="在对话中,第一个说话人扮演的角色",
            )
            roles_person2 = gr.Textbox(
                label="第二位发言者的角色",
                value="提问者/释疑者",
                info="在对话中,第二个说话人所扮演的角色或承担的任务",
            )
            dialogue_structure = gr.Textbox(
                label="对话结构",
                value="引言,主要内容的概括,总结",
                info="对话结构的各个部分(用逗号隔开)",
            )

            # Podcast identity
            _subsection_title("🎙️ 播客特色")
            podcast_name = gr.Textbox(
                label="播客名",
                value="猛然间",
                info="播客的名字",
            )
            podcast_tagline = gr.Textbox(
                label="播客宣传语",
                value="猛然回首,太匆匆",
                info="播客的宣传语或副标题",
            )
            output_language = gr.Textbox(
                label="输出语言",
                value="Chinese",
                info="播客使用的语言",
            )
            api_key_label = gr.Textbox(
                label="自定义基于云的 LLM",
                value="GEMINI_API_KEY",
                info="可选,默认使用 Gemini,如使用 OPENAI,上面填入 'OPENAI_API_KEY' 并保证设置好环境变量且设置好下面的模型",
            )
            llm_model_name = gr.Textbox(
                label="设置好对应自定义基于云的 LLM 模型",
                value="gemini-1.5-pro-latest",
                info="可选,配合上面的参数,默认是 Gemini 的 gemini-1.5-pro-latest,默认 OPENAI 可支持模型 api.168369.xyz/v1/models 获取",
            )
            longform = gr.Checkbox(
                label="长篇模式",
                value=False,
                info="启用长篇内容生成模式",
            )

            # Voice settings
            _subsection_title("🗣️ 语音设置")
            tts_model = gr.Radio(
                choices=["openai", "elevenlabs", "edge"],
                value="openai",
                label="文本转语音模型",
                info="选择语音合成模型 (edge 免费但音质较差, 其他模型音质更好但需申请 API keys)",
            )

            # Advanced settings
            _subsection_title("🔧 高级选项")
            creativity_level = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.7,
                step=0.1,
                label="创意等级",
                info="调节生成对话的创意程度(0 为注重事实,1 为更具创意)",
            )
            user_instructions = gr.Textbox(
                label="个性化指令",
                value="",
                lines=2,
                placeholder="在此处添加你希望AI遵循的具体指令,以控制对话的走向和内容...",
                info="一些额外的指令,用来帮助AI更好地理解你想要聊天的内容和方向",
            )

        # --- Output ---------------------------------------------------------
        _section_title("🎵 生成结果")

        with gr.Group():
            generate_btn = gr.Button("🎙️ 生成播客", variant="primary")
            audio_output = gr.Audio(
                type="filepath",
                label="生成的播客",
            )

    # Component order here must match the positional parameters of process_inputs.
    generation_inputs = [
        text_input,
        urls_input,
        pdf_files,
        image_files,
        gemini_key,
        openai_key,
        openai_base_url,
        elevenlabs_key,
        word_count,
        conversation_style,
        roles_person1,
        roles_person2,
        dialogue_structure,
        podcast_name,
        podcast_tagline,
        output_language,
        tts_model,
        creativity_level,
        user_instructions,
        api_key_label,
        llm_model_name,
        longform,
    ]

    # Run the generation when the button is pressed.
    generate_btn.click(
        fn=process_inputs,
        inputs=generation_inputs,
        outputs=audio_output,
    )

    # The theme toggle runs purely in the browser: no Python callback,
    # no inputs, no outputs.
    theme_btn.click(
        fn=None,
        inputs=None,
        outputs=None,
        js=_TOGGLE_DARK_JS,
    )
if __name__ == "__main__":
    # Script entry point: enable the request queue on the app, then launch it
    # with share=True.
    queued_demo = demo.queue()
    queued_demo.launch(share=True)