bluenevus's picture
Create app.py
74245b5 verified
raw
history blame
3.96 kB
# app.py
import gradio as gr
import torch
import torchaudio
import google.generativeai as genai
from transformers import AutoProcessor, AutoModel
import numpy as np
import os
import json
# Initialize Gemini AI
# The key is read from the GEMINI_API_KEY environment variable so a real
# secret never has to be committed to the repository; the original
# placeholder string is kept as the fallback when the variable is unset.
genai.configure(api_key=os.environ.get('GEMINI_API_KEY', 'YOUR_GEMINI_API_KEY'))
model = genai.GenerativeModel('gemini-pro')

# Initialize F5-TTS model
# Both objects are loaded once at import time and shared by text_to_speech().
processor = AutoProcessor.from_pretrained("SWivid/F5-TTS")
f5_model = AutoModel.from_pretrained("SWivid/F5-TTS")
def clone_voice(audio_file):
    """Load a reference recording and report success (placeholder).

    The file is decoded with ``torchaudio.load`` but the decoded audio is
    not used yet; the actual cloning logic is still to be written.

    Args:
        audio_file: Path (or file-like object) handed to ``torchaudio.load``.

    Returns:
        The fixed status string ``"Voice cloned successfully"``.
    """
    # The load is kept so the call still fails on input torchaudio cannot
    # read; its results are intentionally discarded for now.
    torchaudio.load(audio_file)
    # TODO: implement voice cloning logic here.
    return "Voice cloned successfully"
def generate_podcast_script(content, duration):
    """Ask the Gemini model for a two-host podcast script about *content*.

    Args:
        content: Source material the hosts should discuss; interpolated
            verbatim into the prompt.
        duration: Target length description (e.g. ``"1-5 min"``);
            interpolated verbatim into the prompt.

    Returns:
        The ``text`` attribute of the response from the module-level
        ``model``. The prompt asks for lines of the form
        ``Host N: [emotion] Dialog``.
    """
    # The prompt body is kept flush-left: any indentation inside the
    # triple-quoted string would become part of the text sent to the model.
    script_request = f"""
Create a podcast script for two people discussing the following content:
{content}
The podcast should last approximately {duration}. Include natural speech patterns,
humor, and occasional off-topic chit-chat. Use speech fillers like "um", "ah",
"yes", "I see", "Ok now". Vary the emotional tone (e.g., regular, happy, sad, surprised)
and indicate these in [square brackets]. Format the script as follows:
Host 1: [emotion] Dialog
Host 2: [emotion] Dialog
Ensure the conversation flows naturally and stays relevant to the topic.
"""
    return model.generate_content(script_request).text
def text_to_speech(text, speaker_id):
    """Synthesize *text* with the module-level F5-TTS model.

    Args:
        text: The utterance to speak.
        speaker_id: Identifier forwarded to ``f5_model.generate_speech`` to
            select the voice.

    Returns:
        The generated speech as a NumPy array on the CPU.
        NOTE(review): sample rate and array shape are determined by the
        model/vocoder and are not visible here - confirm against callers.
    """
    inputs = processor(text=text, return_tensors="pt")
    # Pure inference: disable autograd so no graph is recorded, and detach
    # before converting, since Tensor.numpy() refuses tensors that still
    # require grad.
    with torch.no_grad():
        speech = f5_model.generate_speech(
            inputs["input_ids"],
            speaker_id=speaker_id,
            vocoder_output=True,
        )
    return speech.detach().cpu().numpy()
def create_podcast(content, duration, voice1, voice2):
    """Generate a script for *content* and render it to audio in one step.

    Args:
        content: Source material for the podcast.
        duration: Target length description passed to the script prompt.
        voice1: Reference audio for Host 1, forwarded to ``render_podcast``.
        voice2: Reference audio for Host 2, forwarded to ``render_podcast``.

    Returns:
        The ``(sample_rate, samples)`` tuple produced by ``render_podcast``.

    Raises:
        ValueError: If the generated script yields no audio segments.
    """
    script = generate_podcast_script(content, duration)
    # Rendering used to be copy-pasted here; delegating keeps this path and
    # the "Render Podcast" button on a single implementation.
    return render_podcast(script, voice1, voice2)
def gradio_interface(content, duration, voice1, voice2):
    """Click handler for the "Generate Script" button.

    Args:
        content: Text from the content textbox.
        duration: Selected duration option.
        voice1: Uploaded voice 1; accepted to match the handler's inputs but
            not used here.
        voice2: Uploaded voice 2; accepted to match the handler's inputs but
            not used here.

    Returns:
        The script text produced by ``generate_podcast_script``.
    """
    return generate_podcast_script(content, duration)
def render_podcast(script, voice1, voice2):
    """Turn a podcast script into one continuous audio track.

    Every line beginning with ``Host 1:`` or ``Host 2:`` (after surrounding
    whitespace is removed) is synthesized with ``text_to_speech`` using
    speaker id 0 or 1 respectively. All other lines are ignored. The clips
    are joined in script order.

    Args:
        script: Script text with one utterance per line. ``None`` is treated
            as an empty script.
        voice1: Reference audio for Host 1. Currently unused.
        voice2: Reference audio for Host 2. Currently unused.

    Returns:
        A ``(sample_rate, samples)`` tuple. The sample rate is hard-coded to
        22050 Hz. NOTE(review): this is an assumption about the TTS model's
        output rate - confirm.

    Raises:
        ValueError: If the script contains no ``Host 1:`` / ``Host 2:`` line
            with dialog, so there is nothing to render.
    """
    speaker_ids = {"Host 1:": 0, "Host 2:": 1}
    audio_segments = []
    for raw_line in (script or "").splitlines():
        # Tolerate indentation and trailing whitespace/CR around a line.
        line = raw_line.strip()
        for prefix, speaker_id in speaker_ids.items():
            if not line.startswith(prefix):
                continue
            dialog = line[len(prefix):].strip()
            # A bare "Host 1:" label carries nothing to synthesize.
            if dialog:
                audio_segments.append(
                    text_to_speech(dialog, speaker_id=speaker_id)
                )
            break

    # np.concatenate([]) fails with an opaque message; report the real cause.
    if not audio_segments:
        raise ValueError(
            "Nothing to render: the script has no 'Host 1:' or 'Host 2:' "
            "lines with dialog."
        )
    return (22050, np.concatenate(audio_segments))
# Gradio Interface
# Two-step UI: "Generate Script" fills the script textbox via
# gradio_interface(), then "Render Podcast" sends whatever is in that textbox
# to render_podcast() and plays the result.
# NOTE(review): the nesting below (which components sit inside each Row) was
# reconstructed from statement order - confirm against the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("# AI Podcast Generator")
    with gr.Row():
        content_input = gr.Textbox(label="Paste your content or upload a document")
        # NOTE(review): document_upload is not passed to either click handler
        # below, so an uploaded document is currently ignored.
        document_upload = gr.File(label="Upload Document")
    duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")
    with gr.Row():
        # type="filepath" hands the handlers a path to the uploaded audio.
        voice1_upload = gr.Audio(label="Upload Voice 1", type="filepath")
        voice2_upload = gr.Audio(label="Upload Voice 2", type="filepath")
    generate_btn = gr.Button("Generate Script")
    script_output = gr.Textbox(label="Generated Script", lines=10)
    render_btn = gr.Button("Render Podcast")
    audio_output = gr.Audio(label="Generated Podcast")
    # The script textbox is both the output of step 1 and the input of step 2.
    generate_btn.click(gradio_interface, inputs=[content_input, duration, voice1_upload, voice2_upload], outputs=script_output)
    render_btn.click(render_podcast, inputs=[script_output, voice1_upload, voice2_upload], outputs=audio_output)
# Start the web server for the app.
demo.launch()