Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# app.py
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
import torch
|
5 |
+
import torchaudio
|
6 |
+
import google.generativeai as genai
|
7 |
+
from transformers import AutoProcessor, AutoModel
|
8 |
+
import numpy as np
|
9 |
+
import os
|
10 |
+
import json
|
11 |
+
|
12 |
+
# Initialize Gemini AI.
# NOTE(review): the API key was previously hard-coded as a placeholder string;
# read it from the environment instead so a real key is never committed to
# source control. The placeholder is kept as a fallback to preserve the
# original "unconfigured" behavior.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY", "YOUR_GEMINI_API_KEY"))
model = genai.GenerativeModel('gemini-pro')

# Initialize the F5-TTS model: `processor` prepares text inputs,
# `f5_model` synthesizes speech from them.
processor = AutoProcessor.from_pretrained("SWivid/F5-TTS")
f5_model = AutoModel.from_pretrained("SWivid/F5-TTS")
def clone_voice(audio_file):
    """Load a reference audio sample intended for voice cloning.

    Currently a stub: the file is decoded (which also validates that it is
    readable audio), but no cloning is actually performed yet.
    """
    signal, rate = torchaudio.load(audio_file)
    # TODO: real voice-cloning logic goes here; for now we only report success.
    return "Voice cloned successfully"
def generate_podcast_script(content, duration):
    """Ask Gemini to write a two-host podcast script about *content*.

    The prompt instructs the model to tag every line with an [emotion]
    marker and to target roughly *duration* of speech. Returns the raw
    script text produced by the model.
    """
    request = f"""
    Create a podcast script for two people discussing the following content:
    {content}

    The podcast should last approximately {duration}. Include natural speech patterns,
    humor, and occasional off-topic chit-chat. Use speech fillers like "um", "ah",
    "yes", "I see", "Ok now". Vary the emotional tone (e.g., regular, happy, sad, surprised)
    and indicate these in [square brackets]. Format the script as follows:

    Host 1: [emotion] Dialog
    Host 2: [emotion] Dialog

    Ensure the conversation flows naturally and stays relevant to the topic.
    """
    return model.generate_content(request).text
def text_to_speech(text, speaker_id):
    """Synthesize *text* with the F5-TTS model using voice *speaker_id*.

    Returns the generated waveform as a NumPy array (moved to CPU first).
    """
    encoded = processor(text=text, return_tensors="pt")
    waveform = f5_model.generate_speech(
        encoded["input_ids"], speaker_id=speaker_id, vocoder_output=True
    )
    return waveform.cpu().numpy()
def create_podcast(content, duration, voice1, voice2):
    """Generate a script and synthesize it into one audio track end-to-end.

    Args:
        content: source material the hosts should discuss.
        duration: requested length label (e.g. "5-10 min").
        voice1, voice2: reference voice samples; currently unused — fixed
            speaker ids 0 and 1 are used until voice cloning is wired up.

    Returns:
        A ``(sample_rate, audio)`` tuple suitable for ``gr.Audio``.

    Raises:
        ValueError: if the generated script contains no "Host 1:"/"Host 2:"
            lines (previously this crashed inside np.concatenate with an
            opaque "need at least one array to concatenate" error).
    """
    script = generate_podcast_script(content, duration)
    audio_segments = []

    for line in script.split('\n'):
        if line.startswith("Host 1:"):
            audio_segments.append(text_to_speech(line[7:], speaker_id=0))
        elif line.startswith("Host 2:"):
            audio_segments.append(text_to_speech(line[7:], speaker_id=1))

    # Fail with a clear message instead of letting np.concatenate([]) raise.
    if not audio_segments:
        raise ValueError("Generated script contained no 'Host 1:'/'Host 2:' lines")

    podcast_audio = np.concatenate(audio_segments)
    # NOTE(review): 22050 Hz is assumed — confirm against the F5-TTS output rate.
    return (22050, podcast_audio)
def gradio_interface(content, duration, voice1, voice2):
    """Gradio callback for the "Generate Script" button.

    Produces only the script text; audio rendering is a separate step.
    voice1/voice2 are accepted to match the UI wiring but are not used here.
    """
    return generate_podcast_script(content, duration)
def render_podcast(script, voice1, voice2):
    """Convert an already-generated script into a single audio track.

    Args:
        script: script text whose speakable lines are prefixed with
            "Host 1:" or "Host 2:".
        voice1, voice2: reference voice samples; currently unused — fixed
            speaker ids 0 and 1 are used instead.

    Returns:
        A ``(sample_rate, audio)`` tuple suitable for ``gr.Audio``.

    Raises:
        ValueError: if no "Host 1:"/"Host 2:" lines are found (previously
            this crashed inside np.concatenate with an opaque error).
    """
    audio_segments = []

    for line in script.split('\n'):
        if line.startswith("Host 1:"):
            audio_segments.append(text_to_speech(line[7:], speaker_id=0))
        elif line.startswith("Host 2:"):
            audio_segments.append(text_to_speech(line[7:], speaker_id=1))

    # Guard: np.concatenate raises on an empty list; surface a clear error.
    if not audio_segments:
        raise ValueError("Script contains no 'Host 1:' or 'Host 2:' lines to render")

    podcast_audio = np.concatenate(audio_segments)
    # NOTE(review): 22050 Hz is assumed — confirm against the F5-TTS output rate.
    return (22050, podcast_audio)
# --- Gradio user interface ---------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# AI Podcast Generator")

    # Source material: free text and/or an uploaded document.
    with gr.Row():
        content_box = gr.Textbox(label="Paste your content or upload a document")
        doc_file = gr.File(label="Upload Document")

    length_choice = gr.Radio(
        ["1-5 min", "5-10 min", "10-15 min"],
        label="Estimated podcast duration",
    )

    # Reference voice samples for the two hosts.
    with gr.Row():
        host1_voice = gr.Audio(label="Upload Voice 1", type="filepath")
        host2_voice = gr.Audio(label="Upload Voice 2", type="filepath")

    script_btn = gr.Button("Generate Script")
    script_box = gr.Textbox(label="Generated Script", lines=10)

    render_button = gr.Button("Render Podcast")
    podcast_audio = gr.Audio(label="Generated Podcast")

    # Two-step flow: generate the script first, then render it to audio.
    script_btn.click(
        gradio_interface,
        inputs=[content_box, length_choice, host1_voice, host2_voice],
        outputs=script_box,
    )
    render_button.click(
        render_podcast,
        inputs=[script_box, host1_voice, host2_voice],
        outputs=podcast_audio,
    )

demo.launch()