File size: 8,865 Bytes
74245b5 b8d465b 38f82cf b8d465b 97778aa 031a7f3 97778aa 3c8678d b8d465b 07cb903 b8d465b 97778aa 1668d21 3c8678d 1668d21 3c8678d 1668d21 07cb903 3c8678d d982b19 b8d465b d982b19 5ba3e1d d982b19 5ba3e1d d982b19 5ba3e1d 07cb903 d982b19 07cb903 d982b19 5ba3e1d 07cb903 5ba3e1d 07cb903 5ba3e1d 07cb903 d982b19 5ba3e1d 07cb903 d982b19 b8d465b 07cb903 b8d465b 97778aa 1668d21 07cb903 97778aa 07cb903 97778aa 2148d28 07cb903 2148d28 97778aa 07cb903 1e7b36b 97778aa 72cdd21 b8d465b 97778aa 07cb903 97778aa b8d465b 97778aa b8d465b d982b19 ca79387 b8d465b ca79387 b8d465b ca79387 b8d465b d982b19 b8d465b 97778aa b8d465b d982b19 97778aa d982b19 97778aa 74245b5 a727789 07cb903 b8d465b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 |
import gradio as gr
import google.generativeai as genai
import numpy as np
import io
import re
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download, login
import torchaudio
from torchaudio.functional import resample
import threading
import queue
import os
import logging
# Send INFO-and-above records through the root handler; the module logger
# created here is what the audio-rendering helpers report through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Start from the CPU and switch to CUDA only when torch reports it available.
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")

# Filled in by load_model(); both remain None until it has been called.
model, tokenizer = None, None
def load_model():
    """Download the Orpheus checkpoint and load it into the module globals.

    Reads the Hugging Face access token from the ``HUGGINGFACE_TOKEN``
    environment variable, logs in with it, pre-fetches the config and
    safetensors weight files of ``canopylabs/orpheus-3b-0.1-ft``, then assigns
    the loaded model (moved to ``device``) and its tokenizer to the
    module-level ``model`` and ``tokenizer`` names.

    Raises:
        ValueError: If ``HUGGINGFACE_TOKEN`` is unset or empty.
    """
    global model, tokenizer
    print("Loading Orpheus model...")
    model_name = "canopylabs/orpheus-3b-0.1-ft"

    hf_token = os.environ.get("HUGGINGFACE_TOKEN")
    if not hf_token:
        raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")
    login(token=hf_token)

    snapshot_download(
        repo_id=model_name,
        # `token` is the current keyword; `use_auth_token` is deprecated in
        # huggingface_hub.
        token=hf_token,
        allow_patterns=[
            "config.json",
            "*.safetensors",
            "model.safetensors.index.json",
        ],
        # Tokenizer files are skipped in this pre-fetch; the
        # AutoTokenizer.from_pretrained() call below requests the tokenizer
        # by repo name on its own.
        ignore_patterns=[
            "optimizer.pt",
            "pytorch_model.bin",
            "training_args.bin",
            "scheduler.pt",
            "tokenizer.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "vocab.json",
            "merges.txt",
            "tokenizer.*",
        ],
    )

    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Orpheus model and tokenizer loaded to {device}")
def _build_podcast_prompt(content, duration, num_hosts):
    """Return the Gemini prompt for a one-host monologue or a two-host dialogue."""
    # The two variants share every instruction; only these phrases differ.
    if num_hosts == 1:
        cast, asides = "one person", "thoughts"
        layout = "a monologue"
        unit, piece, subject = "paragraph", "monologue", "content"
    else:
        cast, asides = "two people", "chit-chat"
        layout = "alternating lines of dialogue"
        unit, piece, subject = "line", "dialogue", "conversation"

    return f"""
Create a podcast script for {cast} discussing the following content:
{content}
The podcast should last approximately {duration}. Include natural speech patterns,
humor, and occasional off-topic {asides}. Use occasional speech fillers like um, ah,
yes, I see, Ok now. Vary the emotional tone.
Format the script as {layout} without speaker labels.
Separate each {unit} with a blank line.
Only include the {piece} with proper punctuation and emotion tags enclosed in angle brackets < >.
For example, use <chuckle> instead of "chuckle".
Ensure the {subject} flows naturally and stays relevant to the topic.
Limit the script length to match the requested duration of {duration}.
To use emotion tags naturally in generative AI speech, incorporate them sparingly at key moments
to enhance the dialogue's emotional context.
Always place tags like <laugh> for joy, <sigh> for frustration or relief, <chuckle> for mild amusement,
<cough> or <sniffle> for discomfort, <groan> for displeasure, <yawn> for tiredness, and <gasp> for surprise.
For example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>.
Oh well, at least I finished the project <chuckle>."
Important: Ensure all emotion tags are properly enclosed in angle brackets < > to distinguish them from regular text
"""


def _clean_script(text):
    """Strip characters outside the allowed set while keeping contractions intact."""
    # Fold the typographic apostrophe into ASCII first so that "can’t" is
    # kept as "can't" instead of collapsing to "cant".
    text = text.replace("\u2019", "'")
    return re.sub(r"[^a-zA-Z0-9\s.,?!<>']", '', text)


def generate_podcast_script(api_key, content, duration, num_hosts):
    """Ask Gemini for a podcast script about ``content`` and return it cleaned.

    Args:
        api_key: Gemini API key passed to ``genai.configure``.
        content: Source material the podcast should discuss.
        duration: Human-readable target length, inserted verbatim into the
            prompt (e.g. ``"1-5 min"``).
        num_hosts: ``1`` selects the monologue prompt; any other value
            selects the two-person dialogue prompt.

    Returns:
        The response text reduced to ASCII letters, digits, whitespace,
        ``. , ? !``, apostrophes and the ``< >`` used by emotion tags.

    Raises:
        ValueError: If ``content`` is empty or only whitespace.
    """
    if not content or not content.strip():
        # Fail before spending an API call on a prompt with nothing to discuss.
        raise ValueError("No content was provided to generate a podcast script from.")

    genai.configure(api_key=api_key)
    # Named distinctly so it cannot be confused with the module-level Orpheus
    # `model` used for speech synthesis.
    gemini_model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
    response = gemini_model.generate_content(
        _build_podcast_prompt(content, duration, num_hosts)
    )
    return _clean_script(response.text)
def text_to_speech(text, voice):
    """Run one line of script through the loaded model and return audio.

    Args:
        text: The line to synthesize.
        voice: Speaker name. When non-empty it is prepended to the prompt as
            ``"<voice>: <text>"``; previously this argument was ignored, so
            every line was generated without any speaker selection.

    Returns:
        Whatever ``mel_to_audio`` produces for the generated sequence.

    Raises:
        ValueError: If ``load_model()`` has not populated ``model`` and
            ``tokenizer`` yet.
    """
    if tokenizer is None or model is None:
        raise ValueError("Model or tokenizer not initialized. Please call load_model() first.")

    # NOTE(review): the "<voice>: <text>" form follows the prompt format
    # published for the Orpheus fine-tuned checkpoints; confirm against the
    # model card if the checkpoint changes.
    prompt = f"{voice}: {text}" if voice else text

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=256)

    # First (only) sequence of the batch, moved to host memory for decoding.
    mel = output[0].cpu().numpy()
    return mel_to_audio(mel)
def mel_to_audio(mel):
    """Placeholder decoder: ignore ``mel`` and return one second of silence.

    The result is a zero-filled float32 array of 24000 samples.
    """
    # Placeholder implementation
    sample_count = 24000  # 1 second of silence
    silence = np.zeros((sample_count,), dtype=np.float32)
    return silence
def process_audio_segment(line, voice, result_queue):
    """Synthesize one script line and report the outcome on ``result_queue``.

    One item is put on the queue per call: the audio returned by
    ``text_to_speech`` on success, or ``None`` when synthesis raised.

    Args:
        line: Text of the line to synthesize.
        voice: Voice name forwarded to ``text_to_speech``.
        result_queue: Queue that receives the audio or the ``None`` marker.
    """
    try:
        audio = text_to_speech(line, voice)
    except Exception as e:
        # logger.exception keeps the same message but also records the
        # traceback, which the plain error() call discarded.
        logger.exception("Error processing audio segment: %s", e)
        result_queue.put(None)
    else:
        result_queue.put(audio)
def render_podcast(api_key, script, voice1, voice2, num_hosts):
    """Synthesize every non-blank script line and join the audio in script order.

    Args:
        api_key: Unused here; accepted so the signature matches the UI wiring.
        script: Script text; each non-blank line becomes one audio segment.
        voice1: Voice for every line when ``num_hosts == 1``, otherwise for
            the even-indexed lines (0, 2, ...).
        voice2: Voice for the odd-indexed lines when there are two hosts.
        num_hosts: Number of hosts (1 or 2).

    Returns:
        A ``(24000, samples)`` tuple. When no segment could be generated the
        samples are one second of float32 silence.
    """
    lines = [line for line in (script or "").split('\n') if line.strip()]

    # Each line gets its own queue. A single shared queue is filled in
    # thread-completion order, which can differ from script order and would
    # scramble the podcast when the segments are concatenated.
    # NOTE(review): one thread is started per line with no upper bound.
    jobs = []
    for i, line in enumerate(lines):
        voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
        line_queue = queue.Queue()
        thread = threading.Thread(target=process_audio_segment, args=(line, voice, line_queue))
        thread.start()
        jobs.append((thread, line_queue))

    audio_segments = []
    for thread, line_queue in jobs:
        thread.join()
        try:
            # Non-blocking: the worker has finished, so anything it reported
            # is already there; never hang if it reported nothing.
            segment = line_queue.get_nowait()
        except queue.Empty:
            continue
        if segment is not None:
            audio_segments.append(segment)

    if not audio_segments:
        logger.warning("No valid audio segments were generated.")
        return (24000, np.zeros(24000, dtype=np.float32))

    podcast_audio = np.concatenate(audio_segments)
    return (24000, podcast_audio)
# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# AI Podcast Generator")

    api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")

    with gr.Row():
        content_input = gr.Textbox(label="Paste your content or upload a document")
        # NOTE(review): no event handler below reads this component, so an
        # uploaded document currently has no effect on the generated script.
        document_upload = gr.File(label="Upload Document")

    # Pre-select a duration: with nothing chosen the Radio passes None and the
    # prompt would read "approximately None".
    duration = gr.Radio(
        ["1-5 min", "5-10 min", "10-15 min"],
        label="Estimated podcast duration",
        value="1-5 min",
    )
    num_hosts = gr.Radio([1, 2], label="Number of podcast hosts", value=2)

    voice_options = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]

    with gr.Row():
        voice1_select = gr.Dropdown(label="Select Voice 1", choices=voice_options, value="tara")
    with gr.Row():
        voice2_select = gr.Dropdown(label="Select Voice 2", choices=voice_options, value="leo")

    generate_btn = gr.Button("Generate Script")
    script_output = gr.Textbox(label="Generated Script", lines=10)

    render_btn = gr.Button("Render Podcast")
    audio_output = gr.Audio(label="Generated Podcast")

    def generate_script_wrapper(api_key, content, duration, num_hosts):
        """Forward the form values to generate_podcast_script()."""
        return generate_podcast_script(api_key, content, duration, num_hosts)

    def render_podcast_wrapper(api_key, script, voice1, voice2, num_hosts):
        """Forward the form values to render_podcast()."""
        return render_podcast(api_key, script, voice1, voice2, num_hosts)

    generate_btn.click(
        generate_script_wrapper,
        inputs=[api_key_input, content_input, duration, num_hosts],
        outputs=script_output,
    )
    render_btn.click(
        render_podcast_wrapper,
        inputs=[api_key_input, script_output, voice1_select, voice2_select, num_hosts],
        outputs=audio_output,
    )

    def update_second_voice_visibility(num_hosts):
        """Show the second voice selector only when two hosts are chosen."""
        return gr.update(visible=num_hosts == 2)

    num_hosts.change(update_second_voice_visibility, inputs=[num_hosts], outputs=[voice2_select])
# Script entry point: the stray trailing "|" after demo.launch() was a syntax
# error and has been removed.
if __name__ == "__main__":
    load_model()  # Ensure the model is loaded before launching the interface
    demo.launch()