import gradio as gr
import google.generativeai as genai
import numpy as np
import re
import torch
import torchaudio
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download, login
import logging
import os
import spaces
import warnings
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
def get_device():
if torch.cuda.is_available():
return torch.device("cuda")
return torch.device("cpu")
device = get_device()
logger.info(f"Using device: {device}")
model = None
tokenizer = None
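# The Orpheus model and tokenizer are module-level globals so GPU-decorated
# functions can load them on demand and reuse the same weights across calls.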
@spaces.GPU()
def load_model():
global model, tokenizer
logger.info("Loading Orpheus model...")
model_name = "canopylabs/orpheus-3b-0.1-ft"
hf_token = os.environ.get("HUGGINGFACE_TOKEN")
if not hf_token:
raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")
try:
login(token=hf_token)
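        # Pre-fetch only the weight files; tokenizer files are excluded here and
        # fetched separately by AutoTokenizer.from_pretrained below.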
snapshot_download(
repo_id=model_name,
            token=hf_token,  # 'use_auth_token' is deprecated in huggingface_hub; 'token' is the current kwarg
allow_patterns=[
"config.json",
"*.safetensors",
"model.safetensors.index.json",
],
ignore_patterns=[
"optimizer.pt",
"pytorch_model.bin",
"training_args.bin",
"scheduler.pt",
"tokenizer.json",
"tokenizer_config.json",
"special_tokens_map.json",
"vocab.json",
"merges.txt",
"tokenizer.*"
]
)
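        # bfloat16 halves memory on GPU; fall back to float32 on CPU, where
        # bfloat16 kernels are often slow or unsupported.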
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32 if device.type == 'cpu' else torch.bfloat16)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
logger.info(f"Orpheus model and tokenizer loaded to {device}")
except Exception as e:
logger.error(f"Error loading model: {str(e)}")
raise
def generate_podcast_script(api_key, content, uploaded_file, duration, num_hosts):
try:
genai.configure(api_key=api_key)
        # Name the Gemini client distinctly so it does not shadow the global Orpheus `model`
        gemini_model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
combined_content = content or ""
        if uploaded_file:
            # gr.File yields a filepath string in recent Gradio releases;
            # older versions return a tempfile-like object with a .name attribute
            file_path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name
            with open(file_path, 'r', encoding='utf-8') as f:
                file_content = f.read()
            combined_content += ("\n" + file_content) if combined_content else file_content
prompt = f"""
Create a podcast script for {'one person' if num_hosts == 1 else 'two people'} discussing:
{combined_content}
Duration: {duration}. Include natural speech, humor, and occasional off-topic thoughts.
Use speech fillers like um, ah. Vary emotional tone.
Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
Use emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
Ensure content flows naturally and stays on topic. Match the script length to {duration}.
"""
        response = gemini_model.generate_content(prompt)
        # Keep apostrophes and hyphens so contractions like "can't" survive sanitization
        return re.sub(r"[^a-zA-Z0-9\s.,?!<>'\-]", '', response.text)
except Exception as e:
logger.error(f"Error generating podcast script: {str(e)}")
raise
@spaces.GPU()
def text_to_speech(text, voice):
global model, tokenizer
try:
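        # Load lazily on first call (e.g. if the startup load was skipped or failed)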
if model is None or tokenizer is None:
load_model()
# Remove emotion tags for TTS processing
clean_text = re.sub(r'<[^>]+>', '', text)
        # Orpheus fine-tuned checkpoints are commonly prompted as "voice: text";
        # this is an assumption -- adjust if your checkpoint expects another format
        prompt = f"{voice}: {clean_text}"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
output = model.generate(**inputs, max_new_tokens=256)
        # NOTE: model.generate returns audio-codec token IDs; a faithful Orpheus
        # pipeline would decode them with the matching neural codec (SNAC in the
        # reference implementation). As a rough placeholder, fold the normalized
        # token values into a magnitude spectrogram and invert it with Griffin-Lim.
        tokens = output[0].detach().cpu().float()
        # Min-max normalize into [0, 1] (epsilon guards against a constant sequence)
        tokens = (tokens - tokens.min()) / (tokens.max() - tokens.min() + 1e-8)
        # GriffinLim expects a (freq, time) spectrogram with freq == n_fft // 2 + 1,
        # so zero-pad the flat token sequence and reshape it to that layout
        n_fft = 2048
        freq_bins = n_fft // 2 + 1
        time_frames = max(1, -(-tokens.numel() // freq_bins))  # ceiling division
        spec = torch.zeros(freq_bins * time_frames)
        spec[: tokens.numel()] = tokens
        spec = spec.view(freq_bins, time_frames)
        griffin_lim = torchaudio.transforms.GriffinLim(n_fft=n_fft, n_iter=10)
        audio = griffin_lim(spec.unsqueeze(0))
        # Convert to a clipped float32 numpy array in Gradio's (sample_rate, data) format
        audio_np = audio.squeeze().numpy()
        audio_np = np.clip(audio_np, -1.0, 1.0)
        return (24000, audio_np.astype(np.float32))  # Orpheus targets 24 kHz audio
except Exception as e:
logger.error(f"Error in text_to_speech: {str(e)}")
raise
@spaces.GPU()
def render_podcast(api_key, script, voice1, voice2, num_hosts):
try:
lines = [line for line in script.split('\n') if line.strip()]
audio_segments = []
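        # With two hosts, alternate voices line by line; a single host always uses voice 1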
for i, line in enumerate(lines):
voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
try:
_, audio = text_to_speech(line, voice)
audio_segments.append(audio)
except Exception as e:
logger.error(f"Error processing audio segment: {str(e)}")
if not audio_segments:
logger.warning("No valid audio segments were generated.")
            return (24000, np.zeros(24000, dtype=np.int16))  # one second of silence, matching the int16 output below
podcast_audio = np.concatenate(audio_segments)
# Ensure the audio is in the correct format for Gradio
podcast_audio = np.clip(podcast_audio, -1, 1)
podcast_audio = (podcast_audio * 32767).astype(np.int16)
return (24000, podcast_audio)
except Exception as e:
logger.error(f"Error rendering podcast: {str(e)}")
raise
with gr.Blocks() as demo:
gr.Markdown("# AI Podcast Generator")
api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")
with gr.Row():
content_input = gr.Textbox(label="Paste your content (optional)")
document_upload = gr.File(label="Upload Document (optional)")
duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")
num_hosts = gr.Radio([1, 2], label="Number of podcast hosts", value=2)
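    # Voice names shipped with the Orpheus fine-tuned checkpoint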
voice_options = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
voice1_select = gr.Dropdown(label="Select Voice 1", choices=voice_options, value="tara")
voice2_select = gr.Dropdown(label="Select Voice 2", choices=voice_options, value="leo")
generate_btn = gr.Button("Generate Script")
script_output = gr.Textbox(label="Generated Script", lines=10)
render_btn = gr.Button("Render Podcast")
audio_output = gr.Audio(label="Generated Podcast")
generate_btn.click(generate_podcast_script,
inputs=[api_key_input, content_input, document_upload, duration, num_hosts],
outputs=script_output)
render_btn.click(render_podcast,
inputs=[api_key_input, script_output, voice1_select, voice2_select, num_hosts],
outputs=audio_output)
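    # Hide the second voice picker when only one host is selected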
num_hosts.change(lambda x: gr.update(visible=x == 2),
inputs=[num_hosts],
outputs=[voice2_select])
if __name__ == "__main__":
try:
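        # Warm the model at startup so the first request does not pay the load cost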
load_model()
demo.launch()
except Exception as e:
logger.error(f"Error launching the application: {str(e)}") |