Spaces:

MicroHealth
/

ai-podcast-builder

Paused

App Files Files Community

bluenevus committed on Apr 15

Commit

db5919c

verified ·

1 Parent(s): c10cafd

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -35

app.py CHANGED Viewed

@@ -3,14 +3,16 @@ import google.generativeai as genai
 import numpy as np
 import re
 import torch
-import torchaudio
-import torchaudio.functional as F
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import snapshot_download, login
 import logging
 import os
 import spaces
 import warnings
 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
@@ -21,19 +23,22 @@ warnings.filterwarnings("ignore", category=UserWarning)
 warnings.filterwarnings("ignore", category=RuntimeWarning)
 def get_device():
-    if torch.cuda.is_available():
-        return torch.device("cuda")
-    return torch.device("cpu")
 device = get_device()
 logger.info(f"Using device: {device}")
 model = None
 tokenizer = None
 @spaces.GPU()
 def load_model():
-    global model, tokenizer
     logger.info("Loading Orpheus model...")
     model_name = "canopylabs/orpheus-3b-0.1-ft"
@@ -67,7 +72,7 @@ def load_model():
             ]
         )
-        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32 if device.type == 'cpu' else torch.bfloat16)
         model.to(device)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         logger.info(f"Orpheus model and tokenizer loaded to {device}")
@@ -108,42 +113,102 @@ def generate_podcast_script(api_key, content, uploaded_file, duration, num_hosts
         logger.error(f"Error generating podcast script: {str(e)}")
         raise
 @spaces.GPU()
-def text_to_speech(text, voice):
-    global model, tokenizer
     try:
-        if model is None or tokenizer is None:
-            load_model()
-        # Remove emotion tags for TTS processing
-        clean_text = re.sub(r'<[^>]+>', '', text)
-        inputs = tokenizer(clean_text, return_tensors="pt").to(device)
         with torch.no_grad():
-            output = model.generate(**inputs, max_new_tokens=256)
-        # Convert output tensor to mel spectrogram
-        mel = output[0].cpu()
-        # Reshape mel to match expected dimensions
-        n_mels = 80  # Typical number of mel bands
-        time_dim = mel.shape[0]
-        mel_reshaped = mel.view(n_mels, -1)
-        # Normalize the mel spectrogram
-        mel_reshaped = (mel_reshaped - mel_reshaped.min()) / (mel_reshaped.max() - mel_reshaped.min())
-        # Convert mel spectrogram to audio using torchaudio
-        audio = F.griffinlim(mel_reshaped.unsqueeze(0), n_iter=10, n_fft=2048, hop_length=512, win_length=2048)
-        # Convert to numpy array and ensure it's in the correct format
-        audio_np = audio.squeeze().numpy()
-        audio_np = np.clip(audio_np, -1, 1)
-        return (24000, audio_np.astype(np.float32))  # Assuming 24kHz sample rate
     except Exception as e:
         logger.error(f"Error in text_to_speech: {str(e)}")
         raise
 @spaces.GPU()
 def render_podcast(api_key, script, voice1, voice2, num_hosts):
     try:
@@ -153,7 +218,7 @@ def render_podcast(api_key, script, voice1, voice2, num_hosts):
         for i, line in enumerate(lines):
             voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
             try:
-                _, audio = text_to_speech(line, voice)
                 audio_segments.append(audio)
             except Exception as e:
                 logger.error(f"Error processing audio segment: {str(e)}")
@@ -173,6 +238,7 @@ def render_podcast(api_key, script, voice1, voice2, num_hosts):
         logger.error(f"Error rendering podcast: {str(e)}")
         raise
 with gr.Blocks() as demo:
     gr.Markdown("# AI Podcast Generator")

 import numpy as np
 import re
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import snapshot_download, login
 import logging
 import os
 import spaces
 import warnings
+from snac import SNAC
+from dotenv import load_dotenv
+load_dotenv()
 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 warnings.filterwarnings("ignore", category=RuntimeWarning)
 def get_device():
+    return "cuda" if torch.cuda.is_available() else "cpu"
 device = get_device()
 logger.info(f"Using device: {device}")
 model = None
 tokenizer = None
+snac_model = None
 @spaces.GPU()
 def load_model():
+    global model, tokenizer, snac_model
+    logger.info("Loading SNAC model...")
+    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
+    snac_model = snac_model.to(device)
     logger.info("Loading Orpheus model...")
     model_name = "canopylabs/orpheus-3b-0.1-ft"
             ]
         )
+        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
         model.to(device)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         logger.info(f"Orpheus model and tokenizer loaded to {device}")
         logger.error(f"Error generating podcast script: {str(e)}")
         raise
def process_prompt(prompt, voice, tokenizer, device, *,
                   start_token_id=128259, end_token_ids=(128009, 128260)):
    """Tokenize a voice-tagged prompt and wrap it in the model's marker tokens.

    The text fed to the tokenizer is ``f"{voice}: {prompt}"``. The resulting
    ids are framed as ``[start_token_id] + ids + end_token_ids``.

    Args:
        prompt: Text to be spoken.
        voice: Voice name placed in front of the prompt.
        tokenizer: Callable accepting ``(text, return_tensors="pt")`` and
            returning an object with a 2-D ``input_ids`` tensor.
        device: Device (string or ``torch.device``) the results are moved to.
        start_token_id: Id prepended to the prompt (default 128259,
            "start of human").
        end_token_ids: Ids appended to the prompt (default 128009 and 128260,
            "end of text" and "end of human").

    Returns:
        Tuple ``(input_ids, attention_mask)``, both on ``device``. The mask is
        all ones because a single un-padded sequence is produced.
    """
    tagged_prompt = f"{voice}: {prompt}"
    input_ids = tokenizer(tagged_prompt, return_tensors="pt").input_ids

    # Create the marker tensors on the same device as the tokenizer output so
    # the concatenation below never mixes devices.
    start_token = torch.tensor(
        [[start_token_id]], dtype=torch.int64, device=input_ids.device
    )
    end_tokens = torch.tensor(
        [list(end_token_ids)], dtype=torch.int64, device=input_ids.device
    )

    modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
    attention_mask = torch.ones_like(modified_input_ids)  # no padding to mask
    return modified_input_ids.to(device), attention_mask.to(device)
def parse_output(generated_ids):
    """Extract the audio code list for the first sequence of a generation.

    Steps:
      1. If token 128257 occurs, keep only the columns after its last
         occurrence; otherwise keep everything.
         (NOTE(review): 128257 is presumably the start-of-audio marker -
         confirm against the model's token table.)
      2. Drop every 128258 token (the id passed as ``eos_token_id`` to
         ``generate``).
      3. Truncate to a multiple of 7, because codes are consumed in frames
         of 7.
      4. Subtract 128266 so the codes start at 0.

    Args:
        generated_ids: 2-D integer tensor of token ids, ``(batch, seq_len)``.

    Returns:
        A list of Python ints for the first row of the batch. The list is
        empty when fewer than 7 usable tokens remain.
    """
    token_to_find = 128257
    token_to_remove = 128258
    code_token_offset = 128266

    marker_columns = (generated_ids == token_to_find).nonzero(as_tuple=True)[1]
    if marker_columns.numel() > 0:
        last_occurrence_idx = marker_columns[-1].item()
        cropped = generated_ids[:, last_occurrence_idx + 1:]
    else:
        cropped = generated_ids

    # Only the first sample is returned, so only that row is processed.
    row = cropped[0]
    row = row[row != token_to_remove]
    usable_length = (row.size(0) // 7) * 7

    # One vectorised subtraction and a single transfer to Python ints instead
    # of building a list of 0-dim tensors element by element.
    return (row[:usable_length] - code_token_offset).tolist()
def redistribute_codes(code_list, snac_model):
    """Split a flat code list into the three SNAC layers and decode to audio.

    Codes are consumed in frames of 7. Within a frame, the code at position
    ``k`` has ``k * 4096`` subtracted and is routed as follows:

      * position 0          -> layer 1 (one code per frame)
      * positions 1, 4      -> layer 2 (two codes per frame)
      * positions 2, 3, 5, 6 -> layer 3 (four codes per frame)

    Args:
        code_list: Sequence of integer codes. Trailing codes that do not fill
            a complete frame of 7 are ignored.
        snac_model: Module exposing ``parameters()`` and ``decode(codes)``,
            where ``codes`` is a list of three ``(1, n)`` integer tensors.

    Returns:
        The decoded audio as a CPU NumPy array, squeezed of singleton
        dimensions.

    Raises:
        ValueError: If ``code_list`` does not contain one complete frame.
    """
    codes_per_frame = 7
    layer_offset = 4096

    # Tensors are created on whatever device the SNAC model lives on.
    device = next(snac_model.parameters()).device

    # Only whole frames can be decoded. The previous (len + 1) // 7 bound
    # indexed past the end when len % 7 == 6.
    frame_count = len(code_list) // codes_per_frame
    if frame_count == 0:
        raise ValueError(
            "redistribute_codes needs at least 7 codes to decode audio, "
            f"got {len(code_list)}"
        )

    flat = torch.tensor(
        list(code_list[:frame_count * codes_per_frame]),
        dtype=torch.long,
        device=device,
    )
    # Remove the per-position offset (position k carries k * 4096).
    offsets = torch.arange(codes_per_frame, device=device) * layer_offset
    frames = flat.view(frame_count, codes_per_frame) - offsets

    codes = [
        frames[:, 0].unsqueeze(0),
        frames[:, [1, 4]].reshape(-1).unsqueeze(0),
        frames[:, [2, 3, 5, 6]].reshape(-1).unsqueeze(0),
    ]

    # Decoding is inference only; skip autograd bookkeeping.
    with torch.no_grad():
        audio_hat = snac_model.decode(codes)
    return audio_hat.detach().squeeze().cpu().numpy()  # always a CPU array
 @spaces.GPU()
def text_to_speech(text, voice, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=1200):
    """Synthesize speech for one line of text with the given voice.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only text yields
            ``None``.
        voice: Voice name forwarded to ``process_prompt``.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.
        repetition_penalty: Repetition penalty passed to ``model.generate``.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        ``(24000, audio_samples)``, where ``audio_samples`` is the NumPy array
        produced by ``redistribute_codes``, or ``None`` when there is no text
        to speak. Callers that unpack the result must handle the ``None``
        case.

    Raises:
        Exception: Any error from model loading, generation, or decoding is
            logged and re-raised.
    """
    global model, tokenizer, snac_model
    if not text or not text.strip():
        return None
    try:
        # Load lazily: the module-level globals start as None, and nothing in
        # this function guarantees load_model() has already run.
        if model is None or tokenizer is None or snac_model is None:
            load_model()

        input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
        # Generation and decoding are both inference only, so run them all
        # without autograd tracking.
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                num_return_sequences=1,
                eos_token_id=128258,
            )
            code_list = parse_output(generated_ids)
            audio_samples = redistribute_codes(code_list, snac_model)
        return (24000, audio_samples)  # Return sample rate and audio
    except Exception as e:
        logger.error(f"Error in text_to_speech: {str(e)}")
        raise
 @spaces.GPU()
 def render_podcast(api_key, script, voice1, voice2, num_hosts):
     try:
         for i, line in enumerate(lines):
             voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
             try:
+                sample_rate, audio = text_to_speech(line, voice)
                 audio_segments.append(audio)
             except Exception as e:
                 logger.error(f"Error processing audio segment: {str(e)}")
         logger.error(f"Error rendering podcast: {str(e)}")
         raise
+# Gradio Interface
 with gr.Blocks() as demo:
     gr.Markdown("# AI Podcast Generator")