import torch
import numpy as np
from transformers import VitsModel, AutoTokenizer


class AudioGenerator:
    def __init__(self):
        print("Initializing Audio Generator...")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Load model and tokenizer
        self.model_name = "facebook/mms-tts-eng"
        print(f"Loading model {self.model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = VitsModel.from_pretrained(self.model_name).to(self.device)
        print(f"Model loaded and moved to {self.device}")

    def generate_audio(self, text, voice_preset=None):
        """
        Generate audio from text using the MMS-TTS model.

        Args:
            text (str): The text to convert to speech.
            voice_preset (str): Not used in this implementation.

        Returns:
            tuple: (audio_array, sample_rate), where audio_array is a
            16-bit PCM numpy array and sample_rate is in Hz.

        Raises:
            RuntimeError: If audio generation fails.
        """
        try:
            print(f"Generating audio on {self.device}...")

            # Tokenize the input text and move the tensors to the model's device
            inputs = self.tokenizer(text, return_tensors="pt").to(self.device)

            # Run inference without tracking gradients
            with torch.no_grad():
                output = self.model(**inputs).waveform

            # Convert the float waveform to a 16-bit PCM numpy array
            audio = output.cpu().numpy().squeeze()
            audio = (audio * 32767).astype(np.int16)

            return audio, self.model.config.sampling_rate
        except Exception as e:
            # Raise instead of returning a string, so callers always receive
            # the (audio, sample_rate) tuple on success
            raise RuntimeError(f"Error generating audio: {e}") from e
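

# Minimal usage sketch (an illustrative addition, not part of the class above):
# instantiates the generator, synthesizes a short sentence, and writes the
# result to a WAV file. Assumes scipy is installed for WAV export; the sample
# text and "output.wav" filename are placeholders.
if __name__ == "__main__":
    from scipy.io import wavfile

    generator = AudioGenerator()
    audio, sample_rate = generator.generate_audio("Hello from the audio generator.")

    # audio is already int16 PCM, so it can be written directly
    wavfile.write("output.wav", sample_rate, audio)
    print(f"Saved output.wav ({len(audio) / sample_rate:.2f} s at {sample_rate} Hz)")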