# NOTE(review): removed web-page scrape residue ("Spaces: / Running / Running")
# that preceded the code so this file parses as Python.
import numpy as np
import torch
from transformers import VitsModel, AutoTokenizer
class AudioGenerator:
    """Text-to-speech generator backed by the facebook/mms-tts-eng VITS model."""

    def __init__(self):
        """Load the MMS-TTS tokenizer and model, moving the model to GPU if available."""
        print("Initializing Audio Generator...")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        # Load model and tokenizer
        self.model_name = "facebook/mms-tts-eng"
        print(f"Loading model {self.model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = VitsModel.from_pretrained(self.model_name).to(self.device)
        # Inference-only usage: switch off dropout/batch-norm training behavior.
        self.model.eval()
        print(f"Model loaded and moved to {self.device}")

    def generate_audio(self, text, voice_preset=None):
        """
        Generate audio from text using the MMS-TTS model.

        Args:
            text (str): The text to convert to speech. Must be non-empty.
            voice_preset (str): Not used in this implementation; kept for
                interface compatibility with callers that pass it.

        Returns:
            tuple: (audio_array, sample_rate) where audio_array is a 1-D
            np.int16 PCM array and sample_rate is taken from the model config.

        Raises:
            ValueError: If ``text`` is empty or whitespace-only.
            RuntimeError: If tokenization or model inference fails.
        """
        if not text or not text.strip():
            # Fail fast rather than feeding the model an empty token sequence.
            raise ValueError("text must be a non-empty string")
        try:
            print(f"Generating audio on {self.device}...")
            # Tokenize the input text and move tensors to the model's device.
            inputs = self.tokenizer(text, return_tensors="pt").to(self.device)
            with torch.no_grad():  # inference only; skip autograd bookkeeping
                output = self.model(**inputs).waveform
            # Move to CPU and drop the batch dimension.
            audio = output.cpu().numpy().squeeze()
            # Clip before casting: float samples slightly outside [-1, 1]
            # would otherwise wrap around when converted to int16.
            audio = np.clip(audio, -1.0, 1.0)
            audio = (audio * 32767).astype(np.int16)
            return audio, self.model.config.sampling_rate
        except Exception as e:
            # Bug fix: the original returned an error *string* here, breaking
            # callers that unpack the documented (audio, rate) tuple. Re-raise
            # with context instead so the failure is explicit and traceable.
            raise RuntimeError(f"Error generating audio: {e}") from e