Spaces:
Sleeping
Sleeping
File size: 1,632 Bytes
712d204 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
from transformers import SeamlessM4Tv2Model, AutoProcessor
import numpy as np
import torch
from pydub import AudioSegment
# Load the processor and model for the "facebook/seamless-m4t-v2-large"
# checkpoint once, at import time. translate_audio() reads these two
# module-level names on every call instead of reloading them.
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
def translate_audio(audio_file) -> str:
    """Translate the speech in a WAV audio file into English text.

    The audio is decoded with pydub, converted to 16 kHz mono, scaled to
    float32 relative to its own full-scale amplitude, and passed through the
    module-level ``processor`` and ``model`` with ``tgt_lang="eng"`` and
    speech generation disabled.

    Args:
        audio_file: The source handed to ``AudioSegment.from_file`` (decoded
            with ``format="wav"``), or ``None`` when no audio was supplied.

    Returns:
        The decoded translation of the first generated sequence on success.
        If ``audio_file`` is ``None``, or if any ``Exception`` is raised
        while loading or translating, a human-readable message string is
        returned instead of raising, so the caller can display it directly.
    """
    if audio_file is None:
        return "No audio file detected. Please try again."

    target_rate = 16000  # sampling rate passed to the processor below

    try:
        # Use the GPU when available; model.to() is a no-op if the model is
        # already on that device.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        # Decode the audio and normalise it to 16 kHz mono.
        audio = AudioSegment.from_file(audio_file, format="wav")
        audio = audio.set_frame_rate(target_rate).set_channels(1)

        # Scale integer PCM to float32 using the segment's actual sample
        # width. A hard-coded 32768.0 is only right for 16-bit audio: 8-bit
        # input would come out far too quiet and 32-bit input would land far
        # outside [-1, 1]. For 16-bit audio this is exactly 32768.0.
        full_scale = float(1 << (8 * audio.sample_width - 1))
        samples = np.array(audio.get_array_of_samples())
        audio_array = samples.astype(np.float32) / full_scale

        # Build the model inputs and move every tensor to the model's device.
        audio_inputs = processor(
            audios=audio_array,
            sampling_rate=target_rate,
            return_tensors="pt",
        )
        audio_inputs = {key: val.to(device) for key, val in audio_inputs.items()}

        # Text-only generation (no speech synthesis), targeting English.
        output_tokens = model.generate(
            **audio_inputs,
            tgt_lang="eng",
            generate_speech=False,
        )

        # Decode the generated token ids; only the first sequence is returned.
        token_ids = output_tokens.sequences
        return processor.batch_decode(token_ids, skip_special_tokens=True)[0]
    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        return f"Error during audio translation: {e}"
|