Spaces:
Sleeping
Sleeping
File size: 1,711 Bytes
88dc3ba 5d9c950 b2593bc 5d9c950 527e644 b2593bc f47a9e0 78cc121 a1917fb b2593bc f47a9e0 b2593bc 038e82c 7d6796c a1917fb 52fc07d a1917fb 52fc07d a1917fb 52fc07d 0cae98f 78cc121 52fc07d 038e82c 52fc07d 038e82c 52fc07d 038e82c 52fc07d 527e644 b2593bc 038e82c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import subprocess
import sys

# Install runtime dependencies at startup (Hugging Face Spaces pattern).
# Use `sys.executable -m pip` so the packages land in the interpreter that
# is actually running this script, not whatever `pip` is first on PATH.
subprocess.run([sys.executable, "-m", "pip", "install", "gradio", "--upgrade"])
subprocess.run([sys.executable, "-m", "pip", "install", "transformers"])
subprocess.run([sys.executable, "-m", "pip", "install", "torchaudio", "--upgrade"])
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio
import torch
# Load the pretrained Italian wav2vec2 CTC model and its feature/tokenizer
# processor. NOTE(review): downloads weights from the Hugging Face Hub on
# first run, so startup needs network access and can take a while.
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-italian")
model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-italian")
# Function to perform ASR on audio data
def transcribe_audio(audio_data):
    """Transcribe Italian speech from a Gradio audio input.

    Parameters:
        audio_data: ``(sample_rate, waveform)`` tuple as produced by
            ``gr.Audio(type="numpy")`` — the waveform is a numpy array,
            typically int16 PCM, 1-D for mono or 2-D for stereo — or
            ``None`` when no audio was recorded.

    Returns:
        The decoded transcription string, or a human-readable error
        message if the input is malformed or inference fails.
    """
    print("Received audio data:", audio_data)  # Debug print
    if audio_data is None or len(audio_data) != 2:
        return "Invalid audio data format."
    try:
        sample_rate, waveform = audio_data
        # torchaudio transforms require a torch tensor, not the raw
        # numpy array Gradio delivers.
        wav = torch.as_tensor(waveform)
        if wav.is_floating_point():
            wav = wav.to(torch.float32)
        else:
            # Normalize integer PCM (e.g. int16 from the browser) to
            # [-1, 1] — the model expects float audio in that range.
            wav = wav.to(torch.float32) / float(torch.iinfo(torch.as_tensor(waveform).dtype).max)
        # Downmix stereo to mono. NOTE(review): assumes Gradio's 2-D
        # layout is (samples, channels) — confirm against the component.
        if wav.dim() == 2:
            wav = wav.mean(dim=1)
        # The model was trained on 16 kHz audio; resample only if needed.
        if sample_rate != 16000:
            wav = torchaudio.transforms.Resample(sample_rate, 16000)(wav)
        # Small fixed boost, carried over from the original preprocessing.
        wav = torchaudio.functional.gain(wav, gain_db=5.0)
        # Pass the full 1-D waveform (not its first element) and declare
        # the sampling rate the processor should assume.
        input_values = processor(wav, sampling_rate=16000, return_tensors="pt").input_values
        # Perform ASR without tracking gradients.
        with torch.no_grad():
            logits = model(input_values).logits
        # Greedy CTC decode: most likely token per frame, then collapse.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)
        return transcription[0]
    except Exception as e:
        return f"An error occurred: {str(e)}"
# Create Gradio interface.
# type="numpy" makes Gradio hand the callback a (sample_rate, ndarray)
# tuple, which is exactly the format transcribe_audio expects.
audio_input = gr.Audio(type="numpy")
gr.Interface(fn=transcribe_audio, inputs=audio_input, outputs="text").launch()
|