import tempfile

import gradio as gr
import torch
import torchaudio
from pydub import AudioSegment, silence
from transformers import WhisperProcessor, WhisperForConditionalGeneration

MODEL_NAME = "islomov/navaistt_v1_medium"

# Load the Whisper processor and model once at startup and move the model
# to the GPU when one is available.
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
def split_on_silence_with_duration_control(audio, min_len, max_len, silence_thresh=-40):
    """Split audio into chunks between min_len and max_len milliseconds,
    preferring to cut at detected silences."""
    silences = silence.detect_silence(audio, min_silence_len=500, silence_thresh=silence_thresh)
    # Collapse each silent interval to its midpoint as a candidate cut point.
    silences = [(start + end) // 2 for start, end in silences]
    chunks = []
    start = 0
    while start < len(audio):
        end = min(start + max_len, len(audio))
        # Prefer the latest silence that still leaves the chunk at least
        # min_len long; otherwise cut hard at max_len (or the end of audio).
        candidates = [s for s in silences if start + min_len <= s <= end]
        split_point = candidates[-1] if candidates else end
        chunks.append(audio[start:split_point])
        start = split_point
    return chunks
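
# Worked example of the splitting logic above (hypothetical numbers): with
# min_len=15000 and max_len=25000, a 60 s clip with silence midpoints at
# 18 s and 40 s is cut at 18 s, then at 40 s, leaving a final 20 s chunk;
# if no silence falls inside a window, the cut lands hard at max_len.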
def transcribe(audio_file):
    # Load audio with pydub, then normalize to mono 16 kHz as Whisper expects.
    audio = AudioSegment.from_file(audio_file)
    if audio.channels > 1:
        audio = audio.set_channels(1)
    if audio.frame_rate != 16000:
        audio = audio.set_frame_rate(16000)

    # Split into 15-25 s chunks at silences so each chunk fits within
    # Whisper's 30 s context window.
    chunks = split_on_silence_with_duration_control(
        audio, min_len=15000, max_len=25000, silence_thresh=-40
    )

    # Transcribe each chunk independently and join the results.
    results = []
    for chunk in chunks:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmpfile:
            chunk.export(tmpfile.name, format="wav")
            waveform, _ = torchaudio.load(tmpfile.name)
        input_features = processor(
            waveform.squeeze().numpy(),
            sampling_rate=16000,
            return_tensors="pt",
        ).input_features.to(device)
        with torch.no_grad():
            # The target language belongs in generate(); the feature
            # extractor call does not take a language argument.
            predicted_ids = model.generate(input_features, language="uz", task="transcribe")
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        results.append(transcription)
    return " ".join(results)
demo = gr.Blocks()

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Audio file"),
    outputs="text",
    title="Navai STT v1 Medium",
    description="Transcribe Uzbek speech to text using the Navai STT v1 Medium model. Upload an audio file to get started. Model: https://huggingface.co/islomov/navaistt_v1_medium",
)

with demo:
    gr.TabbedInterface([file_transcribe], ["Audio file"])

demo.launch()
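
# Usage (a minimal sketch): save as app.py and run `python app.py`, then open
# the local URL Gradio prints; on a Hugging Face Space with the Gradio SDK,
# the script is launched automatically.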