mrmuminov committed
Commit c525cff · verified · Parent: 7e52dc0

Update app.py

Files changed (1):
  1. app.py: +50 -31

app.py CHANGED
@@ -1,5 +1,7 @@
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import gradio as gr
+from pydub import AudioSegment, silence
+import tempfile
 import torch
 import torchaudio
 
@@ -8,40 +10,57 @@ MODEL_NAME = "islomov/navaistt_v1_medium"
 processor = WhisperProcessor.from_pretrained(MODEL_NAME)
 model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
 
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = model.to(device)
+
+def split_on_silence_with_duration_control(audio, min_len, max_len, silence_thresh=-40):
+    silences = silence.detect_silence(audio, min_silence_len=500, silence_thresh=silence_thresh)
+    silences = [((start + end) // 2) for start, end in silences]
+
+    chunks = []
+    start = 0
+    while start < len(audio):
+        end = min(start + max_len, len(audio))
+        candidates = [s for s in silences if start + min_len <= s <= end]
+        split_point = candidates[-1] if candidates else end
+        chunks.append(audio[start:split_point])
+        start = split_point
+    return chunks
 
 def transcribe(audio_file):
+    # Load audio using pydub
+    audio = AudioSegment.from_file(audio_file)
+
+    # Convert to mono and 16kHz if needed
+    if audio.channels > 1:
+        audio = audio.set_channels(1)
+    if audio.frame_rate != 16000:
+        audio = audio.set_frame_rate(16000)
+
+    # Detect silent chunks
+    chunks = split_on_silence_with_duration_control(
+        audio, min_len=15000, max_len=25000, silence_thresh=-40
+    )
+
+    # Transcribe each chunk
+    results = []
+    for chunk in chunks:
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmpfile:
+            chunk.export(tmpfile.name, format="wav")
+            waveform, _ = torchaudio.load(tmpfile.name)
+            input_features = processor(
+                waveform.squeeze().numpy(),
+                sampling_rate=16000,
+                return_tensors="pt",
+                language="uz"
+            ).input_features.to(device)
+
+            with torch.no_grad():
+                predicted_ids = model.generate(input_features)
+            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+            results.append(transcription)
 
+    return " ".join(results)
-    global model
-    global processor
-
-    # Move to GPU if available
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    model = model.to(device)
-
-    # Load and preprocess audio
-    waveform, sample_rate = torchaudio.load(audio_file)
-    if sample_rate != 16000:
-        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
-
-    # Convert to mono if needed
-    if waveform.shape[0] > 1:
-        waveform = waveform.mean(dim=0, keepdim=True)
-
-    # Process audio
-    input_features = processor(
-        waveform.squeeze().numpy(),
-        sampling_rate=16000,
-        return_tensors="pt",
-        language="uz"
-    ).input_features.to(device)
-
-    # Generate transcription
-    with torch.no_grad():
-        predicted_ids = model.generate(input_features)
-
-    # Decode
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-    return transcription
 
 demo = gr.Blocks()
 
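The new split_on_silence_with_duration_control keeps every chunk between min_len and max_len milliseconds, cutting at the midpoint of a detected silence whenever one falls inside that window and hard-cutting at max_len otherwise. A minimal standalone sketch of that behaviour on synthetic audio follows (pydub only; the 9 s tone / 1 s gap pattern and the print at the end are made up for illustration and are not part of the Space):

# Standalone illustration of the chunking logic added in this commit.
# The synthetic tone/silence pattern below is an assumption made for the demo.
from pydub import AudioSegment, silence
from pydub.generators import Sine

def split_on_silence_with_duration_control(audio, min_len, max_len, silence_thresh=-40):
    # Midpoints of detected silences (in ms) are the candidate split points.
    silences = silence.detect_silence(audio, min_silence_len=500, silence_thresh=silence_thresh)
    silences = [((start + end) // 2) for start, end in silences]

    chunks = []
    start = 0
    while start < len(audio):
        end = min(start + max_len, len(audio))
        # Latest silence that still keeps this chunk between min_len and max_len.
        candidates = [s for s in silences if start + min_len <= s <= end]
        split_point = candidates[-1] if candidates else end
        chunks.append(audio[start:split_point])
        start = split_point
    return chunks

# 60 s of synthetic audio: 9 s of 440 Hz tone followed by 1 s of silence, six times over.
tone = Sine(440).to_audio_segment(duration=9000)
gap = AudioSegment.silent(duration=1000, frame_rate=44100)
audio = (tone + gap) * 6

chunks = split_on_silence_with_duration_control(audio, min_len=15000, max_len=25000)
print([round(len(c) / 1000, 1) for c in chunks])  # chunk lengths in seconds

On this input the split points land on the silent gaps, so each chunk stays well under Whisper's 30-second input window, with only a short remainder at the end.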
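The second hunk ends at demo = gr.Blocks(); the UI wiring itself is unchanged by this commit and therefore not shown in the diff. Purely for orientation, a generic Blocks wiring for a filepath-in, text-out function could look like the sketch below; the component names, labels, and layout are assumptions, not the Space's actual code.

# Hypothetical UI wiring (not part of this commit): a minimal Gradio Blocks layout
# around a transcribe(audio_file) -> str function such as the one defined above.
import gradio as gr

demo = gr.Blocks()
with demo:
    audio_in = gr.Audio(type="filepath", label="Audio (Uzbek speech)")
    text_out = gr.Textbox(label="Transcription")
    gr.Button("Transcribe").click(fn=transcribe, inputs=audio_in, outputs=text_out)

demo.launch()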