camparchimedes committed
Commit 5074632 · verified · 1 Parent(s): 82b4370

Update app.py
Files changed (1): app.py +19 -2
app.py CHANGED

@@ -104,8 +104,22 @@ def transcribe_audio(audio_file, chunk_length_s=30):
         if chunk_waveform.shape[0] > 1:
             chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
 
+        # Process the chunk with the tokenizer
+        inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
+
+        input_features = inputs.input_features
+
+        # Create attention mask
+        attention_mask = torch.ones(inputs.input_features.shape[:2], dtype=torch.long, device=device)
+
+        # ASR model inference on the chunk
+        with torch.no_grad():
+            generated_ids = model.generate(
+                input_features=input_features.to(device),
+                attention_mask=attention_mask.to(device),
+                **generate_kwargs
+            )
         # Process the chunk with the tokenizer
-        processor = processor.add_special_tokens({'pad_token': '[PAD]'})
         inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
 
         input_features = inputs.input_features
@@ -120,7 +134,10 @@ def transcribe_audio(audio_file, chunk_length_s=30):
                 attention_mask=attention_mask.to(device),
                 **generate_kwargs
             )
-        chunk_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        # new processor object with desired configuration
+        new_processor = processor.add_special_tokens({'pad_token': '[PAD]'})
+        chunk_text = new_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
         full_text.append(chunk_text)
 
         # Combine the transcribed text from all chunks
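
Note on the change: on a Hugging Face tokenizer, add_special_tokens() mutates the tokenizer in place and returns the number of tokens that were added (an int), not a new tokenizer or processor. So the removed line processor = processor.add_special_tokens({'pad_token': '[PAD]'}) would, if processor is (or wraps) a standard tokenizer, have replaced processor with an integer and broken every later call on it; by the same logic, new_processor in the added lines would also be an int rather than a decodable object. Below is a minimal sketch of how this decode step is commonly written, assuming a transformers WhisperProcessor-style processor plus the model, device, and generate_kwargs names from the surrounding app.py; the helper name decode_chunk is hypothetical.

import torch

def decode_chunk(model, processor, input_features, attention_mask, device, generate_kwargs):
    """Generate token ids for one audio chunk and decode them to text."""
    with torch.no_grad():
        generated_ids = model.generate(
            input_features=input_features.to(device),
            attention_mask=attention_mask.to(device),
            **generate_kwargs,
        )
    # add_special_tokens() returns an int (the count of tokens added),
    # so its result must never be assigned back to the processor or
    # tokenizer variable; register the pad token in place instead.
    if processor.tokenizer.pad_token is None:
        processor.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    # WhisperProcessor forwards batch_decode to its underlying tokenizer.
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

The sketch registers the pad token at most once and decodes with skip_special_tokens=True to drop Whisper's task and timestamp tokens from the output text; the committed code uses skip_special_tokens=False, which keeps them.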