Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -104,8 +104,22 @@ def transcribe_audio(audio_file, chunk_length_s=30):
|
|
104 |
if chunk_waveform.shape[0] > 1:
|
105 |
chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
# Process the chunk with the tokenizer
|
108 |
-
processor = processor.add_special_tokens({'pad_token': '[PAD]'})
|
109 |
inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
|
110 |
|
111 |
input_features = inputs.input_features
|
@@ -120,7 +134,10 @@ def transcribe_audio(audio_file, chunk_length_s=30):
|
|
120 |
attention_mask=attention_mask.to(device),
|
121 |
**generate_kwargs
|
122 |
)
|
123 |
-
|
|
|
|
|
|
|
124 |
full_text.append(chunk_text)
|
125 |
|
126 |
# Combine the transcribed text from all chunks
|
|
|
104 |
if chunk_waveform.shape[0] > 1:
|
105 |
chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
|
106 |
|
107 |
+
# Process the chunk with the tokenizer
|
108 |
+
inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
|
109 |
+
|
110 |
+
input_features = inputs.input_features
|
111 |
+
|
112 |
+
# Create attention mask
|
113 |
+
attention_mask = torch.ones(inputs.input_features.shape[:2], dtype=torch.long, device=device)
|
114 |
+
|
115 |
+
# ASR model inference on the chunk
|
116 |
+
with torch.no_grad():
|
117 |
+
generated_ids = model.generate(
|
118 |
+
input_features=input_features.to(device),
|
119 |
+
attention_mask=attention_mask.to(device),
|
120 |
+
**generate_kwargs
|
121 |
+
)
|
122 |
# Process the chunk with the tokenizer
|
|
|
123 |
inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
|
124 |
|
125 |
input_features = inputs.input_features
|
|
|
134 |
attention_mask=attention_mask.to(device),
|
135 |
**generate_kwargs
|
136 |
)
|
137 |
+
|
138 |
+
# new processor object with desired configuration
|
139 |
+
new_processor = processor.add_special_tokens({'pad_token': '[PAD]'})
|
140 |
+
chunk_text = new_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
|
141 |
full_text.append(chunk_text)
|
142 |
|
143 |
# Combine the transcribed text from all chunks
|