camparchimedes committed
Commit 5074632 · verified · 1 Parent(s): 82b4370

Update app.py
Files changed (1): app.py +19 -2
app.py CHANGED

@@ -104,8 +104,22 @@ def transcribe_audio(audio_file, chunk_length_s=30):
         if chunk_waveform.shape[0] > 1:
             chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
 
+        # Process the chunk with the tokenizer
+        inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
+
+        input_features = inputs.input_features
+
+        # Create attention mask
+        attention_mask = torch.ones(inputs.input_features.shape[:2], dtype=torch.long, device=device)
+
+        # ASR model inference on the chunk
+        with torch.no_grad():
+            generated_ids = model.generate(
+                input_features=input_features.to(device),
+                attention_mask=attention_mask.to(device),
+                **generate_kwargs
+            )
         # Process the chunk with the tokenizer
-        processor = processor.add_special_tokens({'pad_token': '[PAD]'})
         inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
 
         input_features = inputs.input_features
@@ -120,7 +134,10 @@ def transcribe_audio(audio_file, chunk_length_s=30):
                 attention_mask=attention_mask.to(device),
                 **generate_kwargs
             )
-        chunk_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        # new processor object with desired configuration
+        new_processor = processor.add_special_tokens({'pad_token': '[PAD]'})
+        chunk_text = new_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
         full_text.append(chunk_text)
 
         # Combine the transcribed text from all chunks
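
Note on the change: on a Hugging Face tokenizer, add_special_tokens() mutates the tokenizer in place and returns the number of tokens that were added (an int), not a new tokenizer or processor. So the removed line processor = processor.add_special_tokens({'pad_token': '[PAD]'}) would, if processor is (or wraps) a standard tokenizer, have replaced processor with an integer and broken every later call on it; by the same logic, new_processor in the added lines would also be an int rather than a decodable object. Below is a minimal sketch of how this decode step is commonly written, assuming a transformers WhisperProcessor-style processor plus the model, device, and generate_kwargs names from the surrounding app.py; the helper name decode_chunk is hypothetical.

import torch

def decode_chunk(model, processor, input_features, attention_mask, device, generate_kwargs):
    """Generate token ids for one audio chunk and decode them to text."""
    with torch.no_grad():
        generated_ids = model.generate(
            input_features=input_features.to(device),
            attention_mask=attention_mask.to(device),
            **generate_kwargs,
        )
    # add_special_tokens() returns an int (the count of tokens added),
    # so its result must never be assigned back to the processor or
    # tokenizer variable; register the pad token in place instead.
    if processor.tokenizer.pad_token is None:
        processor.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    # WhisperProcessor forwards batch_decode to its underlying tokenizer.
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

The sketch registers the pad token at most once and decodes with skip_special_tokens=True to drop Whisper's task and timestamp tokens from the output text; the committed code uses skip_special_tokens=False, which keeps them.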