Spaces:

camparchimedes
/

nb

Build error

camparchimedes committed on Aug 22, 2024

Commit

cb06cac

verified ·

1 Parent(s): a086817

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -108,24 +108,13 @@ def transcribe_audio(audio_file, chunk_length_s=30):
         inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
         input_features = inputs.input_features
-# Create attention mask
-attention_mask = torch.ones(inputs.input_features.shape[:2], dtype=torch.long, device=device)
-# ASR model inference on the chunk
-with torch.no_grad():
-    generated_ids = model.generate(
-        input_features=input_features.to(device),
-        attention_mask=attention_mask.to(device),
-        **generate_kwargs
-    )
-        # Process the chunk with the tokenizer
-        inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
-        input_features = inputs.input_features
         # Create attention mask
         attention_mask = torch.ones(inputs.input_features.shape[:2], dtype=torch.long, device=device)
         # ASR model inference on the chunk
         with torch.no_grad():
             generated_ids = model.generate(
@@ -161,9 +150,9 @@ with torch.no_grad():
     "An RTF of less than 1 means the transcription process is faster than real-time (expected)."
     )
     return text, result
 # Clean and preprocess/@summarization
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)

         inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
         input_features = inputs.input_features
         # Create attention mask
         attention_mask = torch.ones(inputs.input_features.shape[:2], dtype=torch.long, device=device)
+        # Check the dimensions and values of the attention mask
+        assert attention_mask.shape == (1, input_features.shape[1]), "Attention mask dimensions do not match the input features."
+        assert (attention_mask.sum().item() == input_features.shape[1]), "Attention mask has incorrect values."
         # ASR model inference on the chunk
         with torch.no_grad():
             generated_ids = model.generate(
     "An RTF of less than 1 means the transcription process is faster than real-time (expected)."
     )
     return text, result
 # Clean and preprocess/@summarization
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)