tahirsher committed on
Commit
2e48e3c
·
verified ·
1 Parent(s): 81653fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -20
app.py CHANGED
@@ -188,34 +188,25 @@ if audio_file:
188
  with open(audio_path, "wb") as f:
189
  f.write(audio_file.read())
190
 
191
- # ✅ Ensure Model Precision Matches Input
192
- if device == "cuda":
193
- model.half() # Use FP16 for speed on GPU
194
- else:
195
- model.float() # Ensure CPU uses FP32
196
-
197
- # ✅ Load and preprocess audio
198
  waveform, sample_rate = torchaudio.load(audio_path)
199
  waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
200
 
201
- # ✅ Convert to input format (Match FP16 for GPU)
202
- input_features = processor(
203
- waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt"
204
- ).input_features.to(device)
205
 
206
- if device == "cuda":
207
- input_features = input_features.half() # ✅ Convert to FP16
208
 
209
- # ✅ Optimized Inference
210
- with torch.inference_mode():
211
  generated_ids = model.generate(
212
- input_features,
213
- max_length=200, # ⏩ Reduced length for speed
214
- num_beams=2, # ⏩ Lower beams for faster decoding
215
- do_sample=False, # ⏩ Disables unnecessary sampling
216
- use_cache=True # ✅ Speeds up processing
217
  )
218
  transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
219
 
 
220
  st.success("📄 Transcription:")
221
  st.write(transcription)
 
 
188
  with open(audio_path, "wb") as f:
189
  f.write(audio_file.read())
190
 
 
 
 
 
 
 
 
191
  waveform, sample_rate = torchaudio.load(audio_path)
192
  waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
193
 
194
+ input_features = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features
 
 
 
195
 
196
+ input_tensor = input_features.to(device)
 
197
 
198
+ # ✅ FIX: Use `generate()` for Proper Transcription
199
+ with torch.no_grad():
200
  generated_ids = model.generate(
201
+ input_tensor,
202
+ max_length=500,
203
+ num_beams=5,
204
+ do_sample=True,
205
+ top_k=50
206
  )
207
  transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
208
 
209
+ # Display transcription
210
  st.success("📄 Transcription:")
211
  st.write(transcription)
212
+