Spaces:

Athspi
/

Ai-audio

Sleeping

App Files Community

Athspi committed on Jan 11

Commit

5989272

verified ·

1 Parent(s): 3e73331

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -7

app.py CHANGED Viewed

@@ -123,10 +123,14 @@ LANGUAGE_NAME_TO_CODE = {
     "Sundanese": "su",
 }
-# Load the fine-tuned Sinhala model and processor
 processor = AutoProcessor.from_pretrained(SINHALA_MODEL)
 sinhala_model = AutoModelForSpeechSeq2Seq.from_pretrained(SINHALA_MODEL)
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
     """Transcribe the audio file."""
     # Load the appropriate model
@@ -153,14 +157,14 @@ def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faste
             raw_audio = np.array(raw_audio, dtype=np.float32)
             # Process the audio and generate transcription
-            inputs = model_processor(raw_audio, return_tensors="pt", sampling_rate=16000)
             with torch.no_grad():
-                generated_ids = model.generate(inputs.input_features)
             transcription = model_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
             detected_language = "si"
         else:
             # Use Whisper for auto-detection
-            result = model.transcribe(processed_audio_path, fp16=False)
             transcription = result["text"]
             detected_language = result.get("language", "unknown")
     else:
@@ -170,15 +174,15 @@ def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faste
             raw_audio = np.array(raw_audio, dtype=np.float32)
             # Process the audio and generate transcription
-            inputs = model_processor(raw_audio, return_tensors="pt", sampling_rate=16000)
             with torch.no_grad():
-                generated_ids = model.generate(inputs.input_features)
             transcription = model_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
             detected_language = "si"
         else:
             # Use Whisper for transcription with the selected language
             language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")
-            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
             transcription = result["text"]
             detected_language = language_code

     "Sundanese": "su",
 }
+# Preload the fine-tuned Sinhala model and processor
 processor = AutoProcessor.from_pretrained(SINHALA_MODEL)
 sinhala_model = AutoModelForSpeechSeq2Seq.from_pretrained(SINHALA_MODEL)
+# Move model to GPU if available
+device = "cuda" if torch.cuda.is_available() else "cpu"
+sinhala_model.to(device)
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
     """Transcribe the audio file."""
     # Load the appropriate model
             raw_audio = np.array(raw_audio, dtype=np.float32)
             # Process the audio and generate transcription
+            inputs = model_processor(raw_audio, return_tensors="pt", sampling_rate=16000).input_features.to(device)
             with torch.no_grad():
+                generated_ids = model.generate(inputs)
             transcription = model_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
             detected_language = "si"
         else:
             # Use Whisper for auto-detection
+            result = model.transcribe(processed_audio_path, fp16=(device == "cuda"))
             transcription = result["text"]
             detected_language = result.get("language", "unknown")
     else:
             raw_audio = np.array(raw_audio, dtype=np.float32)
             # Process the audio and generate transcription
+            inputs = model_processor(raw_audio, return_tensors="pt", sampling_rate=16000).input_features.to(device)
             with torch.no_grad():
+                generated_ids = model.generate(inputs)
             transcription = model_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
             detected_language = "si"
         else:
             # Use Whisper for transcription with the selected language
             language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")
+            result = model.transcribe(processed_audio_path, language=language_code, fp16=(device == "cuda"))
             transcription = result["text"]
             detected_language = language_code