Michael Natanael committed
Commit 268f7eb · 1 Parent(s): 7c09bf0

change whisper_open_ai to faster_whisper

Files changed (2):
  1. app.py +56 -80
  2. requirements.txt +4 -3
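
This commit replaces the transformers Whisper pipeline with faster-whisper, a CTranslate2 reimplementation that runs the same "large-v3" weights with INT8 quantization on CPU, trading the GPU pipeline's batched throughput for a much smaller memory footprint. A minimal sketch of the API the commit adopts (the audio path is a placeholder; note that transcribe() returns a lazy generator of segments, so the text only exists once the generator is consumed):

    from faster_whisper import WhisperModel

    # CPU + INT8, the same configuration the commit ships.
    model = WhisperModel("large-v3", device="cpu", compute_type="int8")

    # transcribe() is lazy: decoding happens while iterating the generator.
    segments, info = model.transcribe("audio.mp3", language="id", beam_size=1)
    print(info.language, info.language_probability)
    text = " ".join(segment.text.strip() for segment in segments)
    print(text)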
app.py CHANGED
@@ -1,5 +1,5 @@
 from flask import Flask, render_template, request
-# import whisper
+from faster_whisper import WhisperModel
 import tempfile
 import os
 import time
@@ -7,7 +7,7 @@ import torch
 import numpy as np
 import requests
 from tqdm import tqdm
-from transformers import BertTokenizer, AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+from transformers import BertTokenizer
 from model.multi_class_model import MultiClassModel  # Adjust if needed
 
 app = Flask(__name__)
@@ -49,38 +49,58 @@ model = MultiClassModel.load_from_checkpoint(
 )
 model.eval()
 
-# === INITIAL SETUP: Whisper Pipeline ===
-# https://huggingface.co/openai/whisper-large-v3
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-
-model_id = "openai/whisper-large-v3"
-
-whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-)
-whisper_model.to(device)
-
-processor = AutoProcessor.from_pretrained(model_id)
-
-pipe = pipeline(
-    "automatic-speech-recognition",
-    model=whisper_model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    chunk_length_s=30,
-    batch_size=128,  # batch size for inference - set based on your device
-    torch_dtype=torch_dtype,
-    device=device,
-    max_new_tokens=128,  # Limit text generation
-    return_timestamps=False,  # Save memory
-)
-
-
-def whisper_api(temp_audio_path):
-    result = pipe(temp_audio_path, generate_kwargs={"language": "indonesian", "task": "transcribe"})
-    print(result["text"])
-    return result
+# === INITIAL SETUP: Faster Whisper ===
+# https://github.com/SYSTRAN/faster-whisper
+faster_whisper_model_size = "large-v3"
+
+# Run on GPU with FP16
+# faster_whisper_model = WhisperModel(faster_whisper_model_size, device="cuda", compute_type="float16")
+# or run on GPU with INT8
+# faster_whisper_model = WhisperModel(faster_whisper_model_size, device="cuda", compute_type="int8_float16")
+# or run on CPU with INT8
+faster_whisper_model = WhisperModel(faster_whisper_model_size, device="cpu", compute_type="int8")
+
+
+def faster_whisper(temp_audio_path):
+    segments, info = faster_whisper_model.transcribe(
+        temp_audio_path,
+        language="id",
+        beam_size=1  # Lower beam_size is faster but may miss words
+    )
+
+    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
+
+    # segments is a lazy generator; collect every chunk so the full
+    # transcription is returned, not just the final segment.
+    texts = []
+    for segment in segments:
+        print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
+        texts.append(segment.text.strip())
+
+    return " ".join(texts)
+
+
+def bert_predict(input_lyric):
+    encoding = tokenizer.encode_plus(
+        input_lyric,
+        add_special_tokens=True,
+        max_length=512,
+        return_token_type_ids=True,
+        padding="max_length",
+        return_attention_mask=True,
+        return_tensors='pt',
+    )
+
+    with torch.no_grad():
+        prediction = model(
+            encoding["input_ids"],
+            encoding["attention_mask"],
+            encoding["token_type_ids"]
+        )
+
+    logits = prediction
+    probabilities = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
+    predicted_class = np.argmax(probabilities)
+    predicted_label = AGE_LABELS[predicted_class]
+
+    prob_results = [(label, f"{prob:.4f}") for label, prob in zip(AGE_LABELS, probabilities)]
+    return predicted_label, prob_results
 
 
 # === ROUTES ===
@@ -108,35 +128,11 @@ def transcribe():
        temp_audio_path = temp_audio.name
 
    # Step 1: Transcribe
-   # transcription = whisper_model.transcribe(temp_audio_path, language="id")
-   transcription = whisper_api(temp_audio_path)
+   transcribed_text = faster_whisper(temp_audio_path)
    os.remove(temp_audio_path)
-   transcribed_text = transcription["text"]
 
    # Step 2: BERT Prediction
-   encoding = tokenizer.encode_plus(
-       transcribed_text,
-       add_special_tokens=True,
-       max_length=512,
-       return_token_type_ids=True,
-       padding="max_length",
-       return_attention_mask=True,
-       return_tensors='pt',
-   )
-
-   with torch.no_grad():
-       prediction = model(
-           encoding["input_ids"],
-           encoding["attention_mask"],
-           encoding["token_type_ids"]
-       )
-
-   logits = prediction
-   probabilities = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
-   predicted_class = np.argmax(probabilities)
-   predicted_label = AGE_LABELS[predicted_class]
-
-   prob_results = [(label, f"{prob:.4f}") for label, prob in zip(AGE_LABELS, probabilities)]
+   predicted_label, prob_results = bert_predict(transcribed_text)
 
    # Stop timer
    end_time = time.time()
@@ -167,28 +163,8 @@ def predict_text():
    # Start timer
    start_time = time.time()
 
-   encoding = tokenizer.encode_plus(
-       user_lyrics,
-       add_special_tokens=True,
-       max_length=512,
-       return_token_type_ids=True,
-       padding="max_length",
-       return_attention_mask=True,
-       return_tensors='pt',
-   )
-
-   with torch.no_grad():
-       prediction = model(
-           encoding["input_ids"],
-           encoding["attention_mask"],
-           encoding["token_type_ids"]
-       )
-
-   logits = prediction
-   probabilities = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
-   predicted_class = np.argmax(probabilities)
-   predicted_label = AGE_LABELS[predicted_class]
-   prob_results = [(label, f"{prob:.4f}") for label, prob in zip(AGE_LABELS, probabilities)]
+   # Step 1: BERT Prediction
+   predicted_label, prob_results = bert_predict(user_lyrics)
 
    # End timer
    end_time = time.time()
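
The new setup hard-codes the CPU/INT8 variant and leaves the GPU configurations as comments. A hedged sketch of a runtime switch, assuming CUDA plus the cuBLAS/cuDNN libraries that CTranslate2 needs are present whenever a GPU is detected:

    import torch
    from faster_whisper import WhisperModel

    # Prefer FP16 on GPU when available; otherwise fall back to INT8 on CPU,
    # which is the configuration the commit ships.
    if torch.cuda.is_available():
        faster_whisper_model = WhisperModel("large-v3", device="cuda", compute_type="float16")
    else:
        faster_whisper_model = WhisperModel("large-v3", device="cpu", compute_type="int8")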
requirements.txt CHANGED
@@ -7,12 +7,13 @@ Jinja2==2.11.3
 MarkupSafe==1.1.1
 SQLAlchemy==1.3.22
 Werkzeug==1.0.1
-openai-whisper
-setuptools-rust
+faster_whisper
+# openai-whisper
+# setuptools-rust
 # ffmpeg
 # ffmpeg-python
 # imageio[ffmpeg]
-accelerate
+# accelerate
 pytorch-lightning==2.2.1
 lightning==2.4.0
 torch==2.2.0
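
pip normalizes package names, so the underscored faster_whisper entry resolves to the PyPI project faster-whisper; since the entry is unpinned, installs will float to the latest release. A quick post-install smoke test (a sketch, run after pip install -r requirements.txt):

    from importlib.metadata import version

    # Confirms the dependency resolved and reports which release was installed.
    print(version("faster-whisper"))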