Spaces:

jsbeaudry
/

oswald-large-v3-turbo-m1

Sleeping

App Files Files Community

jsbeaudry committed on May 31

Commit

ad65f9d

verified ·

1 Parent(s): 1ee5c4a

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -9

app.py CHANGED Viewed

@@ -1,29 +1,57 @@
-from transformers import pipeline
 import gradio as gr
-# Load Whisper model
 print("Loading model...")
-pipe = pipeline(model="jsbeaudry/whisper-medium-oswald")
 print("Model loaded successfully.")
 # Transcription function
 def transcribe(audio):
     if audio is None:
         return "Please upload or record an audio file first."
-    result = pipe(audio)
-    return result["text"]
-# Build Gradio interface
 def create_interface():
     with gr.Blocks(title="Whisper Medium - Haitian Creole") as demo:
         gr.Markdown("# 🎙️ Whisper Medium Creole ASR")
         gr.Markdown(
-            "Upload an audio file or record your voice in Haitian Creole. "
-            "Then click **Transcribe** to see the result."
         )
         with gr.Row():
-            audio_input = gr.Audio(label="🎧 Upload or Record Audio", format="wav")
             transcribe_button = gr.Button("🔍 Transcribe")
             output_text = gr.Textbox(label="📝 Transcribed Text", lines=4)

+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+import librosa
+import torch
 import gradio as gr
+# Load Whisper model and processor
 print("Loading model...")
+processor = AutoProcessor.from_pretrained("jsbeaudry/whisper-medium-oswald")
+model = AutoModelForSpeechSeq2Seq.from_pretrained("jsbeaudry/whisper-medium-oswald")
+model.eval()
+# Set device (GPU if available, else CPU)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
 print("Model loaded successfully.")
 # Transcription function
 def transcribe(audio):
     if audio is None:
         return "Please upload or record an audio file first."
+    # Gradio provides a tuple (sr, data)
+    sr, data = audio
+    # If stereo, convert to mono
+    if len(data.shape) == 2:
+        data = librosa.to_mono(data.T)
+    # Resample to 16kHz if needed
+    if sr != 16000:
+        data = librosa.resample(data, orig_sr=sr, target_sr=16000)
+        sr = 16000
+    # Process audio
+    input_features = processor(data, sampling_rate=sr, return_tensors="pt").input_features.to(device)
+    # Predict
+    with torch.no_grad():
+        predicted_ids = model.generate(input_features)
+    # Decode
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+    return transcription
+# Gradio UI
 def create_interface():
     with gr.Blocks(title="Whisper Medium - Haitian Creole") as demo:
         gr.Markdown("# 🎙️ Whisper Medium Creole ASR")
         gr.Markdown(
+            "Upload or record your voice in Haitian Creole. Then click **Transcribe** to get the text."
         )
         with gr.Row():
+            audio_input = gr.Audio(label="🎧 Upload or Record Audio", type="numpy", format="wav")
             transcribe_button = gr.Button("🔍 Transcribe")
             output_text = gr.Textbox(label="📝 Transcribed Text", lines=4)