Spaces:

vankienemk
/

Voice-regconizer

Running

App Files Files Community

vankienemk committed on Apr 16

Commit

9b6c3a3

verified ·

1 Parent(s): ab43d17

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -39

app.py CHANGED Viewed

@@ -1,58 +1,89 @@
 import gradio as gr
 import torch
 import torchaudio
-from transformers import pipeline
 import numpy as np
-# Tải mô hình Ichigo-whisper
-model_id = "Menlo/Ichigo-whisper-v0.1"
-transcriber = pipeline("automatic-speech-recognition", model=model_id)
-def transcribe_stream(stream, new_chunk):
-    """Append a new audio chunk to the running stream and transcribe all of it.
-
-    Args:
-        stream: Audio accumulated by earlier calls, or ``None`` when there is
-            no previous audio.
-        new_chunk: A ``(sample_rate, samples)`` pair, unpacked below.
-
-    Returns:
-        A ``(stream, text)`` tuple: the updated accumulated audio and the
-        ``"text"`` field of the transcriber result for that audio.
-    """
-    # Extract the sample rate and the audio data
-    sr, y = new_chunk
-    # Convert to mono if the chunk is stereo (more than one dimension)
-    if y.ndim > 1:
-        y = y.mean(axis=1)
-    # Normalize the audio: peak-scale this chunk on its own; the `else 1.0`
-    # avoids dividing by zero when the chunk is all zeros
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y)) if np.max(np.abs(y)) > 0 else 1.0
-    # Concatenate with the previously received audio
-    if stream is not None:
-        stream = np.concatenate([stream, y])
-    else:
-        stream = y
-    # Run the prediction on the whole accumulated audio, not only the new chunk
-    result = transcriber({"sampling_rate": sr, "raw": stream})
-    return stream, result["text"]
 # Tạo giao diện Gradio
-title = "Ichigo Whisper Streaming Demo"
 description = """
-# 🍓 Ichigo Whisper Streaming Recognition
-Nhận dạng giọng nói theo thời gian thực với mô hình Menlo/Ichigo-whisper-v0.1.
 """
-# Tạo giao diện streaming
-streaming_demo = gr.Interface(
-    fn=transcribe_stream,
-    inputs=[
-        "state",
-        gr.Audio(sources=["microphone"], streaming=True)
-    ],
-    outputs=[
-        "state",
-        gr.Textbox(label="Phiên âm theo thời gian thực")
-    ],
-    live=True,
     title=title,
     description=description
 )
 # Khởi chạy ứng dụng
 if __name__ == "__main__":
-    streaming_demo.launch()

 import gradio as gr
 import torch
 import torchaudio
 import numpy as np
+from ichigo_asr.demo.utils import load_model
+# Hàm tải mô hình Ichigo Whisper
+def init_model():
+    # Tải Ichigo Whisper
+    try:
+        ichigo_model = load_model(
+            ref="homebrewltd/ichigo-whisper:merge-medium-vi-2d-2560c-dim64.pth",
+            size="merge-medium-vi-2d-2560c-dim64",
+        )
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        ichigo_model.ensure_whisper(device)
+        ichigo_model.to(device)
+        return ichigo_model, device
+    except Exception as e:
+        print(f"Lỗi khi tải mô hình: {e}")
+        return None, "cpu"
+# Initialize the model when the application starts (runs at module import)
+ichigo_model, device = init_model()
+def transcribe(audio_path):
+    if ichigo_model is None:
+        return "Không thể tải mô hình. Vui lòng kiểm tra logs."
+    try:
+        # Tải file âm thanh
+        wav, sr = torchaudio.load(audio_path)
+        # Chuyển đổi sang 16kHz nếu cần
+        if sr != 16000:
+            wav = torchaudio.functional.resample(wav, sr, 16000)
+        # Chuyển đổi sang mono nếu là stereo
+        if wav.shape[0] > 1:
+            wav = wav.mean(dim=0, keepdim=True)
+        # Thực hiện dự đoán
+        transcribe_result = ichigo_model.inference(wav.to(device))
+        # Trả về kết quả
+        return transcribe_result[0].text
+    except Exception as e:
+        return f"Lỗi khi nhận dạng giọng nói: {str(e)}"
 # Build the Gradio interface
+title = "Ichigo Whisper Speech Recognition Demo"
 description = """
+# 🍓 Ichigo Whisper Speech Recognition
+Sử dụng mô hình Ichigo-whisper để nhận dạng giọng nói.
+Mô hình này có hiệu suất tốt cho cả tiếng Anh và tiếng Việt!
+## Cách sử dụng:
+1. Nhấn vào nút microphone và nói
+2. Hoặc tải lên file audio
+3. Mô hình sẽ chuyển đổi giọng nói thành văn bản
+Chi tiết về mô hình: [Menlo/Ichigo-whisper-v0.1](https://huggingface.co/Menlo/Ichigo-whisper-v0.1)
 """
+# Build the interface with two tabs: Microphone and Upload
+# (both tabs call the same `transcribe` function and share title/description)
+mic_transcribe = gr.Interface(
+    fn=transcribe,
+    inputs=gr.Audio(sources="microphone", type="filepath"),
+    outputs=gr.Textbox(label="Phiên âm"),
     title=title,
     description=description
 )
+file_transcribe = gr.Interface(
+    fn=transcribe,
+    inputs=gr.Audio(sources="upload", type="filepath"),
+    outputs=gr.Textbox(label="Phiên âm"),
+    title=title,
+    description=description
+)
+# Combine the tabs
+demo = gr.TabbedInterface(
+    [mic_transcribe, file_transcribe],
+    ["Microphone", "Upload Audio"]
+)
 # Launch the app
 if __name__ == "__main__":
+    demo.launch()