Spaces:

vankienemk
/

Voice-regconizer

Sleeping

File size: 1,523 Bytes

157f27b
a8f06f7
0c2f2f9
 
f8da254
157f27b
0c2f2f9
 
 
e387fff
0c2f2f9
 
 
 
 
 
 
 
 
 
 
e387fff
0c2f2f9
 
 
 
 
 
 
 
 
901f41e
0c2f2f9
 
 
 
 
 
157f27b
0c2f2f9

import gradio as gr
import torch
import torchaudio
from transformers import pipeline
import numpy as np

# Load the Ichigo-whisper speech-recognition pipeline at module import.
model_id = "Menlo/Ichigo-whisper-v0.1"
transcriber = pipeline(
    task="automatic-speech-recognition",
    model=model_id,
)

def transcribe_stream(stream, new_chunk):
    """Append a new audio chunk to the running buffer and transcribe it.

    Args:
        stream: Samples accumulated by earlier calls (mono float32, as
            produced by this function), or ``None`` on the first call.
        new_chunk: ``(sample_rate, samples)`` pair holding the newest audio,
            where ``samples`` is a NumPy array, or ``None`` when no audio
            was supplied.

    Returns:
        A ``(stream, text)`` tuple: the updated buffer and the transcript of
        the whole buffer. ``text`` is ``""`` when there is nothing to
        transcribe.
    """
    # No chunk delivered: keep the buffer untouched instead of failing on
    # the tuple unpack below.
    if new_chunk is None:
        return stream, ""

    # Split the chunk into its sample rate and sample data.
    sr, y = new_chunk

    # Down-mix multi-channel audio to mono.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Peak-normalise this chunk. np.max raises on an empty array, and an
    # all-zero chunk must not be divided by zero, so both are left as-is.
    y = y.astype(np.float32)
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    # Append to the audio collected so far.
    stream = y if stream is None else np.concatenate([stream, y])

    # Nothing buffered yet: skip the model call entirely.
    if stream.size == 0:
        return stream, ""

    # Transcribe the whole accumulated buffer.
    result = transcriber({"sampling_rate": sr, "raw": stream})
    return stream, result["text"]

# Text shown on the Gradio page.
title = "Ichigo Whisper Streaming Demo"
description = """
# 🍓 Ichigo Whisper Streaming Recognition
Nhận dạng giọng nói theo thời gian thực với mô hình Menlo/Ichigo-whisper-v0.1.
"""

# Streaming interface: the "state" slot carries the accumulated audio buffer
# between calls, alongside the microphone input and the transcript textbox.
streaming_demo = gr.Interface(
    title=title,
    description=description,
    fn=transcribe_stream,
    inputs=["state", gr.Audio(sources=["microphone"], streaming=True)],
    outputs=["state", gr.Textbox(label="Phiên âm theo thời gian thực")],
    live=True,
)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    streaming_demo.launch()