import os
from huggingface_hub import hf_hub_download
import gradio as gr
import pandas as pd
import collections
import scipy.signal
import numpy as np
from functools import partial
from openwakeword.model import Model

from openwakeword.utils import download_models
# Pre-fetch openWakeWord's base models (the shared feature-extraction models
# that every wakeword model depends on)
download_models()

# Download the private model from the HF Model Hub using the secret token
hf_token = os.environ.get("HF_TOKEN")
model_path = hf_hub_download(
    repo_id="JTBTechnology/kmu_wakeword",
    filename="hi_kmu_0721.onnx",   # 改成你模型內的正確檔名
    token=hf_token,
    repo_type="model"
)
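# Note: HF_TOKEN is assumed to be configured as a secret in the Space settings;
# hf_hub_download caches the file locally and returns the cached path.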

# Load the model directly from the downloaded path
model = Model(wakeword_models=[model_path], inference_framework="onnx")
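
# A minimal sketch of scoring one frame directly, assuming a 16 kHz mono int16
# buffer of exactly 1280 samples (the frame size used in process_audio below):
#
#   frame = np.zeros(1280, dtype=np.int16)
#   print(model.predict(frame))   # e.g. {"hi_kmu_0721": 0.0003}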

# Define function to process audio
def process_audio(audio, state=None):
    if state is None:
        state = collections.defaultdict(partial(collections.deque, maxlen=60))

    # Resample audio to 16 kHz if needed
    if audio[0] != 16000:
        data = scipy.signal.resample(audio[1], int(float(audio[1].shape[0]) / audio[0] * 16000))
    else:
        data = audio[1]
    # Get predictions on 1280-sample (80 ms) frames
    for i in range(0, data.shape[0], 1280):
        if data.ndim == 2:
            chunk = data[i:i+1280, 0]  # stereo input: keep just one channel
        else:
            chunk = data[i:i+1280]

        if chunk.shape[0] == 1280:
            prediction = model.predict(chunk)
            for key in prediction:
                # Fill the deque with zeros if it's empty
                if len(state[key]) == 0:
                    state[key].extend(np.zeros(60))
                    
                # Add prediction
                state[key].append(prediction[key])
    
    # Make line plot
    dfs = []
    for key in state.keys():
        df = pd.DataFrame({"x": np.arange(len(state[key])), "y": state[key], "Model": key})
        dfs.append(df)
    
    df = pd.concat(dfs)

    plot = gr.LinePlot(
        value=df,
        x='x',
        y='y',
        color="Model",
        y_lim=(0,1),
        tooltip="Model",
        width=600,
        height=300,
        x_title="Time (frames)",
        y_title="Model Score",
        color_legend_position="bottom"
    )
    # 1. Convert state into a JSON-serializable format (a dict of lists)
    serializable_state = {k: [float(x) for x in v] for k, v in state.items()}

    # 2. Return serializable_state back to Gradio
    return plot, serializable_state
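
# Hypothetical offline call to the handler above (shapes only), assuming two
# seconds of 16 kHz silence; in the running app Gradio streams much shorter
# microphone chunks into this function:
#
#   sr, samples = 16000, np.zeros(32000, dtype=np.int16)
#   plot, scores = process_audio((sr, samples))
#   # scores -> {"hi_kmu_0721": [0.0, ..., 0.0]}  (rolling history, one new
#   # entry per 80 ms frame)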

# Create Gradio interface and launch

desc = """
This is a small demo of the default models in the latest release of [openWakeWord](https://github.com/dscripka/openWakeWord).
Click the "Record" button below to test it directly with your microphone.
Each model's score is drawn as a live line chart; hover over a line to see which model it belongs to.

Each model has its own wake word or command phrase (see the [model docs](https://github.com/dscripka/openWakeWord/tree/main/docs/models) for more).
When the matching phrase is detected, that model's score spikes on the chart. Try the example phrase below:

| Model name    | Suggested phrase        |
| ------------- | ----------------------- |
| hi\_kmu\_0721 | 「嗨,高醫」 ("Hi, KMU") |
"""

gr_int = gr.Interface(
    title="Wake Word Detection Demo",
    description=desc,
    css=".flex {flex-direction: column} .gr-panel {width: 100%}",
    fn=process_audio,
    inputs=[
        gr.Audio(sources=["microphone"], type="numpy", streaming=True, show_label=False), 
        "state"
    ],
    outputs=[
        gr.LinePlot(show_label=False),
        "state"
    ],
    live=True)
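
# With live=True and a streaming Audio input, Gradio calls process_audio
# repeatedly with short microphone chunks; the paired "state" input/output
# threads the rolling score history between those calls.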

gr_int.launch()