File size: 1,936 Bytes
5124d5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import gradio as gr
import librosa
import torch

from src import CandidateGenerator

# Run on the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Single module-level generator instance, constructed once for the chosen device.
candidate_generator = CandidateGenerator(device)


def process_audio_text(audio_file, text, num_candidates):
    """Generate reading candidates for an audio clip and its transcript.

    Args:
        audio_file: Path to the audio file, or ``None`` when no audio was given.
        text: Transcript of the audio. ``None``, empty, or whitespace-only
            text is treated as missing.
        num_candidates: Number of candidates to request; converted with
            ``int()`` before being passed to the generator.

    Returns:
        A 3-tuple of strings ``(candidates, hubert_prediction, debug)``:
        one formatted line per candidate, a one-line summary of the HuBERT
        prediction, and ``repr()`` of the raw generator result. When either
        input is missing, the first element holds the validation message and
        the other two are empty strings.
    """
    # Treat a missing (None) transcript like a blank one instead of failing
    # on ``None.strip()``.
    if audio_file is None or text is None or not text.strip():
        return "音声とテキストの両方が必要です。", "", ""

    # Load the audio at 16 kHz; the returned sample rate is not needed.
    audio_16khz, _ = librosa.load(audio_file, sr=16000)

    results = candidate_generator.generate(text, audio_16khz, int(num_candidates))

    # One summary line per candidate, in the order the generator returned them.
    candidates_str = "\n".join(
        f"mecab_cost: {candidate['mecab_cost']}, "
        f"ctc_loss: {candidate['ctc_loss']:.3f}, "
        f"phonemes: {' '.join(candidate['phonemes'])}"
        for candidate in results["candidates"]
    )

    hubert_pred = results["hubert_prediction"]
    hubert_output = (
        f"ctc_loss: {hubert_pred['ctc_loss']:.3f}, "
        f"phonemes: {' '.join(hubert_pred['phonemes'])}"
    )

    # Raw dump of everything the generator returned, for inspection in the UI.
    debug_output = repr(results)

    return candidates_str, hubert_output, debug_output


# Input widgets, listed in the same order as process_audio_text's parameters:
# audio file path, transcript text, number of candidates.
_INPUT_COMPONENTS = [
    gr.Audio(type="filepath", label="音声ファイル"),
    gr.Textbox(
        label="テキスト",
        placeholder="漢字仮名交じりのテキストを入力してください",
    ),
    gr.Slider(minimum=1, maximum=20, value=10, step=1, label="MeCab の候補数"),
]

# Output widgets, listed in the same order as the tuple process_audio_text
# returns: candidate lines, HuBERT prediction, raw result dump.
_OUTPUT_COMPONENTS = [
    gr.Textbox(label="候補", lines=10),
    gr.Textbox(label="HuBERT による予測", lines=1),
    gr.Textbox(label="全ての結果", lines=20),
]

# The Gradio UI object wiring the widgets above to process_audio_text.
interface = gr.Interface(
    fn=process_audio_text,
    inputs=_INPUT_COMPONENTS,
    outputs=_OUTPUT_COMPONENTS,
    title="音声と漢字仮名交じりテキストからふりがなを推定するツール v2 (工事中)",
    description="音素認識モデルと MeCab による読みの推定を行います。",
)

# Launch the interface only when this file is run directly, not when it is imported.
if __name__ == "__main__":
    interface.launch()