|
import gradio as gr |
|
import librosa |
|
import torch |
|
|
|
from src import CandidateGenerator |
|
|
|
# Default to the CPU and switch to the GPU only when CUDA is usable.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Single generator instance created at import time and reused by every request.
candidate_generator = CandidateGenerator(device)
|
|
|
|
|
def process_audio_text(audio_file, text, num_candidates):
    """Generate reading candidates for ``text`` from an audio recording.

    The audio is loaded with ``librosa`` at a 16 kHz sampling rate and passed,
    together with the text, to the module-level ``candidate_generator``.

    Args:
        audio_file: Path of the audio file, or ``None`` when no audio was
            provided.
        text: The input text. ``None``, an empty string, or whitespace-only
            text is treated as missing.
        num_candidates: Number of candidates to request; converted with
            ``int()`` before being handed to the generator.

    Returns:
        A 3-tuple of strings ``(candidates, hubert_prediction, debug)``:
        one line per candidate (MeCab cost, CTC loss, phonemes), a single
        line for the HuBERT prediction (CTC loss, phonemes), and the
        ``repr()`` of the raw result. If the audio or the text is missing,
        the first element is an error message and the other two are empty.
    """
    # Guard against a missing text value as well as an empty one, so that a
    # ``None`` text yields the user-facing message instead of AttributeError.
    if audio_file is None or text is None or not text.strip():
        return "音声とテキストの両方が必要です。", "", ""

    # Only the samples are needed; the returned sampling rate is discarded.
    audio_16khz, _ = librosa.load(audio_file, sr=16000)

    results = candidate_generator.generate(text, audio_16khz, int(num_candidates))

    candidates_str = "\n".join(
        f"mecab_cost: {candidate['mecab_cost']}, "
        f"ctc_loss: {candidate['ctc_loss']:.3f}, "
        f"phonemes: {' '.join(candidate['phonemes'])}"
        for candidate in results["candidates"]
    )

    hubert_pred = results["hubert_prediction"]
    hubert_output = (
        f"ctc_loss: {hubert_pred['ctc_loss']:.3f}, "
        f"phonemes: {' '.join(hubert_pred['phonemes'])}"
    )

    # Raw result, exposed unformatted for debugging.
    debug_output = repr(results)

    return candidates_str, hubert_output, debug_output
|
|
|
|
|
# Input widgets, listed in the positional order of process_audio_text's
# parameters: audio file path, text, number of candidates.
_input_components = [
    gr.Audio(type="filepath", label="音声ファイル"),
    gr.Textbox(
        label="テキスト",
        placeholder="漢字仮名交じりのテキストを入力してください",
    ),
    gr.Slider(minimum=1, maximum=20, value=10, step=1, label="MeCab の候補数"),
]

# Output widgets, one per string returned by process_audio_text:
# candidate list, HuBERT prediction, raw debug dump.
_output_components = [
    gr.Textbox(label="候補", lines=10),
    gr.Textbox(label="HuBERT による予測", lines=1),
    gr.Textbox(label="全ての結果", lines=20),
]

# The web UI that wires the widgets above to process_audio_text.
interface = gr.Interface(
    fn=process_audio_text,
    inputs=_input_components,
    outputs=_output_components,
    title="音声と漢字仮名交じりテキストからふりがなを推定するツール v2 (工事中)",
    description="音素認識モデルと MeCab による読みの推定を行います。",
)


if __name__ == "__main__":
    # Start the UI server only when this file is executed as a script.
    interface.launch()
|
|