"""Gradio front end for reading (furigana) estimation from audio plus text.

The UI takes an audio file, a text string and a candidate count, hands them
to ``CandidateGenerator.generate`` and displays the formatted results.
"""

from typing import Optional, Tuple

import gradio as gr
import librosa
import torch

from src import CandidateGenerator

# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
candidate_generator = CandidateGenerator(device)


def _format_acoustic(entry) -> str:
    """Format the ``ctc_loss`` / ``phonemes`` part shared by all result rows."""
    return (
        f"ctc_loss: {entry['ctc_loss']:.3f}, "
        f"phonemes: {' '.join(entry['phonemes'])}"
    )


def process_audio_text(
    audio_file: Optional[str],
    text: Optional[str],
    num_candidates: float,
) -> Tuple[str, str, str]:
    """Run candidate generation for one audio/text pair and format the output.

    Args:
        audio_file: Path of the audio file supplied by the UI, or ``None``
            when nothing was provided.
        text: Text entered by the user. ``None``, empty, and whitespace-only
            values are all treated as missing input.
        num_candidates: Requested number of candidates; converted with
            ``int()`` before being passed to the generator.

    Returns:
        A 3-tuple of strings: one line per candidate (MeCab cost, CTC loss,
        phonemes), the single HuBERT prediction line, and ``repr`` of the raw
        result object. When an input is missing, the first element carries
        the error message and the other two are empty.
    """
    # ``text`` may be None (e.g. a null value sent to the endpoint); calling
    # ``.strip()`` on it would raise, so check truthiness first.
    if audio_file is None or not text or not text.strip():
        return "音声とテキストの両方が必要です。", "", ""

    # Load the audio at a 16 kHz sampling rate; the returned rate is not needed.
    audio_16khz, _ = librosa.load(audio_file, sr=16000)

    results = candidate_generator.generate(text, audio_16khz, int(num_candidates))

    candidates_str = "\n".join(
        f"mecab_cost: {candidate['mecab_cost']}, {_format_acoustic(candidate)}"
        for candidate in results["candidates"]
    )
    hubert_output = _format_acoustic(results["hubert_prediction"])
    debug_output = repr(results)

    return candidates_str, hubert_output, debug_output


# UI wiring: three inputs (audio path, text, candidate count) map onto the
# three string outputs returned by ``process_audio_text``.
interface = gr.Interface(
    fn=process_audio_text,
    inputs=[
        gr.Audio(type="filepath", label="音声ファイル"),
        gr.Textbox(
            label="テキスト", placeholder="漢字仮名交じりのテキストを入力してください"
        ),
        gr.Slider(minimum=1, maximum=20, value=10, step=1, label="MeCab の候補数"),
    ],
    outputs=[
        gr.Textbox(label="候補", lines=10),
        gr.Textbox(label="HuBERT による予測", lines=1),
        gr.Textbox(label="全ての結果", lines=20),
    ],
    title="音声と漢字仮名交じりテキストからふりがなを推定するツール v2 (工事中)",
    description="音素認識モデルと MeCab による読みの推定を行います。",
)

if __name__ == "__main__":
    interface.launch()