Spaces:

prj-beatrice
/

furigana-from-speech-and-text-v2

Sleeping

furigana-from-speech-and-text-v2 / app.py

Project Beatrice

Initial commit

5124d5d about 2 months ago

1.94 kB

	import gradio as gr
	import librosa
	import torch

	from src import CandidateGenerator

	# Choose the torch device string: "cuda" when CUDA is available, else "cpu".
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Built once at import time; process_audio_text uses this module-level instance.
	candidate_generator = CandidateGenerator(device)


	def process_audio_text(audio_file, text, num_candidates):
	    """Generate reading candidates for ``text`` using the speech in ``audio_file``.

	    Args:
	        audio_file: Path to the audio file, or ``None`` when no audio was
	            supplied.
	        text: The text to process. ``None``, empty and whitespace-only
	            values are all treated as missing.
	        num_candidates: Number of candidates to request; converted with
	            ``int()`` before it is handed to the candidate generator.

	    Returns:
	        A 3-tuple of strings:

	        * one line per candidate (``mecab_cost``, ``ctc_loss``, phonemes),
	        * a single line describing the HuBERT prediction,
	        * ``repr()`` of the raw result dictionary, for debugging.

	        When either input is missing, the first element is a user-facing
	        message and the other two elements are empty strings.
	    """
	    # Guard against None as well as blank text: calling .strip() on None
	    # would raise AttributeError instead of showing the message below.
	    if audio_file is None or not text or not text.strip():
	        return "音声とテキストの両方が必要です。", "", ""

	    # Load the audio at a 16 kHz sampling rate; the returned rate is unused.
	    audio_16khz, _ = librosa.load(audio_file, sr=16000)

	    results = candidate_generator.generate(text, audio_16khz, int(num_candidates))

	    # One formatted line per candidate, in the order the generator returned them.
	    candidates_str = "\n".join(
	        f"mecab_cost: {candidate['mecab_cost']}, "
	        f"ctc_loss: {candidate['ctc_loss']:.3f}, "
	        f"phonemes: {' '.join(candidate['phonemes'])}"
	        for candidate in results["candidates"]
	    )

	    hubert_pred = results["hubert_prediction"]
	    hubert_output = (
	        f"ctc_loss: {hubert_pred['ctc_loss']:.3f}, "
	        f"phonemes: {' '.join(hubert_pred['phonemes'])}"
	    )

	    # Raw dump of everything the generator returned.
	    debug_output = repr(results)

	    return candidates_str, hubert_output, debug_output


	# Input widgets, in the positional order of process_audio_text's parameters:
	# audio file path, text, number of candidates.
	_input_components = [
	    gr.Audio(label="音声ファイル", type="filepath"),
	    gr.Textbox(
	        placeholder="漢字仮名交じりのテキストを入力してください",
	        label="テキスト",
	    ),
	    gr.Slider(label="MeCab の候補数", minimum=1, maximum=20, step=1, value=10),
	]

	# Output widgets, one per string in the tuple returned by process_audio_text.
	_output_components = [
	    gr.Textbox(lines=10, label="候補"),
	    gr.Textbox(lines=1, label="HuBERT による予測"),
	    gr.Textbox(lines=20, label="全ての結果"),
	]

	interface = gr.Interface(
	    title="音声と漢字仮名交じりテキストからふりがなを推定するツール v2 (工事中)",
	    description="音素認識モデルと MeCab による読みの推定を行います。",
	    fn=process_audio_text,
	    inputs=_input_components,
	    outputs=_output_components,
	)

	# Launch the UI only when this file is executed as a script.
	if __name__ == "__main__":
	    interface.launch()