import gradio as gr
import torch
import soundfile as sf
import librosa
from transformers import pipeline

# Half precision is generally unsupported on CPU, so fall back to float32 there.
use_cuda = torch.cuda.is_available()
pipe = pipeline(
    "automatic-speech-recognition",
    model="antony66/whisper-large-v3-russian",
    torch_dtype=torch.float16 if use_cuda else torch.float32,
    device=0 if use_cuda else -1,
)

def transcribe(audio_data):
    print(f"Received audio data: {audio_data}")
    if audio_data is None:
        return "Ошибка: не получены аудиоданные"  # "Error: no audio data received"

    wav_file = "temp_audio.wav"
    if isinstance(audio_data, tuple):
        # gr.Audio(type="numpy") yields (sample_rate, array), in that order.
        sample_rate, audio_array = audio_data
        sf.write(wav_file, audio_array, sample_rate)
    elif isinstance(audio_data, str):
        # gr.Audio(type="filepath") yields a path; resample to the 16 kHz Whisper expects.
        audio_array, sample_rate = librosa.load(audio_data, sr=16000)
        sf.write(wav_file, audio_array, sample_rate)
    else:
        return "Ошибка: неизвестный формат аудиоданных"  # "Error: unknown audio data format"

    result = pipe(wav_file)
    return result["text"]

with gr.Blocks() as app:
    gr.Markdown("## Распознавание речи с Whisper")  # "Speech recognition with Whisper"
    audio_input = gr.Audio(type="filepath")
    text_output = gr.Textbox(label="Распознанный текст")  # "Recognized text"
    btn = gr.Button("Распознать")  # "Transcribe"
    btn.click(transcribe, inputs=audio_input, outputs=text_output)

app.launch(debug=True)
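A minimal sketch of transcribing recordings longer than Whisper's 30-second window with the same model, using the standard transformers ASR-pipeline options chunk_length_s, return_timestamps, and generate_kwargs; the file name long_recording.wav is a hypothetical placeholder, and this snippet is an illustration alongside the app above, not part of it.

# Standalone sketch: chunked long-form transcription with the same model.
import torch
from transformers import pipeline

use_cuda = torch.cuda.is_available()
long_pipe = pipeline(
    "automatic-speech-recognition",
    model="antony66/whisper-large-v3-russian",
    torch_dtype=torch.float16 if use_cuda else torch.float32,
    device=0 if use_cuda else -1,
)

# "long_recording.wav" is a hypothetical input file.
result = long_pipe(
    "long_recording.wav",
    chunk_length_s=30,                        # process audio in 30 s windows
    return_timestamps=True,                   # also return per-chunk timestamps
    generate_kwargs={"language": "russian"},  # pin decoding to Russian
)
print(result["text"])
for chunk in result["chunks"]:
    print(chunk["timestamp"], chunk["text"])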