Spaces:
Build error
Build error
| # app.py | |
import os
import tempfile
import warnings

import ffmpeg
import gradio as gr
import soundfile as sf
import spaces
import torch
from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
# Suppress every Python warning raised from here on.
warnings.filterwarnings("ignore")

# Hub checkpoint shared by the tokenizer, the model and the processor.
MODEL_ID = "NbAiLabBeta/nb-whisper-medium"

tokenizer = WhisperTokenizer.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
processor = WhisperProcessor.from_pretrained(MODEL_ID)

# Use CUDA when torch reports it as available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch_dtype = torch.float32

# Place the model weights on the selected device.
model.to(device)
def convert_audio_format(audio_path):
    """Convert an audio file to a 16 kHz mono WAV file with ffmpeg.

    Args:
        audio_path: Path of the input audio file, in any format ffmpeg can
            read.

    Returns:
        Path of the converted WAV file. Every call writes to its own
        temporary file, so the caller should delete it once it has been read.

    Raises:
        ffmpeg.Error: If the ffmpeg process exits with an error.
    """
    # A unique file per call keeps concurrent requests from overwriting each
    # other's audio, which a single fixed output name cannot guarantee.
    fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # ffmpeg reopens the path itself
    try:
        (
            ffmpeg.input(audio_path)
            # ac=1 downmixes to one channel so the decoded signal is 1-D.
            .output(output_path, format="wav", ar="16000", ac=1)
            .run(overwrite_output=True)
        )
    except Exception:
        # Do not leave a stale temporary file behind when conversion fails.
        if os.path.exists(output_path):
            os.remove(output_path)
        raise
    return output_path
def transcribe_audio(audio_file, batch_size=4):
    """Transcribe an audio file with the module-level Whisper model.

    The file is converted to WAV, split into 28-second chunks, and the chunks
    are decoded in batches with beam search.

    Args:
        audio_file: Path of the audio file to transcribe. A missing value
            (``None`` or an empty string) yields an empty transcription.
        batch_size: Number of chunks passed to the model per ``generate``
            call. Non-integer numbers are truncated and values below 1 are
            treated as 1.

    Returns:
        The transcriptions of all chunks joined by single spaces, with
        leading and trailing whitespace removed.
    """
    if not audio_file:
        # The handler can be triggered before any audio has been provided.
        return ""

    # range() needs a positive integer step; a UI widget may deliver a float.
    batch_size = max(1, int(batch_size))

    audio_path = convert_audio_format(audio_file)
    try:
        audio_input, sample_rate = sf.read(audio_path)
    finally:
        # The converted copy is only needed until its samples are loaded.
        if audio_path != audio_file and os.path.exists(audio_path):
            os.remove(audio_path)

    if audio_input.ndim > 1:
        # soundfile returns (frames, channels) for multi-channel files;
        # average the channels so every chunk is a 1-D signal.
        audio_input = audio_input.mean(axis=1)

    chunk_size = sample_rate * 28  # 28-second chunks
    chunks = [
        audio_input[start:start + chunk_size]
        for start in range(0, len(audio_input), chunk_size)
    ]

    texts = []
    for start in range(0, len(chunks), batch_size):
        batch_chunks = chunks[start:start + batch_size]
        # Whisper's encoder only accepts fixed 30-second feature windows, so
        # pad to the extractor's maximum length, not to the longest chunk.
        inputs = processor(
            batch_chunks,
            sampling_rate=sample_rate,
            return_tensors="pt",
            padding="max_length",
        )
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask if "attention_mask" in inputs else None
        with torch.no_grad():
            # NOTE(review): max_length=1024 is kept from the original; confirm
            # it does not exceed the decoder's maximum target positions.
            output = model.generate(
                inputs.input_features,
                max_length=1024,
                num_beams=7,
                attention_mask=attention_mask,
            )
        texts.extend(processor.batch_decode(output, skip_special_tokens=True))
    return " ".join(texts).strip()
# HTML | Banner image
# Markup for two centered images: a banner at 87% width, then a picture at
# 50% width with a 20px top margin.
banner_html = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
</div>
<div style="text-align: center; margin-top: 20px;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.jpg" alt="image" width="50%" height="auto">
</div>
"""
# Gradio interface
# Components are created in this order: banner, heading, audio upload,
# batch-size slider, transcription text box and the transcribe button.
with gr.Blocks() as iface:
    gr.HTML(banner_html)
    # NOTE(review): the heading text looks mis-encoded (mojibake); it is
    # reproduced unchanged here -- confirm the intended characters.
    gr.Markdown("# ππ―π’ππ’π ππππ ππΌπΎπ¦Ύβ‘ @{NbAiLab/whisper-norwegian-medium}\nUpload audio file:β")
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, label="Batch Size")
    transcription_output = gr.Textbox()
    transcribe_button = gr.Button("Transcribe")
    # Clicking the button sends the uploaded file path and the slider value
    # to transcribe_audio and writes its return value to the text box.
    transcribe_button.click(
        fn=transcribe_audio,
        inputs=[audio_input, batch_size_input],
        outputs=transcription_output,
    )

# Launch interface
iface.launch(share=True, debug=True)