# app.py import gradio as gr import warnings import torch from transformers import pipeline, WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor warnings.filterwarnings("ignore") # Load tokenizer and model tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium") model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium") processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium") # Set up the device device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') torch_dtype = torch.float32 # Initialize pipeline asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, device=device, torch_dtype=torch_dtype) def transcribe_audio(audio_file): # Perform transcription with torch.no_grad(): output = asr(audio_file, chunk_length_s=28, generate_kwargs={"num_beams": 5, "task": "transcribe", "language": "no"}) return output["text"] # Create Gradio interface iface = gr.Interface( fn=transcribe_audio, inputs=gr.Audio(type="filepath"), outputs="text", title="Audio Transcription App", description="Upload an audio file to get the transcription", theme="default", layout="vertical", live=False ) from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM from pydub import AudioSegment import soundfile as sf import numpy as np import os import nltk from fpdf import FPDF import time nltk.download('punkt') # transcription processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic") transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic") # summarization summarization_tokenizer = AutoTokenizer.from_pretrained("NbAiLab/norbert-summarization") summarization_model = AutoModelForSeq2SeqLM.from_pretrained("NbAiLab/norbert-summarization") # setup device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') torch_dtype = torch.float32 # move 'em transcription_model.to(device) summarization_model.to(device) # PS. model needs to be told to use graph-based summary method (Lexname?) def convert_to_wav(audio_file): audio = AudioSegment.from_file(audio_file, format="m4a") wav_file = "temp.wav" audio.export(wav_file, format="wav") return wav_file def transcribe_audio(audio_file, batch_size=4): start_time = time.time() # Convert .m4a to .wav if audio_file.endswith(".m4a"): audio_file = convert_to_wav(audio_file) audio_input, sample_rate = sf.read(audio_file) chunk_size = 16000 * 30 chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)] transcription = "" for i in range(0, len(chunks), batch_size): batch_chunks = chunks[i:i + batch_size] inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True) inputs = inputs.to(device) attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None with torch.no_grad(): output = transcription_model.generate( inputs.input_features, max_length=2048, # Increase max_length for longer outputs num_beams=7, task="transcribe", attention_mask=attention_mask, # forced_decoder_ids=None, # OBS! forced_decoder_ids must not be set. Just marked it out for, just in case.. language="no" ) transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " " end_time = time.time() transcription_time = end_time - start_time word_count = len(transcription.split()) result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}" return transcription.strip(), result def summarize_text(text): inputs = summarization_tokenizer([text], max_length=1024, return_tensors="pt", truncation=True) inputs = inputs.to(device) summary_ids = summarization_model.generate(inputs.input_ids, num_beams=4, max_length=150, early_stopping=True) summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True) return summary # HTML syntax for imagery image_html = """