import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import gradio as gr
import warnings
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoProcessor, AutoModelForSpeechSeq2Seq
from pydub import AudioSegment
import soundfile as sf
import numpy as np
from fpdf import FPDF
from PIL import Image
import time
import os
import spaces

warnings.filterwarnings("ignore")

HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')

# Speech-to-text model and processor (Norwegian Whisper variant)
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# torch_dtype = torch.float32
model.to(device)

# asr = pipeline("automatic-speech-recognition", model=model, processor=processor.tokenizer, device=device, torch_dtype=torch.float32)
pipe = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large-semantic", device=device, torch_dtype=torch.float32)

def transcribe_audio(audio_file):
    with torch.no_grad():
        output = pipe(audio_file, chunk_length_s=28, generate_kwargs={"num_beams": 8, "task": "transcribe", "language": "no"})
    return output["text"]

# Gradio UI
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Transcription App",
    description="Upload an audio file to get the transcription",
    theme="default",
    live=False
)

# Summarization model
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

# Move t5-base to the same device as the ASR model
summarization_model.to(device)

def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file

# Configuring pad_token_id this way proved unreliable, so it stays disabled:
# if processor.tokenizer.pad_token_id is None:
#     processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id + 1
# Sanity check
# assert processor.tokenizer.pad_token_id != processor.tokenizer.eos_token_id, \
#     "pad_token_id and eos_token_id must be distinct, and they are not"

@spaces.GPU()
def transcribe_audio(audio_file, batch_size=4):
    # NOTE: this batched implementation shadows the simpler pipeline-based
    # transcribe_audio above; iface has already captured the earlier function object.
    start_time = time.time()
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)
    # Assumes 16 kHz mono input; split into 30-second chunks
    audio_input, sample_rate = sf.read(audio_file)
    chunk_size = 16000 * 30
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask if 'attention_mask' in inputs else None
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=2048,
                num_beams=8,
                task="transcribe",
                language="no",
                attention_mask=attention_mask,
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())
    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    return transcription.strip(), result
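# Example usage, as a hedged sketch: "sample.wav" is a placeholder path, not a
# file shipped with this app. Left commented out so the Space does not attempt
# to run it at startup:
#
#     plain_text, report = transcribe_audio("sample.wav", batch_size=4)
#     print(report)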
{word_count}" return transcription.strip(), result # Graph-based summarization|TextRank def summarize_text(text): sentences = sent_tokenize(text) if len(sentences) == 0: return "" tfidf_vectorizer = TfidfVectorizer() tfidf_matrix = tfidf_vectorizer.fit_transform(sentences) similarity_matrix = cosine_similarity(tfidf_matrix) nx_graph = nx.from_numpy_array(similarity_matrix) scores = nx.pagerank(nx_graph) ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True) # Select top N sentences (e.g., 3 sentences for the summary) top_n = 3 summary = " ".join([s for _, s in ranked_sentences[:top_n]]) return summary # HTML syntax for imagery image_html = """