import time
import os
import spaces
import contextlib
import warnings
warnings.filterwarnings("ignore")

from pydub import AudioSegment

# If m4a audio, convert to wav (Python)
def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file

import torch
from transformers import AutoProcessor, pipeline

# Initialize processor and pipeline
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32

# Set distinct pad and eos tokens
if processor.tokenizer.pad_token_id is None:
    processor.tokenizer.pad_token_id = processor.tokenizer.convert_tokens_to_ids("[PAD]")
if processor.tokenizer.eos_token_id is None:
    processor.tokenizer.eos_token_id = processor.tokenizer.convert_tokens_to_ids("[EOS]")

# Route the ASR pipeline to the detected device
pipe = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large", torch_dtype=torch_dtype, device=device)

# language = "no"
# task = "transcribe"

def transcribe_audio(audio_file):
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    start_time = time.time()

    with torch.no_grad():
        output = pipe(
            audio_file,
            chunk_length_s=30,
            generate_kwargs={
                # "task": task,
                "pad_token_id": processor.tokenizer.pad_token_id,
                "eos_token_id": processor.tokenizer.eos_token_id
            }
        )

    text = output["text"]
    end_time = time.time()
    output_time = end_time - start_time
    word_count = len(text.split())
    result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {word_count}"
    return text, result

# [VERSION 3: full-on w/ 3 styles for summarization]
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import re

nltk.download('punkt')
nltk.download('stopwords')

WHITESPACE_HANDLER = lambda k: re.sub(r'\s+', ' ', re.sub(r'\n+', ' ', k.strip()))

def clean_text(text):
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
    text = re.sub(r'\\', ' ', str(text))  # was r'\' (invalid pattern); strip backslashes
    text = re.sub(r'\'', '', str(text))
    text = re.sub(r'«', '', str(text))
    text = re.sub(r'»', '', str(text))
    text = re.sub(r'–', '-', str(text))
    text = re.sub(r'…', '.', str(text))
    text = re.sub(r'[^\x00-\x7F]+', ' ', str(text))
    return text

def preprocess_text(text):
    try:
        words = word_tokenize(text)
        stop_words = set(stopwords.words('norwegian'))
        words_without_stopwords = [word for word in words if word.lower() not in stop_words]
        processed_text = ' '.join(words_without_stopwords)
        return processed_text
    except Exception as e:
        # st.error requires Streamlit, which is not imported here; report to stdout instead
        print(f"Error during text preprocessing: {e}")
        return None

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", return_dict=True, torch_dtype=torch.float16)
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model.to(device)

def summarize_text(text):
    preprocessed_text = preprocess_text(text)
    if preprocessed_text is None:
        return None
    # t5-base expects the "summarize: " task prefix
    inputs = summarization_tokenizer(["summarize: " + text], max_length=1024, return_tensors="pt", truncation=True)
    inputs = inputs.to(device)
    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
    summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
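# Illustrative sketch (not part of the app wiring below): the transcription and
# abstractive summarization steps above are meant to chain together.
# "sample.m4a" is a hypothetical local file used only for this example.
#
#   transcription, stats = transcribe_audio("sample.m4a")
#   print(stats)                          # timing and word count
#   print(summarize_text(transcription))  # abstractive t5-base summary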
def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = nx.Graph()
    for i, tokens_a in enumerate(sentences):
        for j, tokens_b in enumerate(sentences):
            if i != j:
                common_words = set(tokens_a) & set(tokens_b)
                similarity_matrix.add_edge(i, j, weight=len(common_words))
    return similarity_matrix

def graph_based_summary(text, num_paragraphs=3):
    sentences = text.strip().split(".")
    if len(sentences) < num_paragraphs:
        return sentences
    sentence_tokens = [word_tokenize(sent) for sent in sentences]
    stop_words = set(stopwords.words('norwegian'))
    filtered_tokens = [[word for word in tokens if word.lower() not in stop_words] for tokens in sentence_tokens]
    similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words)
    scores = nx.pagerank(similarity_matrix)
    ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
    summary = [sent for _, sent in ranked_sentences[:num_paragraphs]]
    return summary

def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < num_paragraphs:
        return sentences
    stop_words = set(stopwords.words('norwegian'))
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X, X)
    # Zero out edges below the similarity threshold
    for i in range(len(similarity_matrix)):
        for j in range(len(similarity_matrix[i])):
            if similarity_matrix[i][j] < threshold:
                similarity_matrix[i][j] = 0.0
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
    return summary

def text_rank_summary(text, num_paragraphs=3):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < num_paragraphs:
        return ' '.join(sentences)
    stop_words = set(stopwords.words('norwegian'))
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X, X)
    nx_graph = nx.from_numpy_array(similarity_matrix)  # graph: nodes are sentences, edge weights are similarity scores
    scores = nx.pagerank(nx_graph)  # PageRank algorithm scores each sentence
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)  # rank by PageRank score
    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]  # top sentences for the summary
    return ' '.join(summary)

banner_html = """