# app.py
# Version: 1.07 (08.24.24), ALPHA
#---------------------------------------------------------------------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#---------------------------------------------------------------------------------------------------------------------------------------------
"""Gradio app that transcribes audio to Norwegian text with NbAiLab/nb-whisper-large.

The module also defines text-cleaning and summary helpers (graph-based,
LexRank, TextRank, model-based) that are not wired into the UI yet.
"""

import itertools
import os
import re
import subprocess
import time
import warnings
#import datetime
from pathlib import Path

# NOTE(review): `spaces` was the first import in the original file; presumably
# it has to be imported ahead of torch on ZeroGPU hardware — confirm before
# reordering.
import spaces
import gradio as gr
from PIL import Image
from pydub import AudioSegment
from fpdf import FPDF
import psutil
from gpuinfo import GPUInfo
#import pandas as pd
#import csv
import numpy as np
import torch
import torchaudio
import torchaudio.transforms as transforms
from transformers import pipeline, AutoModel, AutoTokenizer
import spacy
from spacy.lang.nb.stop_words import STOP_WORDS
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings("ignore")

# ------------header section------------
HEADER_INFO = """
# WEB APP ✨| Norwegian WHISPER Model Switch Work [Transkribering av lydfiler til norsk skrift]
""".strip()

# NOTE(review): this is a signed CDN link carrying an `Expires=1724881270`
# query parameter, so it has presumably stopped resolving — confirm and
# replace with a stable asset URL.
LOGO = "https://cdn-lfs-us-1.huggingface.co/repos/fe/3b/fe3bd7c8beece8b087fddcc2278295e7f56c794c8dcf728189f4af8bddc585e1/5112f67899d65e9797a7a60d05f983cf2ceefbe2f7cba74eeca93a4e7061becc?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27logo.png%3B+filename%3D%22logo.png%22%3B&response-content-type=image%2Fpng&Expires=1724881270&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNDg4MTI3MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2ZlLzNiL2ZlM2JkN2M4YmVlY2U4YjA4N2ZkZGNjMjI3ODI5NWU3ZjU2Yzc5NGM4ZGNmNzI4MTg5ZjRhZjhiZGRjNTg1ZTEvNTExMmY2Nzg5OWQ2NWU5Nzk3YTdhNjBkMDVmOTgzY2YyY2VlZmJlMmY3Y2JhNzRlZWNhOTNhNGU3MDYxYmVjYz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=ipo8wTjtC7R0QHbo%7Et9Q5CTaI3cZKxM0beajqlApfm5fh7%7EW-FULu1-ISL5bkowBSw9m5RdGoyOqj336OSS5fPD%7EnzYNmAMd3T5bx2-KfCDh6jz0HVECt8S7HeIu%7El2TetxrzL2tdHw4Np4Zpa8JKOnNnje24fF0Nr-xUS2dvPJf54rIL70-iWVXXhw8owxt0%7E1CJsUHC9oibp9B4mZcyWvvRldhDopiQBYELusZdTW3qvtTBK083WP3gHQxadQp8UDVTPZ0g3i112G2NfFJB%7Epa70XeN8m3E6ORx6pVH%7EW6IzjvmapWSF-tmXH-26wYG8aof%7E1U7enbR1w2QBTS-g__&Key-Pair-Id=K24J24Z295AEI9"

# NOTE(review): the sidebar body is empty in the file as received; presumably
# HTML markup (possibly referencing LOGO) belongs here — confirm.
SIDEBAR_INFO = f"""
"""


# ------------transcribe section------------
@spaces.GPU()
def convert_to_wav(filepath):
    """Convert an audio file to 16 kHz mono 16-bit PCM WAV using ffmpeg.

    Args:
        filepath: Path of the source audio file.

    Returns:
        Path of the written WAV file: the source path with its extension
        replaced by ``.wav``. When that would be the source file itself,
        ``_16k.wav`` is appended to the stem instead so ffmpeg never reads
        and writes the same file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ffmpeg executable cannot be found.
    """
    source = os.fspath(filepath)
    # splitext only touches the final extension; a plain str.replace would
    # also rewrite earlier occurrences and mangle extension-less paths.
    stem, _ = os.path.splitext(source)
    audio_file = f"{stem}.wav"
    if audio_file == source:
        audio_file = f"{stem}_16k.wav"
    # Argument list (no shell) keeps odd characters in file names from being
    # interpreted; -y avoids blocking on ffmpeg's overwrite prompt.
    subprocess.run(
        ["ffmpeg", "-y", "-i", source,
         "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", audio_file],
        check=True,
    )
    return audio_file


pipe = pipeline(
    "automatic-speech-recognition",
    model="NbAiLab/nb-whisper-large",
    chunk_length_s=30,
    generate_kwargs={'task': 'transcribe', 'language': 'no'},
)


def _load_waveform(audio):
    """Return ``(mono float32 numpy waveform, sample_rate)`` for *audio*.

    *audio* is either a file path or a ``(sample_rate, samples)`` tuple.
    """
    if isinstance(audio, (str, os.PathLike)):
        waveform, rate = torchaudio.load(audio)
        # Multi-channel input: keep the first channel only.
        if waveform.ndim > 1:
            waveform = waveform[0, :]
        return waveform.numpy(), rate

    rate, samples = audio
    samples = np.asarray(samples)
    # assumes (samples, channels) layout for multi-channel arrays, as produced
    # by gr.Audio(type="numpy") — TODO confirm
    if samples.ndim > 1:
        samples = samples[:, 0]
    if np.issubdtype(samples.dtype, np.integer):
        # Scale integer PCM into the [-1, 1] float range.
        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
    else:
        samples = samples.astype(np.float32)
    return samples, rate


def _gpu_metrics():
    """Return ``(utilization, memory)`` of the first GPU, or ``(0, 0)``."""
    try:
        utilization, memory = GPUInfo.gpu_usage()
    except Exception:
        # Metrics are informational only; a failed GPU query must not
        # discard a finished transcription.
        return 0, 0
    return (utilization[0] if utilization else 0,
            memory[0] if memory else 0)


@spaces.GPU()
def transcribe_audio(audio_file, batch_size=16, sample_rate =16000):
    """Transcribe an audio clip and report timing and system metrics.

    Args:
        audio_file: Path of the audio file, or a ``(sample_rate, samples)``
            tuple. ``None`` (nothing uploaded) yields an empty transcript.
        batch_size: Batch size forwarded to the ASR pipeline.
        sample_rate: Kept for interface compatibility; it is replaced by the
            rate read from the audio itself, as in the original code.

    Returns:
        Tuple ``(text, system_info)``: the stripped transcript and a
        multi-line string with memory, timing, word-count, GPU and CPU data.
    """
    if audio_file is None:
        return "", "*No audio file provided.*"

    waveform, sample_rate = _load_waveform(audio_file)

    start_time = time.time()
    # --pipe it
    with torch.no_grad():
        # The sampling rate travels with the raw samples in the input dict;
        # it is not a call-time keyword of the ASR pipeline.
        outputs = pipe(
            {"raw": waveform, "sampling_rate": sample_rate},
            batch_size=batch_size,
            return_timestamps=False,
        )
    output_time = time.time() - start_time

    text = outputs["text"]
    word_count = len(text.split())

    # --GPU metrics
    memory = psutil.virtual_memory()
    gpu_utilization, gpu_memory = _gpu_metrics()

    # --CPU metric (blocks for the one-second sampling interval)
    cpu_usage = psutil.cpu_percent(interval=1)

    # --system info string
    gib = 1024 * 1024 * 1024
    system_info = "\n".join([
        f"*Memory: {memory.total / gib:.2f}GB, used: {memory.percent}%, available: {memory.available / gib:.2f}GB.*",
        f"*Processing time: {output_time:.2f} seconds.*",
        f"*Number of words: {word_count}*",
        f"*GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}*",
        f"*CPU Usage: {cpu_usage}%*",
    ])

    return text.strip(), system_info


# ------------summary section------------
# ------------for app integration later------------
_URL_RE = re.compile(r'https?://\S+')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')


@spaces.GPU()
def clean_text(text):
    """Strip URLs and punctuation from *text* and collapse whitespace.

    Only the URL token itself is removed; text following a URL on the same
    line is kept.
    """
    text = _URL_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()


nlp = spacy.blank("nb")  # 'nb' ==> codename = Norwegian Bokmål
nlp.add_pipe('sentencizer')
# Imported explicitly rather than reached through attribute access on `spacy`.
spacy_stop_words = STOP_WORDS

# NOTE(review): AutoModel loads the bare nb-bert-large model, while
# summarize_text() calls generate() with max_length=1024 — presumably a
# seq2seq checkpoint was intended; confirm before wiring this into the UI.
summarization_model = AutoModel.from_pretrained("NbAiLab/nb-bert-large")
summarization_tokenizer = AutoTokenizer.from_pretrained("NbAiLab/nb-bert-large")
# pipe = pipeline("fill-mask", model="NbAiLab/nb-bert-large")


@spaces.GPU()
def preprocess_text(text):
    """Return *text* with Norwegian stop words removed, tokens space-joined."""
    doc = nlp(text)
    words = [token.text for token in doc
             if token.text.lower() not in spacy_stop_words]
    return ' '.join(words)


@spaces.GPU()
def summarize_text(text):
    """Generate a summary of *text* with ``summarization_model``.

    Stop words are removed first, the input is truncated to 1024 tokens, and
    the first generated sequence is decoded without special tokens.
    """
    preprocessed_text = preprocess_text(text)
    inputs = summarization_tokenizer(preprocessed_text, max_length=1024,
                                     return_tensors="pt", truncation=True)
    # Follow the model's own device so inputs and weights always match.
    inputs = inputs.to(summarization_model.device)
    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5,
                                               max_length=150, early_stopping=True)
    return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


@spaces.GPU()
def build_similarity_matrix(sentences, stop_words):
    """Build a graph whose edge weights count words shared by two sentences.

    Args:
        sentences: Sequence of token lists, one per sentence.
        stop_words: Unused; kept so existing callers keep working.

    Returns:
        ``networkx.Graph`` with one node per sentence index and, for every
        pair of distinct sentences, an edge weighted by the number of tokens
        they have in common.
    """
    token_sets = [set(tokens) for tokens in sentences]
    graph = nx.Graph()
    # Register every sentence up front so a lone sentence still gets a node
    # (and therefore a PageRank score).
    graph.add_nodes_from(range(len(token_sets)))
    # The graph is undirected, so each unordered pair is visited once.
    for (i, tokens_a), (j, tokens_b) in itertools.combinations(enumerate(token_sets), 2):
        graph.add_edge(i, j, weight=len(tokens_a & tokens_b))
    return graph


# PageRank
@spaces.GPU()
def graph_based_summary(text, num_paragraphs=3):
    """Return the *num_paragraphs* top sentences ranked by word-overlap PageRank.

    If *text* has fewer sentences than requested, all of them are returned.
    Sentences are joined with spaces in descending score order.
    """
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    if len(sentences) < num_paragraphs:
        return ' '.join(sentences)

    stop_words = spacy_stop_words
    filtered_tokens = [
        [token.text for token in nlp(sent) if token.text.lower() not in stop_words]
        for sent in sentences
    ]
    similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words)

    scores = nx.pagerank(similarity_matrix)
    ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)),
                              reverse=True)
    return ' '.join([sent for _, sent in ranked_sentences[:num_paragraphs]])


def _tfidf_rank_summary(text, num_paragraphs, threshold=None):
    """Rank sentences by PageRank over TF-IDF cosine similarity.

    Shared implementation of lex_rank_summary() and text_rank_summary();
    similarities below *threshold* are zeroed when a threshold is given.
    """
    sentences = [sent.text for sent in nlp(text).sents]
    if len(sentences) < num_paragraphs:
        return ' '.join(sentences)

    wanted = max(num_paragraphs, 0)
    vectorizer = TfidfVectorizer(stop_words=list(spacy_stop_words))
    try:
        tfidf = vectorizer.fit_transform(sentences)
    except ValueError:
        # scikit-learn raises ValueError when no vocabulary is left (e.g. the
        # sentences hold only stop words); fall back to the leading sentences.
        return ' '.join(sentences[:wanted])

    similarity_matrix = cosine_similarity(tfidf, tfidf)
    if threshold is not None:
        # Apply threshold@similarity matrix
        similarity_matrix[similarity_matrix < threshold] = 0

    scores = nx.pagerank(nx.from_numpy_array(similarity_matrix))
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)),
                              reverse=True)
    return ' '.join(s for _, s in ranked_sentences[:wanted])


# LexRank
@spaces.GPU()
def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
    """Return the top *num_paragraphs* sentences by thresholded TF-IDF PageRank.

    Cosine similarities below *threshold* are set to zero before ranking.
    If *text* has fewer sentences than requested, all of them are returned
    as a single space-joined string.
    """
    return _tfidf_rank_summary(text, num_paragraphs, threshold=threshold)


# TextRank
@spaces.GPU()
def text_rank_summary(text, num_paragraphs=3):
    """Return the top *num_paragraphs* sentences by TF-IDF PageRank.

    If *text* has fewer sentences than requested, all of them are returned
    as a single space-joined string.
    """
    return _tfidf_rank_summary(text, num_paragraphs)


iface = gr.Blocks()

with iface:
    gr.HTML(SIDEBAR_INFO)
    gr.Markdown(HEADER_INFO)
    # type="filepath" hands transcribe_audio a path it can load with torchaudio.
    audio_input = gr.Audio(label="Upload Audio File", type="filepath")
    transcribed_text = gr.Textbox(label="Transcribed Text")
    system_info = gr.Textbox(label="System Info")
    transcribe_button = gr.Button("Transcribe")
    transcribe_button.click(fn=transcribe_audio, inputs=audio_input,
                            outputs=[transcribed_text, system_info])

iface.launch(share=True, debug=True)