Spaces:

camparchimedes
/

nb

Build error

App Files Files

nb / app.py

camparchimedes

Update app.py

1de6e28 verified 10 months ago

raw

history blame

13 kB

	# app.py
	# Version: 1.07 (08.24.24), ALPHA
	#---------------------------------------------------------------------------------------------------------------------------------------------
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#---------------------------------------------------------------------------------------------------------------------------------------------
	import spaces
	import gradio as gr
	from PIL import Image
	from pydub import AudioSegment
	import os
	import re
	import time
	import warnings
	#import datetime
	import subprocess
	from pathlib import Path
	from fpdf import FPDF

	import psutil
	from gpuinfo import GPUInfo
	#import pandas as pd
	#import csv
	import numpy as np
	import torch
	#import torchaudio
	#import torchaudio.transforms as transforms

	from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

	import spacy
	import networkx as nx
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	warnings.filterwarnings("ignore")

	# ------------header section------------
	# Markdown heading and tagline shown at the top of the page.
	# The pipe is written plainly: a backslash before "|" is not a valid Python
	# escape sequence and would leave a literal backslash in the string.
	HEADER_INFO = """
	# WEB APP ✨| Norwegian WHISPER Model
	Switch Work [Transkribering av lydfiler til norsk skrift]
	""".strip()
	# URL of the logo image embedded in the page banner.
	# NOTE(review): the query string carries Expires/Policy/Signature parameters,
	# so this looks like a time-limited signed link — confirm it still resolves.
	LOGO = "https://cdn-lfs-us-1.huggingface.co/repos/fe/3b/fe3bd7c8beece8b087fddcc2278295e7f56c794c8dcf728189f4af8bddc585e1/5112f67899d65e9797a7a60d05f983cf2ceefbe2f7cba74eeca93a4e7061becc?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27logo.png%3B+filename%3D%22logo.png%22%3B&response-content-type=image%2Fpng&Expires=1724881270&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNDg4MTI3MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2ZlLzNiL2ZlM2JkN2M4YmVlY2U4YjA4N2ZkZGNjMjI3ODI5NWU3ZjU2Yzc5NGM4ZGNmNzI4MTg5ZjRhZjhiZGRjNTg1ZTEvNTExMmY2Nzg5OWQ2NWU5Nzk3YTdhNjBkMDVmOTgzY2YyY2VlZmJlMmY3Y2JhNzRlZWNhOTNhNGU3MDYxYmVjYz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=ipo8wTjtC7R0QHbo%7Et9Q5CTaI3cZKxM0beajqlApfm5fh7%7EW-FULu1-ISL5bkowBSw9m5RdGoyOqj336OSS5fPD%7EnzYNmAMd3T5bx2-KfCDh6jz0HVECt8S7HeIu%7El2TetxrzL2tdHw4Np4Zpa8JKOnNnje24fF0Nr-xUS2dvPJf54rIL70-iWVXXhw8owxt0%7E1CJsUHC9oibp9B4mZcyWvvRldhDopiQBYELusZdTW3qvtTBK083WP3gHQxadQp8UDVTPZ0g3i112G2NfFJB%7Epa70XeN8m3E6ORx6pVH%7EW6IzjvmapWSF-tmXH-26wYG8aof%7E1U7enbR1w2QBTS-g__&Key-Pair-Id=K24J24Z295AEI9"
	# HTML snippet that centres the logo and scales it to the full container width.
	SIDEBAR_INFO = f"""
	<div align="center">
	<img src="{LOGO}" style="width: 100%; height: auto;"/>
	</div>
	"""

	# ------------transcribe section------------
	# Resolve the device first so the dtype can follow it.
	device = "cuda" if torch.cuda.is_available() else "cpu"
	# torch has no ``bfloat32`` attribute, so the original assignment raised
	# AttributeError at import time. Use bfloat16 on CUDA and plain float32 on CPU.
	dtype = torch.bfloat16 if device == "cuda" else torch.float32

	def convert_to_wav(filepath):
	    """Transcode an audio file to 16 kHz, mono, 16-bit PCM WAV with ffmpeg.

	    The function is pure CPU/file work, so it is intentionally not wrapped in
	    ``@spaces.GPU()``.

	    Args:
	        filepath: Path of the source audio file.

	    Returns:
	        Path of the WAV file that was written: the source path with its
	        extension replaced by ``.wav``, or ``<name>_16k.wav`` when the source
	        already ends in ``.wav``.

	    Raises:
	        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
	        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
	    """
	    source = os.fspath(filepath)
	    # Swap only the real extension; a plain str.replace would also rewrite
	    # earlier occurrences of the suffix and mangles extension-less paths.
	    root, _ = os.path.splitext(source)
	    audio_file = f"{root}.wav"
	    if audio_file == source:
	        # ffmpeg cannot read and write the same file in one run.
	        audio_file = f"{root}_16k.wav"

	    # Argument list (no shell) so quotes or shell metacharacters in an
	    # uploaded file name are never interpreted; -y avoids the interactive
	    # overwrite prompt when the target already exists.
	    subprocess.run(
	        [
	            "ffmpeg", "-y",
	            "-i", source,
	            "-ar", "16000",
	            "-ac", "1",
	            "-c:a", "pcm_s16le",
	            audio_file,
	        ],
	        check=True,
	    )
	    return audio_file

	# Build the speech-recognition pipeline once at import time.
	# The target device is passed through ``device=``: transformers pipelines
	# do not provide a ``.to()`` method, so chaining ``.to(device)`` fails.
	pipe = pipeline(
	    "automatic-speech-recognition",
	    model="NbAiLab/nb-whisper-large",
	    chunk_length_s=30,
	    torch_dtype=dtype,
	    device=device,
	)

	@spaces.GPU()
	def transcribe_audio(audio_file, batch_size=16):
	    """Transcribe an audio file and report timing and resource usage.

	    Args:
	        audio_file: Audio input handed unchanged to the module-level ``pipe``.
	        batch_size: Batch size forwarded to ``pipe`` (default 16).

	    Returns:
	        A ``(text, system_info)`` tuple: the stripped transcription and a
	        multi-line string with memory, processing time, word count, GPU and
	        CPU figures.
	    """
	    bytes_per_gb = 1024 * 1024 * 1024

	    start_time = time.time()
	    outputs = pipe(
	        audio_file,
	        batch_size=batch_size,
	        return_timestamps=False,
	        generate_kwargs={'task': 'transcribe', 'language': 'no'},
	    )
	    text = outputs["text"]
	    output_time = time.time() - start_time

	    word_count = len(text.split())

	    # Host memory and GPU readings; only the first reported GPU is shown.
	    memory = psutil.virtual_memory()
	    gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
	    gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
	    gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0

	    # Sampled over one second, so this call blocks for that long.
	    cpu_usage = psutil.cpu_percent(interval=1)

	    # NOTE(review): the trailing "*" on the memory line is kept verbatim; it
	    # looks like the remnant of Markdown emphasis — confirm whether it is wanted.
	    system_info = f"""
	Memory: {memory.total / bytes_per_gb:.2f}GB, used: {memory.percent}%, available: {memory.available / bytes_per_gb:.2f}GB.*
	Processing time: {output_time:.2f} seconds.
	Number of words: {word_count}
	GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}
	CPU Usage: {cpu_usage}%
	"""

	    return text.strip(), system_info

	# ------------summary section------------
	def clean_text(text):
	    """Strip URLs and punctuation from *text* and collapse whitespace.

	    This is pure string processing, so it is intentionally not wrapped in
	    ``@spaces.GPU()``.

	    Args:
	        text: Input string.

	    Returns:
	        The text without http(s) URLs, without characters that are neither
	        word characters nor whitespace, and with every whitespace run reduced
	        to a single space (leading/trailing whitespace removed).
	    """
	    # Remove each URL up to the next whitespace. The previous pattern only
	    # matched a scheme followed by exactly one character and a line break,
	    # so URLs were left in place and later mangled by the punctuation pass.
	    text = re.sub(r'https?://\S+', '', text)
	    text = re.sub(r'[^\w\s]', '', text)
	    return re.sub(r'\s+', ' ', text).strip()

	# Blank Norwegian Bokmål ("nb") pipeline with only a rule-based sentence splitter.
	nlp = spacy.blank("nb")
	nlp.add_pipe('sentencizer')
	# Read the stop words from the language defaults instead of reaching into
	# ``spacy.lang.nb``, which is only importable as an attribute because
	# ``spacy.blank("nb")`` happened to load that submodule first.
	spacy_stop_words = nlp.Defaults.stop_words

	def preprocess_text(text):
	    """Return *text* with Norwegian stop words removed.

	    The text is tokenised with the module-level ``nlp`` pipeline; tokens whose
	    lower-cased form is in ``spacy_stop_words`` are dropped and the remaining
	    tokens are joined with single spaces. Tokenisation is CPU-only, so the
	    function is intentionally not wrapped in ``@spaces.GPU()``.

	    Args:
	        text: Input string.

	    Returns:
	        The space-joined tokens that are not stop words.
	    """
	    doc = nlp(text)
	    # Use spaCy's stop-word set directly.
	    stop_words = spacy_stop_words
	    return ' '.join(
	        token.text for token in doc if token.text.lower() not in stop_words
	    )

	# Summarize w/T5 model
	@spaces.GPU()
	def summarize_text(text):
	    """Generate an abstractive summary of *text* with a seq2seq model.

	    The text is first stripped of stop words via ``preprocess_text``, then
	    tokenised (truncated to 1024 tokens) and summarised with 5-beam search,
	    limited to 150 generated tokens.

	    NOTE(review): ``summarization_tokenizer`` and ``summarization_model`` are
	    not defined in the visible part of this file — confirm they are created
	    elsewhere before this function is wired into the UI.

	    Args:
	        text: Input string to summarise.

	    Returns:
	        The decoded summary with special tokens removed.
	    """
	    encoded = summarization_tokenizer(
	        preprocess_text(text),
	        max_length=1024,
	        return_tensors="pt",
	        truncation=True,
	    )
	    encoded = encoded.to(device)
	    generated = summarization_model.generate(
	        encoded.input_ids,
	        num_beams=5,
	        max_length=150,
	        early_stopping=True,
	    )
	    best = generated[0]
	    return summarization_tokenizer.decode(best, skip_special_tokens=True)

	def build_similarity_matrix(sentences, stop_words):
	    """Build an undirected sentence graph weighted by shared tokens.

	    Args:
	        sentences: Sequence of token lists, one per sentence.
	        stop_words: Unused; kept so existing callers keep working.

	    Returns:
	        An ``nx.Graph`` with one node per sentence index and, for every pair
	        of distinct sentences, an edge whose ``weight`` is the number of
	        distinct tokens the two sentences have in common.
	    """
	    # Build each token set once instead of once per pair.
	    token_sets = [set(tokens) for tokens in sentences]
	    count = len(token_sets)

	    similarity_matrix = nx.Graph()
	    # Register every sentence up front so a lone sentence still gets a node
	    # (and therefore a score) even though it has no edges.
	    similarity_matrix.add_nodes_from(range(count))

	    # The graph is undirected, so each unordered pair is visited once.
	    for i in range(count):
	        for j in range(i + 1, count):
	            common_words = token_sets[i] & token_sets[j]
	            similarity_matrix.add_edge(i, j, weight=len(common_words))
	    return similarity_matrix

	# PageRank
	def graph_based_summary(text, num_paragraphs=3):
	    """Extractive summary ranked by PageRank over a token-overlap graph.

	    Args:
	        text: Input string to summarise.
	        num_paragraphs: Number of sentences to keep (default 3).

	    Returns:
	        The selected sentences joined with spaces, highest score first. When
	        the text has fewer sentences than ``num_paragraphs``, all sentences
	        are returned in their original order.
	    """
	    sentences = [span.text for span in nlp(text).sents]
	    if len(sentences) < num_paragraphs:
	        return ' '.join(sentences)

	    stop_words = spacy_stop_words
	    # Tokenise each sentence on its own and drop stop words.
	    filtered_tokens = []
	    for sentence in sentences:
	        kept = [tok.text for tok in nlp(sentence) if tok.text.lower() not in stop_words]
	        filtered_tokens.append(kept)

	    graph = build_similarity_matrix(filtered_tokens, stop_words)
	    scores = nx.pagerank(graph)

	    # Sort (score, sentence) pairs in descending order.
	    scored = [(scores[idx], sentence) for idx, sentence in enumerate(sentences)]
	    scored.sort(reverse=True)
	    top = scored[:num_paragraphs]
	    return ' '.join(sentence for _, sentence in top)

	# LexRank
	def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
	    """Extractive summary ranked by PageRank over thresholded TF-IDF cosine similarity.

	    Args:
	        text: Input string to summarise.
	        num_paragraphs: Number of sentences to keep (default 3).
	        threshold: Similarities below this value are set to 0 before ranking
	            (default 0.1).

	    Returns:
	        The selected sentences joined with spaces, highest score first. When
	        the text has fewer sentences than ``num_paragraphs``, all sentences
	        are returned in their original order. When no vocabulary can be built
	        (e.g. only stop words), the leading sentences are returned instead.
	    """
	    doc = nlp(text)
	    sentences = [sent.text for sent in doc.sents]
	    if len(sentences) < num_paragraphs:
	        return ' '.join(sentences)  # Single string, not a list

	    stop_words = spacy_stop_words
	    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
	    try:
	        X = vectorizer.fit_transform(sentences)
	    except ValueError:
	        # scikit-learn raises ValueError ("empty vocabulary") when every
	        # sentence consists solely of stop words or punctuation; there is
	        # nothing to rank, so fall back to the leading sentences.
	        return ' '.join(sentences[:max(num_paragraphs, 0)])
	    similarity_matrix = cosine_similarity(X, X)

	    # Apply the threshold to the similarity matrix.
	    similarity_matrix[similarity_matrix < threshold] = 0
	    nx_graph = nx.from_numpy_array(similarity_matrix)
	    scores = nx.pagerank(nx_graph)
	    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
	    return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])

	# TextRank
	def text_rank_summary(text, num_paragraphs=3):
	    """Extractive summary ranked by PageRank over TF-IDF cosine similarity.

	    Args:
	        text: Input string to summarise.
	        num_paragraphs: Number of sentences to keep (default 3).

	    Returns:
	        The selected sentences joined with spaces, highest score first. When
	        the text has fewer sentences than ``num_paragraphs``, all sentences
	        are returned in their original order. When no vocabulary can be built
	        (e.g. only stop words), the leading sentences are returned instead.
	    """
	    doc = nlp(text)
	    sentences = [sent.text for sent in doc.sents]
	    if len(sentences) < num_paragraphs:
	        return ' '.join(sentences)

	    stop_words = spacy_stop_words
	    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
	    try:
	        X = vectorizer.fit_transform(sentences)
	    except ValueError:
	        # scikit-learn raises ValueError ("empty vocabulary") when every
	        # sentence consists solely of stop words or punctuation; there is
	        # nothing to rank, so fall back to the leading sentences.
	        return ' '.join(sentences[:max(num_paragraphs, 0)])
	    similarity_matrix = cosine_similarity(X, X)

	    nx_graph = nx.from_numpy_array(similarity_matrix)
	    scores = nx.pagerank(nx_graph)
	    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
	    return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])


	# Save text+summary/PDF
	def _latin1_safe(value):
	    """Return *value* with every character outside Latin-1 replaced by ``?``."""
	    return value.encode("latin-1", "replace").decode("latin-1")


	def save_to_pdf(text, summary):
	    """Write the transcription and/or its summary to ``transcription.pdf``.

	    Args:
	        text: Transcribed text; skipped when empty.
	        summary: Summary text; skipped when empty.

	    Returns:
	        The path of the written PDF file (``"transcription.pdf"``).
	    """
	    pdf = FPDF()
	    pdf.add_page()
	    pdf.set_font("Arial", size=12)

	    # FPDF's built-in fonts only cover Latin-1. Characters outside it (emoji,
	    # typographic dashes/quotes, ...) would abort PDF generation with an
	    # encoding error, so they are replaced first. Norwegian letters (æ, ø, å)
	    # are part of Latin-1 and are kept as they are.
	    if text:
	        pdf.multi_cell(0, 10, _latin1_safe("Text:\n" + text))

	    pdf.ln(10)  # Paragraph space

	    if summary:
	        pdf.multi_cell(0, 10, _latin1_safe("Summary:\n" + summary))

	    pdf_output_path = "transcription.pdf"
	    pdf.output(pdf_output_path)
	    return pdf_output_path

	def _transcribe_upload(audio_path):
	    """Convert an uploaded recording to WAV and transcribe it.

	    Returns the ``(text, system_info)`` pair from ``transcribe_audio``, or two
	    empty strings when no file was supplied.
	    """
	    if not audio_path:
	        return "", ""
	    return transcribe_audio(convert_to_wav(audio_path))


	def _summaries_to_pdf(*summaries):
	    """Write every non-empty summary to the PDF, separated by blank lines."""
	    return save_to_pdf("", "\n\n".join(s for s in summaries if s))


	# ------------user interface------------
	iface = gr.Blocks()

	with iface:

	    gr.HTML(SIDEBAR_INFO)
	    gr.Markdown(HEADER_INFO)

	    with gr.Tabs():

	        # Transcription tab. It defines ``text_output``, which the summary and
	        # PDF tabs below read from; without it the UI fails with NameError
	        # while it is being built.
	        with gr.TabItem("Transcribe"):
	            audio_input = gr.Audio(type="filepath", label="Audio File")
	            transcribe_button = gr.Button("Transcribe")
	            text_output = gr.Textbox(label="Transcribed Text")
	            system_info_output = gr.Textbox(label="System Info")

	            transcribe_button.click(fn=_transcribe_upload, inputs=[audio_input], outputs=[text_output, system_info_output])

	        with gr.TabItem("Summary | PageRank"):
	            text_input_graph = gr.Textbox(label="Input Text", placeholder="txt2summarize")
	            summary_output_graph = gr.Textbox(label="PageRank | token-based similarity")

	            gr.Markdown("""
	token-based: similarity matrix edge weights representing token overlap/
	ranked by their centrality in the graph (good with dense inter-sentence relationships)
	""")
	            gr.Markdown("""
	Bjørn: gir sammendrag som fanger opp de mest relevante setninger i teksten
	""")

	            summarize_transcribed_button_graph = gr.Button("Summary of Transcribed Text, Click Here")
	            summarize_transcribed_button_graph.click(fn=graph_based_summary, inputs=[text_output], outputs=[summary_output_graph])
	            summarize_uploaded_button_graph = gr.Button("Upload Text to Summarize, Click Here")
	            summarize_uploaded_button_graph.click(fn=graph_based_summary, inputs=[text_input_graph], outputs=[summary_output_graph])

	        with gr.TabItem("Summary | LexRank"):
	            text_input_lex = gr.Textbox(label="Input Text", placeholder="txt2summarize")
	            summary_output_lex = gr.Textbox(label="LexRank | cosine similarity")

	            gr.Markdown("""
	semantic: TF-IDF vectorization@cosine similarity matrix, ranked by eigenvector centrality.
	(good for sparse graph structures with thresholding)
	""")
	            gr.Markdown("""
	Bjørn: gir sammendrag som best fanger opp betydningen av hele teksten
	""")

	            summarize_transcribed_button_lex = gr.Button("Summary of Transcribed Text, Click Here")
	            summarize_transcribed_button_lex.click(fn=lex_rank_summary, inputs=[text_output], outputs=[summary_output_lex])
	            summarize_uploaded_button_lex = gr.Button("Upload Text to Summarize, Click Here")
	            summarize_uploaded_button_lex.click(fn=lex_rank_summary, inputs=[text_input_lex], outputs=[summary_output_lex])

	        with gr.TabItem("Summary | TextRank"):
	            text_input_text_rank = gr.Textbox(label="Input Text", placeholder="txt2summarize")
	            summary_output_text_rank = gr.Textbox(label="TextRank | lexical similarity")

	            gr.Markdown("""
	sentence: graph with weighted edges based on lexical similarity. (i.e" "sentence similarity"word overlap)/sentence similarity
	""")
	            gr.Markdown("""
	Bjørn: sammendrag basert på i de setningene som ligner mest på hverandre fra teksten

	""")

	            summarize_transcribed_button_text_rank = gr.Button("Summary of Transcribed Text, Click Here")
	            summarize_transcribed_button_text_rank.click(fn=text_rank_summary, inputs=[text_output], outputs=[summary_output_text_rank])
	            summarize_uploaded_button_text_rank = gr.Button("Upload Text to Summarize, Click Here")
	            summarize_uploaded_button_text_rank.click(fn=text_rank_summary, inputs=[text_input_text_rank], outputs=[summary_output_text_rank])

	        with gr.TabItem("Download PDF"):
	            pdf_text_only = gr.Button("Download PDF with Text Only")
	            pdf_summary_only = gr.Button("Download PDF with Summary Only")
	            pdf_both = gr.Button("Download PDF with Both")

	            pdf_output = gr.File(label="Download PDF")

	            pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[text_output], outputs=[pdf_output])
	            # Three summary boxes are passed in, so the callback must accept
	            # three values; every non-empty summary is included in the PDF.
	            pdf_summary_only.click(fn=_summaries_to_pdf, inputs=[summary_output_graph, summary_output_lex, summary_output_text_rank], outputs=[pdf_output])
	            # Defaults to the graph-based (PageRank) summary.
	            pdf_both.click(fn=save_to_pdf, inputs=[text_output, summary_output_graph], outputs=[pdf_output])

	iface.launch(share=True, debug=True)