Spaces:

camparchimedes
/

nb

Build error

App Files Files

nb / app.py

camparchimedes

Update app.py

351252d verified 10 months ago

raw

history blame

10.3 kB

	"""
	Version: 4th_pruned_optimized_transcription_app.py

	Description: web app for Norwegian speech transcription with NbAiLab/nb-whisper-large, text summarization, and PDF download.
	"""

	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import time
	import os
	import warnings
	from pydub import AudioSegment
	import torch
	import torchaudio
	from transformers import pipeline
	from huggingface_hub import model_info
	import spacy
	import networkx as nx
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	import pandas as pd
	import numpy as np
	import re
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
	import gradio as gr
	from fpdf import FPDF
	from PIL import Image

	# Silence all Python warnings for the whole process (no category or module
	# filter is given). NOTE(review): this also hides warnings that may point at
	# real problems — consider narrowing the filter.
	warnings.filterwarnings("ignore")

	# Convert m4a audio to wav format
	def convert_to_wav(audio_file):
	    """Decode an m4a source with pydub and re-export it as WAV.

	    Args:
	        audio_file: The m4a audio source handed to ``AudioSegment.from_file``.

	    Returns:
	        The path of the written WAV file, which is always ``"temp.wav"``
	        (relative to the current working directory, overwritten on each call).
	    """
	    target_path = "temp.wav"
	    segment = AudioSegment.from_file(audio_file, format="m4a")
	    segment.export(target_path, format="wav")
	    return target_path


	# Define model
	MODEL_NAME = "NbAiLab/nb-whisper-large"
	lang = "no"

	# Device handed to the pipeline: CUDA device index 0 when available, else CPU.
	device = 0 if torch.cuda.is_available() else "cpu"

	# Speech-recognition pipeline; long audio is processed in 30-second chunks.
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    model=MODEL_NAME,
	    chunk_length_s=30,
	    device=device,
	)

	# Pin decoding to transcription in the configured language.
	# `get_decoder_prompt_ids` returns a list of (position, token_id) pairs, which
	# is the value expected by `forced_decoder_ids`; it is not a pad token id.
	# The checkpoint's own eos/pad token ids are deliberately left untouched:
	# overriding them with arbitrary ids changes where generation stops.
	# NOTE(review): recent transformers releases prefer passing
	# `generate_kwargs={"language": ..., "task": ...}` at call time — confirm
	# against the pinned transformers version.
	pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
	    language=lang, task="transcribe"
	)


	# Transcribe audio
	def transcribe_audio(audio_file):
	    """Transcribe an audio file and report timing statistics.

	    Args:
	        audio_file: Path of the audio file to transcribe. ``.m4a`` files are
	            converted to WAV first. May be ``None``/empty when nothing was
	            uploaded.

	    Returns:
	        A ``(text, result)`` tuple: the transcription and a human-readable
	        report with elapsed time, audio duration, real-time factor and word
	        count. When no file is given, ``text`` is empty and ``result``
	        explains why.
	    """
	    if not audio_file:
	        return "", "No audio file provided."

	    # Match the extension case-insensitively so ".M4A" uploads are converted too.
	    if audio_file.lower().endswith(".m4a"):
	        audio_file = convert_to_wav(audio_file)

	    # Load using torchaudio
	    waveform, sample_rate = torchaudio.load(audio_file)

	    # The ASR pipeline does not accept a tensor plus a `sampling_rate` keyword.
	    # Raw audio must be a single-channel NumPy array wrapped in a dict that
	    # carries its sampling rate, so down-mix the channels first.
	    mono_audio = waveform.mean(dim=0).numpy()

	    start_time = time.perf_counter()
	    text = pipe({"raw": mono_audio, "sampling_rate": sample_rate})["text"]
	    output_time = time.perf_counter() - start_time

	    # Calculate audio duration (in seconds)
	    audio_duration = waveform.shape[1] / sample_rate

	    # Real-time Factor calculation; undefined for zero-length audio.
	    rtf = output_time / audio_duration if audio_duration > 0 else float("nan")

	    # Format of the result
	    result = (
	        f"Time taken: {output_time:.2f} seconds\n"
	        f"Audio duration: {audio_duration / 60:.2f} minutes ({audio_duration:.2f} seconds)\n"
	        f"Real-time Factor (RTF): {rtf:.2f}\n"
	        f"Number of words: {len(text.split())}\n\n"
	        "Real-time Factor (RTF) is a measure used to evaluate the speed of speech recognition systems. "
	        "It is the ratio of transcription time to the duration of the audio.\n\n"
	        "An RTF of less than 1 means the transcription process is faster than real-time (expected)."
	    )

	    return text, result


	# Clean and preprocess text for summarization
	def clean_text(text):
	    """Strip URLs and punctuation from *text* and normalise whitespace.

	    Args:
	        text: The raw text to clean.

	    Returns:
	        The text without ``http(s)://`` URLs, without any character that is
	        neither a word character nor whitespace, with runs of whitespace
	        collapsed to one space and no leading/trailing whitespace.
	    """
	    # Remove the URL first; otherwise the punctuation pass below would fuse
	    # its pieces into one long pseudo-word.
	    text = re.sub(r'https?://\S+', '', text)
	    text = re.sub(r'[^\w\s]', '', text)
	    return re.sub(r'\s+', ' ', text).strip()

	# Blank Norwegian Bokmål pipeline ('nb' is the code for Norwegian Bokmål).
	nlp = spacy.blank("nb")
	# A blank pipeline sets no sentence boundaries, so iterating `doc.sents`
	# would raise; the rule-based sentencizer provides them.
	nlp.add_pipe("sentencizer")
	# Read the stop words from the language defaults instead of relying on the
	# `spacy.lang.nb` submodule having been imported as a side effect.
	spacy_stop_words = nlp.Defaults.stop_words

	def preprocess_text(text):
	    """Return *text* with stop words removed.

	    The text is tokenised with the module-level ``nlp`` pipeline; a token is
	    dropped when its lower-cased form is in ``spacy_stop_words``.

	    Args:
	        text: The text to filter.

	    Returns:
	        The remaining token texts joined by single spaces.
	    """
	    kept_tokens = (
	        token.text
	        for token in nlp(text)
	        if token.text.lower() not in spacy_stop_words
	    )
	    return ' '.join(kept_tokens)

	# Summarize text using the T5 model
	def summarize_text(text):
	    """Generate an abstractive summary of *text*.

	    The text is stop-word filtered with ``preprocess_text``, tokenised
	    (truncated to 1024 tokens), moved to ``device`` and summarised with
	    5-beam search up to 150 tokens.

	    NOTE(review): ``summarization_tokenizer`` and ``summarization_model`` are
	    not defined in the visible part of this file — confirm they are created
	    elsewhere, otherwise calling this function raises ``NameError``.

	    Args:
	        text: The text to summarise.

	    Returns:
	        The decoded summary with special tokens removed.
	    """
	    encoded = summarization_tokenizer(
	        preprocess_text(text),
	        max_length=1024,
	        return_tensors="pt",
	        truncation=True,
	    )
	    encoded = encoded.to(device)
	    generated_ids = summarization_model.generate(
	        encoded.input_ids,
	        num_beams=5,
	        max_length=150,
	        early_stopping=True,
	    )
	    best_sequence = generated_ids[0]
	    return summarization_tokenizer.decode(best_sequence, skip_special_tokens=True)

	# Build similarity matrix for graph-based summary
	def build_similarity_matrix(sentences, stop_words):
	    """Build an undirected word-overlap graph between sentences.

	    Args:
	        sentences: A sequence of token lists, one per sentence.
	        stop_words: Unused; kept so existing callers keep working (callers
	            are expected to filter stop words beforehand).

	    Returns:
	        A ``networkx.Graph`` with one node per sentence index and, for every
	        pair of distinct sentences, an edge whose ``weight`` is the number of
	        distinct tokens the two sentences share.
	    """
	    from itertools import combinations

	    # Build each token set once instead of once per pair.
	    token_sets = [set(tokens) for tokens in sentences]

	    graph = nx.Graph()
	    # Add the nodes explicitly so a single sentence still yields a node.
	    graph.add_nodes_from(range(len(token_sets)))

	    # The graph is undirected, so visiting each unordered pair once suffices.
	    for (i, tokens_a), (j, tokens_b) in combinations(enumerate(token_sets), 2):
	        graph.add_edge(i, j, weight=len(tokens_a & tokens_b))
	    return graph

	# Graph-based summarization
	def graph_based_summary(text, num_paragraphs=3):
	    """Summarise *text* by ranking sentences on a word-overlap graph.

	    Sentences are taken from ``nlp(text).sents`` (the pipeline must therefore
	    set sentence boundaries), stop words are removed, a graph weighted by
	    shared tokens is built and PageRank scores pick the top sentences.

	    Args:
	        text: The text to summarise.
	        num_paragraphs: Number of sentences to keep.

	    Returns:
	        A single string: the ``num_paragraphs`` highest-ranked sentences
	        joined by spaces, or the whole text's sentences joined by spaces when
	        there are fewer than ``num_paragraphs`` of them.
	    """
	    doc = nlp(text)
	    sentences = [sent.text for sent in doc.sents]
	    if len(sentences) < num_paragraphs:
	        # Always return a string so callers get one consistent type.
	        return ' '.join(sentences)

	    filtered_tokens = [
	        [
	            token.text
	            for token in nlp(sentence)
	            if token.text.lower() not in spacy_stop_words
	        ]
	        for sentence in sentences
	    ]
	    similarity_matrix = build_similarity_matrix(filtered_tokens, spacy_stop_words)

	    scores = nx.pagerank(similarity_matrix)
	    # `.get` guards against a sentence index missing from the graph.
	    ranked_sentences = sorted(
	        ((scores.get(i, 0.0), sent) for i, sent in enumerate(sentences)),
	        reverse=True,
	    )
	    return ' '.join(sent for _, sent in ranked_sentences[:num_paragraphs])

	# LexRank summarization
	def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
	    """Summarise *text* with PageRank over thresholded TF-IDF similarities.

	    Sentences are taken from ``nlp(text).sents`` (the pipeline must therefore
	    set sentence boundaries). Pairwise cosine similarities of their TF-IDF
	    vectors below ``threshold`` are zeroed before ranking.

	    Args:
	        text: The text to summarise.
	        num_paragraphs: Number of sentences to keep.
	        threshold: Similarities below this value are treated as no link.

	    Returns:
	        A single string: the ``num_paragraphs`` highest-ranked sentences
	        joined by spaces, or all sentences joined by spaces when there are
	        fewer than ``num_paragraphs`` of them.
	    """
	    doc = nlp(text)
	    sentences = [sent.text for sent in doc.sents]
	    if len(sentences) < num_paragraphs:
	        # Always return a string so callers get one consistent type.
	        return ' '.join(sentences)

	    vectorizer = TfidfVectorizer(stop_words=list(spacy_stop_words))
	    tfidf = vectorizer.fit_transform(sentences)
	    similarity_matrix = cosine_similarity(tfidf, tfidf)

	    # Apply threshold to the similarity matrix
	    similarity_matrix[similarity_matrix < threshold] = 0

	    scores = nx.pagerank(nx.from_numpy_array(similarity_matrix))
	    ranked_sentences = sorted(
	        ((scores[i], sent) for i, sent in enumerate(sentences)),
	        reverse=True,
	    )
	    return ' '.join(sent for _, sent in ranked_sentences[:num_paragraphs])

	# TextRank summarization
	def text_rank_summary(text, num_paragraphs=3):
	    """Summarise *text* with PageRank over TF-IDF cosine similarities.

	    Sentences are taken from ``nlp(text).sents`` (the pipeline must therefore
	    set sentence boundaries). Every pairwise similarity is kept as an edge
	    weight; no threshold is applied.

	    Args:
	        text: The text to summarise.
	        num_paragraphs: Number of sentences to keep.

	    Returns:
	        A single string: the ``num_paragraphs`` highest-ranked sentences
	        joined by spaces, or all sentences joined by spaces when there are
	        fewer than ``num_paragraphs`` of them.
	    """
	    doc = nlp(text)
	    sentences = [sent.text for sent in doc.sents]
	    if len(sentences) < num_paragraphs:
	        # Always return a string so callers get one consistent type.
	        return ' '.join(sentences)

	    vectorizer = TfidfVectorizer(stop_words=list(spacy_stop_words))
	    tfidf = vectorizer.fit_transform(sentences)
	    similarity_matrix = cosine_similarity(tfidf, tfidf)

	    scores = nx.pagerank(nx.from_numpy_array(similarity_matrix))
	    ranked_sentences = sorted(
	        ((scores[i], sent) for i, sent in enumerate(sentences)),
	        reverse=True,
	    )
	    top_sentences = [sent for _, sent in ranked_sentences[:num_paragraphs]]
	    return ' '.join(top_sentences)


	# Save text and summary to PDF
	def save_to_pdf(text, summary):
	    """Write the transcription and/or its summary to a PDF file.

	    Args:
	        text: The transcription; skipped when empty or ``None``.
	        summary: The summary; skipped when empty or ``None``.

	    Returns:
	        The path of the written PDF, which is always ``"transcription.pdf"``
	        (relative to the current working directory, overwritten on each call).
	    """

	    def _latin1(value):
	        # The built-in "Arial" core font only covers Latin-1; replace anything
	        # else with "?" instead of letting PDF generation fail on characters
	        # such as curly quotes or dashes.
	        return value.encode("latin-1", "replace").decode("latin-1")

	    pdf = FPDF()
	    pdf.add_page()
	    pdf.set_font("Arial", size=12)

	    if text:
	        pdf.multi_cell(0, 10, "Text:\n" + _latin1(text))

	    # Paragraph space, only needed to separate the two sections.
	    if text and summary:
	        pdf.ln(10)

	    if summary:
	        pdf.multi_cell(0, 10, "Summary:\n" + _latin1(summary))

	    pdf_output_path = "transcription.pdf"
	    pdf.output(pdf_output_path)
	    return pdf_output_path

	def _return_img_html_embed(img_url):
	    """Wrap *img_url* in a centred, full-width ``<img>`` element.

	    Args:
	        img_url: The URL placed in the ``src`` attribute.

	    Returns:
	        The HTML snippet as a string.
	    """
	    return (
	        f'<center><img src="{img_url}" alt="Imagerine" style="width:100%; height:auto;"></center>'
	    )


	# Gradio Interface
	def display_image():
	    """Return the HTML snippet that embeds the app's header picture."""
	    # Use the Hub's `/resolve/` route, which serves the raw file. The `/blob/`
	    # route is the HTML file-viewer page and cannot be used as an <img> source.
	    img_url = "https://huggingface.co/spaces/camparchimedes/transcription_app/resolve/main/picture.png"
	    return _return_img_html_embed(img_url)

	def _combine_summaries(graph_summary, lexrank_summary, textrank_summary):
	    """Join every non-empty summary into one labelled string for the PDF."""
	    labelled = (
	        ("Graph-based", graph_summary),
	        ("LexRank", lexrank_summary),
	        ("TextRank", textrank_summary),
	    )
	    return "\n\n".join(f"{label}:\n{body}" for label, body in labelled if body)


	def _pdf_text_only(text):
	    """Create a PDF that contains only the transcription."""
	    return save_to_pdf(text, "")


	def _pdf_summary_only(graph_summary, lexrank_summary, textrank_summary):
	    """Create a PDF that contains only the generated summaries."""
	    return save_to_pdf(
	        "", _combine_summaries(graph_summary, lexrank_summary, textrank_summary)
	    )


	def _pdf_both(text, graph_summary, lexrank_summary, textrank_summary):
	    """Create a PDF with the transcription followed by the summaries."""
	    return save_to_pdf(
	        text, _combine_summaries(graph_summary, lexrank_summary, textrank_summary)
	    )


	iface = gr.Blocks()

	with iface:
	    gr.HTML(display_image())
	    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")

	    with gr.Tabs():
	        with gr.TabItem("Transcription"):
	            audio_input = gr.Audio(type="filepath")
	            text_output = gr.Textbox(label="Text")
	            result_output = gr.Textbox(label="Time taken and Number of words")
	            transcribe_button = gr.Button("Transcribe")

	            transcribe_button.click(
	                fn=transcribe_audio,
	                inputs=[audio_input],
	                outputs=[text_output, result_output],
	            )

	        # Each summary tab gets its own output component. Reusing one variable
	        # name would leave the PDF tab wired to the last tab's textbox only.
	        # The labels use a plain "|": "\|" is an invalid escape sequence that
	        # renders a literal backslash.
	        with gr.TabItem("Summary | Graph-based"):
	            graph_summary_output = gr.Textbox(label="Summary | Graph-based")
	            graph_summarize_button = gr.Button("Summarize")

	            graph_summarize_button.click(
	                fn=graph_based_summary,
	                inputs=[text_output],
	                outputs=[graph_summary_output],
	            )

	        with gr.TabItem("Summary | LexRank"):
	            lexrank_summary_output = gr.Textbox(label="Summary | LexRank")
	            lexrank_summarize_button = gr.Button("Summarize")

	            lexrank_summarize_button.click(
	                fn=lex_rank_summary,
	                inputs=[text_output],
	                outputs=[lexrank_summary_output],
	            )

	        with gr.TabItem("Summary | TextRank"):
	            textrank_summary_output = gr.Textbox(label="Summary | TextRank")
	            textrank_summarize_button = gr.Button("Summarize")

	            textrank_summarize_button.click(
	                fn=text_rank_summary,
	                inputs=[text_output],
	                outputs=[textrank_summary_output],
	            )

	        with gr.TabItem("Download PDF"):
	            pdf_text_only = gr.Button("Download PDF with Text Only")
	            pdf_summary_only = gr.Button("Download PDF with Summary Only")
	            pdf_both = gr.Button("Download PDF with Both")

	            pdf_output = gr.File(label="Download PDF")

	            summary_outputs = [
	                graph_summary_output,
	                lexrank_summary_output,
	                textrank_summary_output,
	            ]

	            pdf_text_only.click(
	                fn=_pdf_text_only,
	                inputs=[text_output],
	                outputs=[pdf_output],
	            )
	            pdf_summary_only.click(
	                fn=_pdf_summary_only,
	                inputs=summary_outputs,
	                outputs=[pdf_output],
	            )
	            pdf_both.click(
	                fn=_pdf_both,
	                inputs=[text_output, *summary_outputs],
	                outputs=[pdf_output],
	            )

	# Only start the server when run as a script, so importing this module
	# (e.g. for testing) does not launch the app.
	if __name__ == "__main__":
	    iface.launch(share=True, debug=True)