Spaces:

camparchimedes
/

nb

Build error

App Files Files

nb / app.py

camparchimedes

Update app.py

de0e923 verified about 1 year ago

raw

history blame

8.48 kB

	import os
	import re
	import time
	import warnings

	import gradio as gr
	import networkx as nx
	import nltk
	import numpy as np
	import pandas as pd
	import torch
	import torchaudio
	from fpdf import FPDF
	from huggingface_hub import model_info
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	from PIL import Image
	from pydub import AudioSegment
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
	from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor

	# Silence every warning category for the whole process
	warnings.filterwarnings("ignore")

	# Fetch the NLTK resources used below (tokenizer models and stop-word lists),
	# then cache the Norwegian stop words as a set
	for _nltk_resource in ('punkt', 'stopwords'):
	    nltk.download(_nltk_resource, quiet=True)
	stop_words = set(stopwords.words('norwegian'))


	# Convert m4a audio to wav format
	def convert_to_wav(audio_file):
	    """Decode an m4a file with pydub and export it as a WAV file.

	    Args:
	        audio_file: Path to the m4a input file.

	    Returns:
	        Path to the WAV file that was written. Every call writes to its own
	        uniquely named file in the system temp directory, so overlapping
	        requests cannot overwrite each other's audio. The caller is
	        responsible for deleting the file when done with it.
	    """
	    import tempfile  # local import keeps this block self-contained

	    audio = AudioSegment.from_file(audio_file, format="m4a")

	    # mkstemp creates the file atomically; only the path is needed because
	    # pydub reopens it by name, so the descriptor is closed right away.
	    fd, wav_file = tempfile.mkstemp(suffix=".wav")
	    os.close(fd)

	    audio.export(wav_file, format="wav")
	    return wav_file

	# Run on the GPU when torch reports one, otherwise fall back to the CPU
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Load the Whisper processor and model from the same checkpoint and move the
	# model onto the selected device
	_WHISPER_CHECKPOINT = "NbAiLab/nb-whisper-large"
	whisper_processor = WhisperProcessor.from_pretrained(_WHISPER_CHECKPOINT)
	whisper_model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_CHECKPOINT)
	whisper_model = whisper_model.to(device)

	def transcribe_audio(audio_file):
	    """Transcribe an audio file with the Whisper model.

	    Args:
	        audio_file: Path to the audio file. Paths ending in ``.m4a`` are first
	            converted to WAV. ``None`` or an empty path (no upload) is
	            accepted and yields an empty transcription.

	    Returns:
	        A ``(text, result)`` tuple: the decoded transcription and a short
	        report with the elapsed time and the word count.
	    """
	    if not audio_file:
	        return "", "No audio file provided."

	    converted_file = None
	    if audio_file.lower().endswith(".m4a"):
	        converted_file = convert_to_wav(audio_file)
	        audio_file = converted_file

	    start_time = time.time()

	    try:
	        # torchaudio returns a (channels, frames) tensor and the file's own rate
	        waveform, sample_rate = torchaudio.load(audio_file)
	    finally:
	        # The converted WAV is only an intermediate; drop it once it is read
	        if converted_file and os.path.exists(converted_file):
	            os.remove(converted_file)

	    # Down-mix to a single channel so a stereo file is not treated as a batch
	    if waveform.dim() > 1:
	        waveform = waveform.mean(dim=0)

	    # The feature extractor rejects audio that is not at its own sampling rate
	    target_rate = whisper_processor.feature_extractor.sampling_rate
	    if sample_rate != target_rate:
	        waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)

	    # NOTE(review): the Whisper feature extractor presumably pads/truncates to
	    # one fixed-length window, so long recordings may be cut off - confirm and
	    # add chunking if full-length transcription is required.
	    input_features = whisper_processor(
	        waveform.numpy(), sampling_rate=target_rate, return_tensors="pt"
	    ).input_features.to(device)

	    # Inference only: no gradients needed
	    with torch.no_grad():
	        output = whisper_model.generate(input_features=input_features)

	    text = whisper_processor.batch_decode(output, skip_special_tokens=True)[0]

	    output_time = time.time() - start_time
	    result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {len(text.split())}"

	    return text, result

	# Clean and preprocess text for summarization
	def clean_text(text):
	    """Strip URLs and punctuation from text and normalise whitespace.

	    Args:
	        text: Raw input string.

	    Returns:
	        The text with http(s) URLs removed, every character that is neither a
	        word character nor whitespace removed, runs of whitespace collapsed to
	        a single space, and leading/trailing whitespace trimmed.
	    """
	    # URLs must go first: once punctuation is stripped they are no longer
	    # recognisable. \S+ consumes the whole URL up to the next whitespace.
	    text = re.sub(r'https?://\S+', '', text)
	    text = re.sub(r'[^\w\s]', '', text)
	    text = re.sub(r'\s+', ' ', text).strip()
	    return text

	def preprocess_text(text):
	    """Remove Norwegian stop words from text.

	    The text is tokenized with NLTK's Norwegian word tokenizer; tokens whose
	    lower-cased form is a Norwegian stop word are dropped and the remaining
	    tokens are joined with single spaces.
	    """
	    norwegian_stop_words = set(stopwords.words('norwegian'))
	    kept_tokens = []
	    for token in word_tokenize(text, language="norwegian"):
	        if token.lower() in norwegian_stop_words:
	            continue
	        kept_tokens.append(token)
	    return ' '.join(kept_tokens)

	# Summarize text using the T5 model
	def summarize_text(text):
	    """Generate an abstractive summary of text with the summarization model.

	    The text is stripped of stop words, tokenized (truncated to 1024 tokens),
	    and fed to beam search with 5 beams and a maximum output length of 150.
	    The first generated sequence is decoded and returned as a string.
	    """
	    # NOTE(review): summarization_tokenizer and summarization_model are not
	    # defined in the visible part of this file - confirm where they are loaded.
	    encoded = summarization_tokenizer(
	        preprocess_text(text),
	        max_length=1024,
	        return_tensors="pt",
	        truncation=True,
	    ).to(device)
	    generated = summarization_model.generate(
	        encoded.input_ids,
	        num_beams=5,
	        max_length=150,
	        early_stopping=True,
	    )
	    best_sequence = generated[0]
	    return summarization_tokenizer.decode(best_sequence, skip_special_tokens=True)

	# Build similarity matrix for graph-based summary
	def build_similarity_matrix(sentences, stop_words):
	    """Build an undirected word-overlap graph over tokenized sentences.

	    Args:
	        sentences: Sequence of token lists, one per sentence.
	        stop_words: Unused; kept so existing callers keep working.

	    Returns:
	        A ``networkx.Graph`` with one node per sentence index and, for every
	        pair of distinct sentences, an edge whose ``weight`` is the number of
	        distinct tokens the two sentences share.
	    """
	    similarity_matrix = nx.Graph()

	    # Register every sentence up front so that inputs with zero or one
	    # sentence still produce a node (and therefore a PageRank score) each.
	    similarity_matrix.add_nodes_from(range(len(sentences)))

	    # Build each token set once instead of once per pair
	    token_sets = [set(tokens) for tokens in sentences]

	    # The graph is undirected, so each unordered pair is visited only once
	    for i, tokens_a in enumerate(token_sets):
	        for j in range(i + 1, len(token_sets)):
	            common_words = tokens_a & token_sets[j]
	            similarity_matrix.add_edge(i, j, weight=len(common_words))
	    return similarity_matrix

	# Graph-based summarization
	def graph_based_summary(text, num_paragraphs=3):
	    """Summarize text by ranking sentences on a word-overlap graph.

	    Sentences are scored with PageRank over a graph whose edge weights are
	    the number of non-stop-word tokens two sentences share.

	    Args:
	        text: Text to summarize. Empty or ``None`` yields an empty string.
	        num_paragraphs: Number of top-ranked sentences to keep.

	    Returns:
	        A single string. When the text has fewer sentences than
	        ``num_paragraphs`` all sentences are returned unchanged; otherwise the
	        highest-scoring sentences, joined by spaces in descending score order.
	    """
	    if not text or not text.strip():
	        return ""

	    # Use the Norwegian models, matching how the rest of the file tokenizes
	    sentences = nltk.sent_tokenize(text, language="norwegian")
	    if len(sentences) < num_paragraphs:
	        # Always return a string so the result can be shown in a text box
	        return ' '.join(sentences)

	    stop_words = set(stopwords.words('norwegian'))
	    filtered_tokens = [
	        [word for word in word_tokenize(sent, language="norwegian") if word.lower() not in stop_words]
	        for sent in sentences
	    ]
	    similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words)

	    scores = nx.pagerank(similarity_matrix)
	    # .get guards against sentences that ended up without a node in the graph
	    ranked_sentences = sorted(
	        ((scores.get(i, 0.0), sent) for i, sent in enumerate(sentences)),
	        reverse=True,
	    )
	    return ' '.join(sent for _, sent in ranked_sentences[:num_paragraphs])

	# LexRank summarization
	def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
	    """Summarize text with PageRank over a thresholded TF-IDF similarity graph.

	    Args:
	        text: Text to summarize. Empty or ``None`` yields an empty string.
	        num_paragraphs: Number of top-ranked sentences to keep.
	        threshold: Cosine similarities below this value are set to zero
	            before the graph is built.

	    Returns:
	        A single string. When the text has fewer sentences than
	        ``num_paragraphs`` all sentences are returned unchanged; otherwise the
	        highest-scoring sentences, joined by spaces in descending score order.
	    """
	    if not text or not text.strip():
	        return ""

	    sentences = nltk.sent_tokenize(text, language="norwegian")
	    if len(sentences) < num_paragraphs:
	        # Always return a string so the result can be shown in a text box
	        return ' '.join(sentences)

	    limit = max(num_paragraphs, 0)

	    stop_words = set(stopwords.words('norwegian'))
	    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
	    try:
	        X = vectorizer.fit_transform(sentences)
	    except ValueError:
	        # Raised when no vocabulary is left (e.g. only stop words); there is
	        # nothing to rank on, so fall back to the leading sentences.
	        return ' '.join(sentences[:limit])
	    similarity_matrix = cosine_similarity(X, X)

	    # Apply threshold to the similarity matrix
	    similarity_matrix[similarity_matrix < threshold] = 0
	    nx_graph = nx.from_numpy_array(similarity_matrix)
	    scores = nx.pagerank(nx_graph)
	    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
	    return ' '.join(s for _, s in ranked_sentences[:limit])

	# TextRank summarization
	def text_rank_summary(text, num_paragraphs=3):
	    """Summarize text with PageRank over a TF-IDF cosine-similarity graph.

	    Args:
	        text: Text to summarize. Empty or ``None`` yields an empty string.
	        num_paragraphs: Number of top-ranked sentences to keep.

	    Returns:
	        A single string. When the text has fewer sentences than
	        ``num_paragraphs`` all sentences are returned unchanged; otherwise the
	        highest-scoring sentences, joined by spaces in descending score order.
	    """
	    if not text or not text.strip():
	        return ""

	    sentences = nltk.sent_tokenize(text, language="norwegian")
	    if len(sentences) < num_paragraphs:
	        # Always return a string so the result can be shown in a text box
	        return ' '.join(sentences)

	    limit = max(num_paragraphs, 0)

	    stop_words = set(stopwords.words('norwegian'))
	    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
	    try:
	        X = vectorizer.fit_transform(sentences)
	    except ValueError:
	        # Raised when no vocabulary is left (e.g. only stop words); there is
	        # nothing to rank on, so fall back to the leading sentences.
	        return ' '.join(sentences[:limit])
	    similarity_matrix = cosine_similarity(X, X)

	    nx_graph = nx.from_numpy_array(similarity_matrix)
	    scores = nx.pagerank(nx_graph)
	    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
	    return ' '.join(s for _, s in ranked_sentences[:limit])

	# Save text and summary to PDF
	def save_to_pdf(text, summary):
	    """Write the transcription and/or its summary to a one-font PDF.

	    Args:
	        text: Transcribed text; skipped when empty.
	        summary: Summary text; skipped when empty.

	    Returns:
	        Path of the written PDF file (``transcription.pdf``).
	    """

	    def _to_latin1(value):
	        # The built-in Arial font only covers Latin-1. Characters outside it
	        # (curly quotes, dashes, ...) would make the PDF writer fail, so they
	        # are replaced with "?" instead. Norwegian letters are unaffected.
	        return value.encode("latin-1", "replace").decode("latin-1")

	    pdf = FPDF()
	    pdf.add_page()
	    pdf.set_font("Arial", size=12)

	    if text:
	        pdf.multi_cell(0, 10, "Text:\n" + _to_latin1(text))

	    pdf.ln(10)  # Paragraph space

	    if summary:
	        pdf.multi_cell(0, 10, "Summary:\n" + _to_latin1(summary))

	    pdf_output_path = "transcription.pdf"
	    pdf.output(pdf_output_path)
	    return pdf_output_path

	# Gradio Interface: one tab for transcription, one per summarization method,
	# and one for exporting the results as a PDF.
	iface = gr.Blocks()

	with iface:
	    gr.HTML("""
	<div style="text-align: center;">
	<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/raw/main/banner_trans.png" alt="" width="100%" height="auto">
	</div>
	""")
	    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")

	    with gr.Tabs():
	        with gr.TabItem("Transcription"):
	            audio_input = gr.Audio(type="filepath")
	            text_output = gr.Textbox(label="Text")
	            result_output = gr.Textbox(label="Time taken and Number of words")
	            transcribe_button = gr.Button("Transcribe")

	            transcribe_button.click(fn=transcribe_audio, inputs=[audio_input], outputs=[text_output, result_output])

	        # Each summary tab reads the transcription box and writes to its own
	        # output box; distinct names keep the three outputs addressable.
	        with gr.TabItem("Summary | Graph-based"):
	            graph_summary_output = gr.Textbox(label="Summary | Graph-based")
	            graph_summarize_button = gr.Button("Summarize")

	            graph_summarize_button.click(fn=graph_based_summary, inputs=[text_output], outputs=[graph_summary_output])

	        with gr.TabItem("Summary | LexRank"):
	            lexrank_summary_output = gr.Textbox(label="Summary | LexRank")
	            lexrank_summarize_button = gr.Button("Summarize")

	            lexrank_summarize_button.click(fn=lex_rank_summary, inputs=[text_output], outputs=[lexrank_summary_output])

	        with gr.TabItem("Summary | TextRank"):
	            textrank_summary_output = gr.Textbox(label="Summary | TextRank")
	            textrank_summarize_button = gr.Button("Summarize")

	            textrank_summarize_button.click(fn=text_rank_summary, inputs=[text_output], outputs=[textrank_summary_output])

	        with gr.TabItem("Download PDF"):
	            pdf_text_only = gr.Button("Download PDF with Text Only")
	            pdf_summary_only = gr.Button("Download PDF with Summary Only")
	            pdf_both = gr.Button("Download PDF with Both")

	            pdf_output = gr.File(label="Download PDF")

	            # The PDF export uses the TextRank summary box, which is the box
	            # the export has always been wired to.
	            pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[text_output], outputs=[pdf_output])
	            pdf_summary_only.click(fn=lambda summary: save_to_pdf("", summary), inputs=[textrank_summary_output], outputs=[pdf_output])
	            pdf_both.click(fn=save_to_pdf, inputs=[text_output, textrank_summary_output], outputs=[pdf_output])

	iface.launch(share=True, debug=True)