nb / app.py
camparchimedes's picture
Update app.py
de0e923 verified
raw
history blame
8.48 kB
import itertools
import os
import re
import tempfile
import time
import warnings

import gradio as gr
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import torch
import torchaudio  # separate package; it is not importable from inside torch
from fpdf import FPDF
from huggingface_hub import model_info
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from PIL import Image
from pydub import AudioSegment
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor
# Silence all library warnings for the lifetime of the process
warnings.filterwarnings("ignore")

# Fetch the NLTK resources this module relies on: the Punkt tokenizer data
# and the stop-word corpus (which provides the Norwegian list read below)
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource, quiet=True)

# Norwegian stop words, built once at import time
stop_words = set(stopwords.words('norwegian'))
# Convert m4a audio to wav format
def convert_to_wav(audio_file):
    """Convert an m4a audio file to WAV and return the path of the WAV file.

    The WAV data is written to a uniquely named temporary file rather than a
    fixed ``temp.wav`` in the working directory, so two conversions running at
    the same time cannot overwrite each other's output.

    Args:
        audio_file: Path to the m4a file to convert.

    Returns:
        Path to the newly created WAV file. The caller owns the file and
        should delete it once it is no longer needed.
    """
    audio = AudioSegment.from_file(audio_file, format="m4a")

    # Reserve a unique path; close the descriptor immediately because the
    # exporter opens the path on its own.
    fd, wav_file = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    try:
        # export() hands back the open output file; close it so the handle
        # is not left dangling until garbage collection.
        audio.export(wav_file, format="wav").close()
    except Exception:
        # Do not leave an empty or partial file behind on failure.
        os.remove(wav_file)
        raise
    return wav_file
# Pick the torch device: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the Whisper processor and model through transformers; both come from
# the same checkpoint, and the model is moved to the selected device
_WHISPER_CHECKPOINT = "NbAiLab/nb-whisper-large"
whisper_processor = WhisperProcessor.from_pretrained(_WHISPER_CHECKPOINT)
whisper_model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_CHECKPOINT)
whisper_model = whisper_model.to(device)
def transcribe_audio(audio_file):
    """Transcribe an audio file with the module-level Whisper model.

    The audio is down-mixed to one channel, resampled to the sampling rate
    the Whisper feature extractor is configured for, and then transcribed one
    model-sized window at a time so that recordings longer than a single
    window are not silently cut off.

    Args:
        audio_file: Path to the audio file. Files ending in ``.m4a`` (any
            letter case) are first converted to WAV with ``convert_to_wav``.
            ``None`` or an empty string (no file supplied) is accepted.

    Returns:
        A ``(text, result)`` tuple: the transcription, and a short report
        with the elapsed time and the number of words. When no file is
        given, ``text`` is empty and ``result`` says so.
    """
    if not audio_file:
        # The UI passes None when the button is pressed without a recording.
        return "", "No audio file provided."

    converted_file = None
    if audio_file.lower().endswith(".m4a"):
        converted_file = convert_to_wav(audio_file)
        audio_file = converted_file

    start_time = time.time()

    try:
        # Load the audio file using torchaudio
        waveform, sample_rate = torchaudio.load(audio_file)
    finally:
        # The intermediate WAV is only needed for loading; remove it so
        # repeated transcriptions do not pile up files.
        if converted_file is not None and os.path.exists(converted_file):
            os.remove(converted_file)

    feature_extractor = whisper_processor.feature_extractor
    target_rate = feature_extractor.sampling_rate

    # torchaudio returns (channels, samples); average the channels so the
    # processor receives a single mono signal instead of one row per channel.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # The processor rejects audio whose rate differs from the one it was
    # configured for, so resample up front.
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)
    samples = waveform.squeeze(0).numpy()

    # Number of samples that fit in one model input window. Windows are cut
    # at fixed positions, so a word that straddles a boundary may be
    # transcribed imperfectly.
    window = feature_extractor.n_samples

    pieces = []
    for start in range(0, len(samples), window):
        chunk = samples[start:start + window]
        # Process the waveform with Whisper's processor
        input_features = whisper_processor(
            chunk, sampling_rate=target_rate, return_tensors="pt"
        ).input_features.to(device)
        # Generate and decode the transcription of this window
        output = whisper_model.generate(input_features=input_features)
        decoded = whisper_processor.batch_decode(output, skip_special_tokens=True)[0]
        pieces.append(decoded.strip())
    text = " ".join(piece for piece in pieces if piece)

    output_time = time.time() - start_time
    result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {len(text.split())}"
    return text, result
# Clean and preprocess text for summarization
def clean_text(text):
    """Strip URLs and punctuation from *text* and normalise its whitespace.

    Only the URL itself is removed. Earlier the pattern swallowed everything
    from the URL to the end of the line, which discarded ordinary words that
    happened to follow a link.

    Args:
        text: The string to clean.

    Returns:
        The text without ``http``/``https`` URLs, without characters that
        are neither word characters nor whitespace, and with every run of
        whitespace collapsed to a single space and the ends trimmed.
    """
    # Drop each URL up to the next whitespace character
    text = re.sub(r'https?://\S+', '', text)
    # Drop punctuation and other symbols (anything that is not \w or \s)
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace runs, including the gaps left by the removals above
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def preprocess_text(text):
    """Tokenize *text* as Norwegian and drop the Norwegian stop words.

    Args:
        text: The string to preprocess.

    Returns:
        The remaining tokens joined by single spaces. Stop-word matching is
        done on the lower-cased token; kept tokens retain their original
        casing.
    """
    tokens = word_tokenize(text, language="norwegian")
    norwegian_stop_words = set(stopwords.words('norwegian'))
    return ' '.join(
        token for token in tokens if token.lower() not in norwegian_stop_words
    )
# Summarize text using the T5 model
def summarize_text(text):
    """Summarize *text* with the sequence-to-sequence summarization model.

    The text is passed through ``preprocess_text``, tokenized (truncated to
    at most 1024 tokens), and a summary of at most 150 tokens is generated
    with a 5-beam search.

    NOTE(review): ``summarization_tokenizer`` and ``summarization_model`` are
    not defined anywhere in the visible part of this file; unless they are
    created elsewhere, calling this function raises ``NameError``.

    Args:
        text: The text to summarize.

    Returns:
        The decoded summary with special tokens removed.
    """
    encoded = summarization_tokenizer(
        preprocess_text(text),
        max_length=1024,
        return_tensors="pt",
        truncation=True,
    ).to(device)
    generated = summarization_model.generate(
        encoded.input_ids,
        num_beams=5,
        max_length=150,
        early_stopping=True,
    )
    best_sequence = generated[0]
    return summarization_tokenizer.decode(best_sequence, skip_special_tokens=True)
# Build similarity matrix for graph-based summary
def build_similarity_matrix(sentences, stop_words):
    """Build a weighted graph of word overlap between tokenized sentences.

    Args:
        sentences: A sequence of token lists, one per sentence.
        stop_words: Unused; kept so existing callers keep working.

    Returns:
        An ``nx.Graph`` with one node per sentence index and, between every
        pair of distinct sentences, an edge whose ``weight`` is the number of
        distinct tokens the two sentences share (possibly 0).
    """
    # Build each sentence's token set once instead of once per comparison
    token_sets = [set(tokens) for tokens in sentences]

    graph = nx.Graph()
    # Register every sentence up front so that a lone sentence, which has no
    # partner to form an edge with, is still present in the graph.
    graph.add_nodes_from(range(len(token_sets)))

    # The graph is undirected, so each unordered pair needs only one edge
    for (i, set_a), (j, set_b) in itertools.combinations(enumerate(token_sets), 2):
        graph.add_edge(i, j, weight=len(set_a & set_b))
    return graph
# Graph-based summarization
def graph_based_summary(text, num_paragraphs=3):
    """Summarize *text* by ranking sentences on a word-overlap graph.

    Sentences are linked by the number of non-stop-word tokens they share
    (see ``build_similarity_matrix``), scored with PageRank, and the
    highest-scoring ones are returned.

    Args:
        text: The text to summarize.
        num_paragraphs: Number of sentences to keep.

    Returns:
        A single string: the ``num_paragraphs`` top-ranked sentences joined
        by spaces, best first. If the text has fewer sentences than
        requested, all of them are returned in their original order.
    """
    # Tokenize as Norwegian, matching preprocess_text
    sentences = nltk.sent_tokenize(text, language="norwegian")
    if len(sentences) < num_paragraphs:
        # Always return a string, never the raw sentence list, so callers
        # get the same type on every path.
        return ' '.join(sentences)

    norwegian_stop_words = set(stopwords.words('norwegian'))
    filtered_tokens = [
        [
            word
            for word in word_tokenize(sent, language="norwegian")
            if word.lower() not in norwegian_stop_words
        ]
        for sent in sentences
    ]

    similarity_matrix = build_similarity_matrix(filtered_tokens, norwegian_stop_words)
    scores = nx.pagerank(similarity_matrix)

    # .get() guards against a sentence index that is absent from the graph
    ranked_sentences = sorted(
        ((scores.get(i, 0.0), sent) for i, sent in enumerate(sentences)),
        reverse=True,
    )
    return ' '.join(sent for _, sent in ranked_sentences[:num_paragraphs])
# LexRank summarization
def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
    """Summarize *text* with PageRank over thresholded TF-IDF similarities.

    Args:
        text: The text to summarize.
        num_paragraphs: Number of sentences to keep.
        threshold: Cosine similarities below this value are set to 0 before
            the sentence graph is built.

    Returns:
        A single string: the ``num_paragraphs`` top-ranked sentences joined
        by spaces, best first. If the text has fewer sentences than
        requested, all of them are returned in their original order.
    """
    # Split as Norwegian, matching the Norwegian stop-word list used below
    sentences = nltk.sent_tokenize(text, language="norwegian")
    if len(sentences) < num_paragraphs:
        # Always return a string, never the raw sentence list, so callers
        # get the same type on every path.
        return ' '.join(sentences)

    norwegian_stop_words = set(stopwords.words('norwegian'))
    vectorizer = TfidfVectorizer(stop_words=list(norwegian_stop_words))
    tfidf = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf, tfidf)

    # Apply threshold to the similarity matrix
    similarity_matrix[similarity_matrix < threshold] = 0

    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(
        ((scores[i], sent) for i, sent in enumerate(sentences)),
        reverse=True,
    )
    return ' '.join(sent for _, sent in ranked_sentences[:num_paragraphs])
# TextRank summarization
def text_rank_summary(text, num_paragraphs=3):
    """Summarize *text* with PageRank over TF-IDF cosine similarities.

    Args:
        text: The text to summarize.
        num_paragraphs: Number of sentences to keep.

    Returns:
        A single string: the ``num_paragraphs`` top-ranked sentences joined
        by spaces, best first. If the text has fewer sentences than
        requested, all of them are returned in their original order.
    """
    # Split as Norwegian, matching the Norwegian stop-word list used below
    sentences = nltk.sent_tokenize(text, language="norwegian")
    if len(sentences) < num_paragraphs:
        # Always return a string, never the raw sentence list, so callers
        # get the same type on every path.
        return ' '.join(sentences)

    norwegian_stop_words = set(stopwords.words('norwegian'))
    vectorizer = TfidfVectorizer(stop_words=list(norwegian_stop_words))
    tfidf = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf, tfidf)

    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(
        ((scores[i], sent) for i, sent in enumerate(sentences)),
        reverse=True,
    )
    return ' '.join(sent for _, sent in ranked_sentences[:num_paragraphs])
# Save text and summary to PDF
def save_to_pdf(text, summary):
    """Write the transcription and/or its summary to a PDF file.

    Args:
        text: The transcription; skipped when empty.
        summary: The summary; skipped when empty.

    Returns:
        Path to the generated PDF. The file is always named
        ``transcription.pdf`` but lives in a fresh temporary directory, so
        simultaneous exports cannot overwrite one another.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    if text:
        pdf.multi_cell(0, 10, "Text:\n" + _latin1_safe(text))
        pdf.ln(10)  # Paragraph space
    if summary:
        pdf.multi_cell(0, 10, "Summary:\n" + _latin1_safe(summary))

    # A private directory per export keeps the download name stable while
    # giving every export its own file.
    pdf_output_path = os.path.join(tempfile.mkdtemp(), "transcription.pdf")
    pdf.output(pdf_output_path)
    return pdf_output_path


def _latin1_safe(value):
    """Replace characters that Latin-1 cannot encode with ``?``.

    The built-in PDF fonts only cover Latin-1; characters outside it (curly
    quotes, long dashes, ...) would otherwise abort PDF generation.
    Norwegian letters are part of Latin-1 and pass through unchanged.
    """
    return value.encode("latin-1", "replace").decode("latin-1")
# Gradio Interface
def _pick_summary(textrank_summary, lexrank_summary, graph_summary):
    """Return the first non-empty summary, or "" when none was generated.

    TextRank comes first because it was the only summary box the PDF export
    used to read; the other two are fallbacks so that a summary produced in
    another tab is exported instead of an empty page.
    """
    candidates = (textrank_summary, lexrank_summary, graph_summary)
    return next((candidate for candidate in candidates if candidate), "")


def _pdf_text_only(text):
    """Export a PDF containing only the transcription."""
    return save_to_pdf(text, "")


def _pdf_summary_only(textrank_summary, lexrank_summary, graph_summary):
    """Export a PDF containing only the selected summary."""
    return save_to_pdf("", _pick_summary(textrank_summary, lexrank_summary, graph_summary))


def _pdf_both(text, textrank_summary, lexrank_summary, graph_summary):
    """Export a PDF containing the transcription and the selected summary."""
    return save_to_pdf(text, _pick_summary(textrank_summary, lexrank_summary, graph_summary))


# Build the UI: one tab for transcription, one per summarization method, and
# one for PDF export. Every summary box has its own name so the export tab
# can read all of them.
iface = gr.Blocks()
with iface:
    gr.HTML("""
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/raw/main/banner_trans.png" alt="" width="100%" height="auto">
</div>
""")
    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")
    with gr.Tabs():
        with gr.TabItem("Transcription"):
            audio_input = gr.Audio(type="filepath")
            text_output = gr.Textbox(label="Text")
            result_output = gr.Textbox(label="Time taken and Number of words")
            transcribe_button = gr.Button("Transcribe")
            transcribe_button.click(fn=transcribe_audio, inputs=[audio_input], outputs=[text_output, result_output])
        with gr.TabItem("Summary | Graph-based"):
            graph_summary_output = gr.Textbox(label="Summary | Graph-based")
            graph_summarize_button = gr.Button("Summarize")
            graph_summarize_button.click(fn=graph_based_summary, inputs=[text_output], outputs=[graph_summary_output])
        with gr.TabItem("Summary | LexRank"):
            lexrank_summary_output = gr.Textbox(label="Summary | LexRank")
            lexrank_summarize_button = gr.Button("Summarize")
            lexrank_summarize_button.click(fn=lex_rank_summary, inputs=[text_output], outputs=[lexrank_summary_output])
        with gr.TabItem("Summary | TextRank"):
            textrank_summary_output = gr.Textbox(label="Summary | TextRank")
            textrank_summarize_button = gr.Button("Summarize")
            textrank_summarize_button.click(fn=text_rank_summary, inputs=[text_output], outputs=[textrank_summary_output])
        with gr.TabItem("Download PDF"):
            pdf_text_only = gr.Button("Download PDF with Text Only")
            pdf_summary_only = gr.Button("Download PDF with Summary Only")
            pdf_both = gr.Button("Download PDF with Both")
            pdf_output = gr.File(label="Download PDF")
            # Order matches the parameters of the export helpers
            summary_boxes = [textrank_summary_output, lexrank_summary_output, graph_summary_output]
            pdf_text_only.click(fn=_pdf_text_only, inputs=[text_output], outputs=[pdf_output])
            pdf_summary_only.click(fn=_pdf_summary_only, inputs=summary_boxes, outputs=[pdf_output])
            pdf_both.click(fn=_pdf_both, inputs=[text_output] + summary_boxes, outputs=[pdf_output])
iface.launch(share=True, debug=True)