import time
import os
import spaces
import contextlib
import warnings
warnings.filterwarnings("ignore")

from pydub import AudioSegment

# If m4a audio, convert to wav (Python)
def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file

import torch
from transformers import AutoProcessor, pipeline

# Initialize processor and pipeline
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32

# Set distinct pad and eos tokens
if processor.tokenizer.pad_token_id is None:
    processor.tokenizer.pad_token_id = processor.tokenizer.convert_tokens_to_ids("[PAD]")
if processor.tokenizer.eos_token_id is None:
    processor.tokenizer.eos_token_id = processor.tokenizer.convert_tokens_to_ids("[EOS]")

# Route the ASR pipeline to the detected device
pipe = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large", torch_dtype=torch_dtype, device=device)

# language = "no"
# task = "transcribe"

def transcribe_audio(audio_file):
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    start_time = time.time()

    with torch.no_grad():
        output = pipe(
            audio_file,
            chunk_length_s=30,
            generate_kwargs={
                # "task": task,
                "pad_token_id": processor.tokenizer.pad_token_id,
                "eos_token_id": processor.tokenizer.eos_token_id
            }
        )

    text = output["text"]
    end_time = time.time()
    output_time = end_time - start_time
    word_count = len(text.split())
    result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {word_count}"
    return text, result

# [VERSION 3: full-on w/ 3 styles for summarization]
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import re

nltk.download('punkt')
nltk.download('stopwords')

WHITESPACE_HANDLER = lambda k: re.sub(r'\s+', ' ', re.sub(r'\n+', ' ', k.strip()))

def clean_text(text):
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
    text = re.sub(r'\\', ' ', str(text))  # was r'\' (invalid pattern); strip backslashes
    text = re.sub(r'\'', '', str(text))
    text = re.sub(r'«', '', str(text))
    text = re.sub(r'»', '', str(text))
    text = re.sub(r'–', '-', str(text))
    text = re.sub(r'…', '.', str(text))
    text = re.sub(r'[^\x00-\x7F]+', ' ', str(text))
    return text

def preprocess_text(text):
    try:
        words = word_tokenize(text)
        stop_words = set(stopwords.words('norwegian'))
        words_without_stopwords = [word for word in words if word.lower() not in stop_words]
        processed_text = ' '.join(words_without_stopwords)
        return processed_text
    except Exception as e:
        # st.error requires Streamlit, which is not imported here; report to stdout instead
        print(f"Error during text preprocessing: {e}")
        return None

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", return_dict=True, torch_dtype=torch.float16)
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model.to(device)

def summarize_text(text):
    preprocessed_text = preprocess_text(text)
    if preprocessed_text is None:
        return None
    # t5-base expects the "summarize: " task prefix
    inputs = summarization_tokenizer(["summarize: " + text], max_length=1024, return_tensors="pt", truncation=True)
    inputs = inputs.to(device)
    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
    summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
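# Illustrative sketch (not part of the app wiring below): the transcription and
# abstractive summarization steps above are meant to chain together.
# "sample.m4a" is a hypothetical local file used only for this example.
#
#   transcription, stats = transcribe_audio("sample.m4a")
#   print(stats)                          # timing and word count
#   print(summarize_text(transcription))  # abstractive t5-base summary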
def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = nx.Graph()
    for i, tokens_a in enumerate(sentences):
        for j, tokens_b in enumerate(sentences):
            if i != j:
                common_words = set(tokens_a) & set(tokens_b)
                similarity_matrix.add_edge(i, j, weight=len(common_words))
    return similarity_matrix

def graph_based_summary(text, num_paragraphs=3):
    sentences = text.strip().split(".")
    if len(sentences) < num_paragraphs:
        return sentences
    sentence_tokens = [word_tokenize(sent) for sent in sentences]
    stop_words = set(stopwords.words('norwegian'))
    filtered_tokens = [[word for word in tokens if word.lower() not in stop_words] for tokens in sentence_tokens]
    similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words)
    scores = nx.pagerank(similarity_matrix)
    ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
    summary = [sent for _, sent in ranked_sentences[:num_paragraphs]]
    return summary

def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < num_paragraphs:
        return sentences
    stop_words = set(stopwords.words('norwegian'))
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X, X)
    # Zero out edges below the similarity threshold
    for i in range(len(similarity_matrix)):
        for j in range(len(similarity_matrix[i])):
            if similarity_matrix[i][j] < threshold:
                similarity_matrix[i][j] = 0.0
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
    return summary

def text_rank_summary(text, num_paragraphs=3):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < num_paragraphs:
        return ' '.join(sentences)
    stop_words = set(stopwords.words('norwegian'))
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X, X)
    nx_graph = nx.from_numpy_array(similarity_matrix)  # graph: nodes are sentences, edge weights are similarity scores
    scores = nx.pagerank(nx_graph)  # PageRank algorithm scores each sentence
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)  # rank by PageRank score
    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]  # top sentences for the summary
    return ' '.join(summary)

banner_html = """