# app.py — audio transcription (NB-Whisper) with three summarization styles and PDF export
# TODO: check whether this warning still occurs:
# "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
import time
import os
import contextlib  # needed for the nullcontext fallback in transcribe_audio
import spaces
import warnings
warnings.filterwarnings("ignore")

from pydub import AudioSegment
# If the input is m4a audio, convert it to wav first
def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file
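# Usage sketch (hypothetical filename; pydub needs ffmpeg available on the system):
#   wav_path = convert_to_wav("recording.m4a")  # -> "temp.wav"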
import torch
from transformers import pipeline, AutoProcessor  # AutoModelForSpeechSeq2Seq

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32

# Initialize processor and ASR pipeline
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large")
pipe = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large", device=device, torch_dtype=torch_dtype)

language = "no"
task = "transcribe"

# @spaces.GPU(queue=True)
def transcribe_audio(audio_file):
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)
    start_time = time.time()
    # Force the decoder into the correct language/task context
    forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
    with torch.no_grad():
        # Run on the CUDA device when available, otherwise a no-op context
        with torch.cuda.device(device) if torch.cuda.is_available() else contextlib.nullcontext():
            output = pipe(audio_file, chunk_length_s=30, generate_kwargs={"forced_decoder_ids": forced_decoder_ids})
            text = output["text"]
    end_time = time.time()
    output_time = end_time - start_time
    word_count = len(text.split())
    result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {word_count}"
    return text, result
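# Usage sketch (hypothetical file path):
#   text, stats = transcribe_audio("meeting.wav")
#   `text` is the transcript; `stats` reports elapsed time and word count.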
# --- VERSION 3: three summarization styles (graph-based, LexRank, TextRank) ---
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import re

nltk.download('punkt')
nltk.download('stopwords')

WHITESPACE_HANDLER = lambda k: re.sub(r'\s+', ' ', re.sub(r'\n+', ' ', k.strip()))
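# Example: WHITESPACE_HANDLER("a\n\nb   c") -> "a b c" (newlines and runs of whitespace collapsed)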
def clean_text(text):
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', str(text))
    text = re.sub(r'&amp;', '', str(text))
    text = re.sub(r'\(\s+', '(', str(text))
    text = re.sub(r'\s+\)', ')', str(text))
    text = re.sub(r'\(\)', '', str(text))
    text = re.sub(r'\s+', ' ', str(text))
    text = re.sub(r'[_"\-;%|+&=*%!?:#$@\[\]]', ' ', str(text))
    text = re.sub(r'<br />', ' ', str(text))
    text = re.sub(r'\'', '', str(text))
    text = re.sub(r'«', '', str(text))
    text = re.sub(r'»', '', str(text))
    text = re.sub(r'–', '-', str(text))
    text = re.sub(r'…', '.', str(text))
    # NB: this also strips Norwegian æ/ø/å — review before applying to Norwegian text
    text = re.sub(r'[^\x00-\x7F]+', ' ', str(text))
    return text
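# Example sketch:
#   clean_text("Les mer: https://example.com\n«Sitat» … slutt")
#   -> roughly "Les mer  Sitat . slutt" (URL dropped, guillemets removed, ellipsis -> ".")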
def preprocess_text(text):
    try:
        words = word_tokenize(text)
        stop_words = set(stopwords.words('norwegian'))
        words_without_stopwords = [word for word in words if word.lower() not in stop_words]
        processed_text = ' '.join(words_without_stopwords)
        return processed_text
    except Exception as e:
        # (was `st.error`, but Streamlit is not used in this Gradio app)
        print(f"Error during text preprocessing: {e}")
        return None
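# Example (Norwegian stopwords removed): preprocess_text("Dette er en enkel test") -> roughly "enkel test"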
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# float16 only works reliably on GPU; fall back to float32 on CPU
summarization_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", return_dict=True, torch_dtype=summarization_dtype)
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model.to(device)
# @spaces.GPU(queue=True)
def summarize_text(text):
    # Preprocessing is used only as a sanity check here; the full text is what gets summarized
    preprocessed_text = preprocess_text(text)
    if preprocessed_text is None:
        return None
    inputs = summarization_tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
    inputs = inputs.to(device)
    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
    summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
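# Usage sketch (hypothetical variable): summarize_text(long_transcript) returns an
# abstractive T5 summary of at most 150 tokens, beam search with 5 beams.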
def build_similarity_matrix(sentences, stop_words):
    # Nodes are sentence indices; edge weight = number of shared words (stop_words is currently unused)
    similarity_matrix = nx.Graph()
    for i, tokens_a in enumerate(sentences):
        for j, tokens_b in enumerate(sentences):
            if i != j:
                common_words = set(tokens_a) & set(tokens_b)
                similarity_matrix.add_edge(i, j, weight=len(common_words))
    return similarity_matrix
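# Example: build_similarity_matrix([["a", "b"], ["b", "c"]], set())
#   -> an undirected graph with one edge (0, 1) of weight 1 (shared word "b")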
def graph_based_summary(text, num_paragraphs=3):
    sentences = [s for s in text.strip().split(".") if s.strip()]
    if len(sentences) < num_paragraphs:
        return ' '.join(sentences)
    sentence_tokens = [word_tokenize(sent) for sent in sentences]
    stop_words = set(stopwords.words('norwegian'))
    filtered_tokens = [[word for word in tokens if word.lower() not in stop_words] for tokens in sentence_tokens]
    similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words)
    scores = nx.pagerank(similarity_matrix)
    ranked_sentences = sorted(((scores.get(i, 0), sent) for i, sent in enumerate(sentences)), reverse=True)
    summary = [sent for _, sent in ranked_sentences[:num_paragraphs]]
    return ' '.join(summary)
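# Usage sketch (hypothetical transcript string):
#   graph_based_summary("Setning en. Setning to. Setning tre. Setning fire.", num_paragraphs=2)
#   -> the two sentences with the highest PageRank scores, joined into one string.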
def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < num_paragraphs:
        return ' '.join(sentences)
    stop_words = set(stopwords.words('norwegian'))
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X, X)
    # Zero out weak links so PageRank only follows edges above the threshold
    for i in range(len(similarity_matrix)):
        for j in range(len(similarity_matrix[i])):
            if similarity_matrix[i][j] < threshold:
                similarity_matrix[i][j] = 0.0
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
    return ' '.join(summary)
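# Usage sketch: lex_rank_summary(transcript, num_paragraphs=2, threshold=0.2)
#   A higher threshold sparsifies the sentence graph, keeping only strongly
#   related sentence pairs, which tends to give a more focused ranking.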
def text_rank_summary(text, num_paragraphs=3):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < num_paragraphs:
        return ' '.join(sentences)
    stop_words = set(stopwords.words('norwegian'))
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X, X)
    nx_graph = nx.from_numpy_array(similarity_matrix)  # nodes are sentences, edge weights are similarity scores
    scores = nx.pagerank(nx_graph)  # PageRank scores each sentence
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)  # rank by score
    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]  # top sentences form the summary
    return ' '.join(summary)
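# TextRank here is LexRank without the threshold step: the full cosine-similarity
# graph feeds PageRank directly, so every sentence pair contributes to the ranking.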
banner_html = """
<div style="text-align: center;">
    <img src="https://huggingface.co/spaces/camparchimedes/transcription_app/resolve/main/Olas%20AudioSwitch%20Shop.png" alt="Banner Image" width="100%" height="auto">
</div>
"""
import gradio as gr
from fpdf import FPDF
from PIL import Image

def save_to_pdf(text, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    if text:
        pdf.multi_cell(0, 10, "Text:\n" + text)
    pdf.ln(10)  # paragraph spacing
    if summary:
        pdf.multi_cell(0, 10, "Summary:\n" + summary)
    pdf_output_path = "transcription.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
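# Usage sketch: save_to_pdf(transcript, summary) writes "transcription.pdf" and returns its path.
# NB: FPDF's built-in Arial is Latin-1 only; characters outside Latin-1 may raise an encoding error.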
# The standalone gr.Interface version was superseded by the Blocks UI below
iface = gr.Blocks()
with iface:
    gr.HTML(banner_html)
    gr.Markdown("# We now have the ability to convert audio files into Norwegian text.")
    with gr.Tabs():
        with gr.TabItem("Transcription"):
            audio_input = gr.Audio(type="filepath")
            text_output = gr.Textbox(label="Text")
            result_output = gr.Textbox(label="Time taken and number of words")
            transcribe_button = gr.Button("Transcribe")

            transcribe_button.click(
                fn=transcribe_audio,
                inputs=[audio_input],
                outputs=[text_output, result_output]
            )
| with gr.TabItem("Summary_t1"): | |
| summary_output = gr.Textbox(label="Summary | Graph-based") | |
| summarize_button = gr.Button("Summarize") | |
| def summarize(text): | |
| if not text: | |
| return "Warning: a text must be available." | |
| summary = graph_based_summary(text) | |
| return summary | |
| summarize_button.click( | |
| fn=summarize, | |
| inputs=[text_output], | |
| outputs=summary_output | |
| ) | |
| with gr.TabItem("LexRank"): | |
| summary_output = gr.Textbox(label="Summary | LexRank") | |
| summarize_button = gr.Button("Summarize") | |
| def summarize(text): | |
| if not text: | |
| return "Warning: a text must be available." | |
| summary = lex_rank_summary(text) | |
| return summary | |
| summarize_button.click( | |
| fn=summarize, | |
| inputs=[text_output], | |
| outputs=summary_output | |
| ) | |
| with gr.TabItem("TextRank"): | |
| summary_output = gr.Textbox(label="Summary | TextRank") | |
| summarize_button = gr.Button("Summarize") | |
| def summarize(text): | |
| if not text: | |
| return "Warning: a text must be available." | |
| summary = text_rank_summary(text) | |
| return summary | |
| summarize_button.click( | |
| fn=summarize, | |
| inputs=[text_output], | |
| outputs=summary_output | |
| ) | |
| with gr.TabItem("Download PDF"): | |
| pdf_text_only = gr.Button("Download PDF with text Only") | |
| pdf_summary_only = gr.Button("Download PDF with Summary Only") | |
| pdf_both = gr.Button("Download PDF with Both") | |
| pdf_output_text_only = gr.File(label="Download PDF") | |
| pdf_output_summary_only = gr.File(label="Download PDF") | |
| pdf_output_both = gr.File(label="Download PDF") | |
| def generate_pdf_text_only(text): | |
| return save_to_pdf(text, "") | |
| def generate_pdf_summary_only(summary): | |
| return save_to_pdf("", summary) | |
| def generate_pdf_both(text, summary): | |
| return save_to_pdf(text, summary) | |
| pdf_text_only.click( | |
| fn=generate_pdf_text_only, | |
| inputs=[text_output], | |
| outputs=[pdf_output_text_only] | |
| ) | |
| pdf_summary_only.click( | |
| fn=generate_pdf_summary_only, | |
| inputs=[summary_output], | |
| outputs=[pdf_output_summary_only] | |
| ) | |
| pdf_both.click( | |
| fn=generate_pdf_both, | |
| inputs=[text_output, summary_output], | |
| outputs=[pdf_output_both] | |
| ) | |
iface.launch(share=True, debug=True)