Spaces:
Build error
Update app.py
app.py
CHANGED
@@ -1,49 +1,39 @@
-import networkx as nx
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-import nltk
-nltk.download('punkt')
-from nltk.tokenize import sent_tokenize
-
-import gradio as gr
-import warnings
-import torch
-from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
-from pydub import AudioSegment
-from fpdf import FPDF
-from PIL import Image
 import time
 import os
-
-
-
 warnings.filterwarnings("ignore")
 
-
-torch_dtype = torch.float32
-
-# ASR pipeline
-asr = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large-semantic", device=device, torch_dtype=torch_dtype)
-
-# Switch m4a to wav
 def convert_to_wav(audio_file):
     audio = AudioSegment.from_file(audio_file, format="m4a")
     wav_file = "temp.wav"
     audio.export(wav_file, format="wav")
     return wav_file
 
-# @spaces.GPU(queue=True)
 
 def transcribe_audio(audio_file):
     if audio_file.endswith(".m4a"):
         audio_file = convert_to_wav(audio_file)
 
     start_time = time.time()
 
     with torch.no_grad():
-        output = asr(audio_file, chunk_length_s=30, generate_kwargs={"num_beams": 8})
 
     transcription = output["text"]
     end_time = time.time()
@@ -51,44 +41,141 @@ def transcribe_audio(audio_file):
     output_time = end_time - start_time
     word_count = len(transcription.split())
 
-    result = f"
 
     return transcription.strip(), result
 
-
 summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
-
-# t5-base to device
 summarization_model.to(device)
 
-# Graph-based summarization (TextRank, method)
 def summarize_text(text):
-
-    if
-        return
 
-
-
...
     nx_graph = nx.from_numpy_array(similarity_matrix)
-    scores = nx.pagerank(nx_graph)
 
     ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
-
-    top_n = 3
-    summary = " ".join([s for _, s in ranked_sentences[:top_n]])
     return summary
 
 
-# HTML syntax for imagery
-banner_html = """
-<div style="text-align: center;">
-    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.png" alt="Banner" width="85%" height="auto">
-</div>
-"""
 
-
 def save_to_pdf(transcription, summary):
     pdf = FPDF()
     pdf.add_page()
@@ -103,28 +190,35 @@ def save_to_pdf(transcription, summary):
     if summary:
         pdf.multi_cell(0, 10, "Summary:\n" + summary)
 
-    pdf_output_path = "
     pdf.output(pdf_output_path)
     return pdf_output_path
 
-
 iface = gr.Interface(
     fn=transcribe_audio,
     inputs=gr.Audio(type="filepath"),
     outputs="text",
-    title="
     description="Upload an audio file to get the transcription",
     theme="default",
     live=False
 )
 iface = gr.Blocks()
 
 with iface:
     gr.HTML(banner_html)
-    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift
 
     with gr.Tabs():
-
         with gr.TabItem("Transcription"):
             audio_input = gr.Audio(type="filepath")
             transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic")
@@ -134,22 +228,21 @@ with iface:
             def transcribe(audio_file):
                 transcription, result = transcribe_audio(audio_file)
                 return transcription, result
-
             transcribe_button.click(
                 fn=transcribe,
                 inputs=[audio_input],
                 outputs=[transcription_output, result_output]
             )
 
-
-        with gr.TabItem("
-            summary_output = gr.Textbox(label="Summary |
             summarize_button = gr.Button("Summarize")
 
             def summarize(transcription):
                 if not transcription:
                     return "Warning: a transcription must be available."
-                summary =
                 return summary
 
             summarize_button.click(
@@ -158,7 +251,40 @@ with iface:
                 outputs=summary_output
             )
 
-
         with gr.TabItem("Download PDF"):
             pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
             pdf_summary_only = gr.Button("Download PDF with Summary Only")
@@ -195,6 +321,4 @@ with iface:
                 outputs=[pdf_output_both]
             )
 
-
-# run
-iface.launch(share=True, debug=True)
 
 import time
 import os
+import warnings
 warnings.filterwarnings("ignore")
+from pydub import AudioSegment
 
+# If m4a audio, convert to wav (Python)
 def convert_to_wav(audio_file):
     audio = AudioSegment.from_file(audio_file, format="m4a")
     wav_file = "temp.wav"
     audio.export(wav_file, format="wav")
     return wav_file
 
 
 
+import torch
+from transformers import pipeline  # AutoProcessor, AutoModelForSpeechSeq2Seq
+
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+torch_dtype = torch.float32
+
+# ASR pipeline
+asr = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large", device=device, torch_dtype=torch_dtype)
+
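The pipeline above is pinned to torch.float32 on whichever device is found. A common variant on a GPU Space (a sketch, not part of this commit) is to derive the dtype from the device instead:

    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32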
+# ASR
 def transcribe_audio(audio_file):
     if audio_file.endswith(".m4a"):
         audio_file = convert_to_wav(audio_file)
 
     start_time = time.time()
 
+
     with torch.no_grad():
+        output = asr(audio_file, chunk_length_s=30, generate_kwargs={"num_beams": 8})  # "task": "transcribe", "language": "no" + ..batch_size=10 ?
 
     transcription = output["text"]
     end_time = time.time()
 
     output_time = end_time - start_time
     word_count = len(transcription.split())
 
+    result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {word_count}"
 
     return transcription.strip(), result
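The trailing comment on the asr(...) call sketches how to pin the task and language; with a Whisper-family model those keys pass straight through generate_kwargs. A hedged example (the file name is hypothetical):

    output = asr(
        "opptak.m4a",  # hypothetical input file
        chunk_length_s=30,
        generate_kwargs={"num_beams": 8, "task": "transcribe", "language": "no"},
    )
    print(output["text"])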
+
+
+# -------------------------------------[VERSION 3: full-on t5-base + NLTK + 3 styles for summarization]-------------------------------------
+import re  # needed by WHITESPACE_HANDLER and clean_text below
+import nltk
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.corpus import stopwords
+import networkx as nx
+from sklearn.feature_extraction.text import TfidfVectorizer  # from before
+from sklearn.metrics.pairwise import cosine_similarity  # from before
+import pandas as pd
+import numpy as np
+from rouge_score import rouge_scorer
+
+nltk.download('punkt')  # from before
+nltk.download('stopwords')
+
+
+WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
+
+# Clean text using Norwegian-specific replacements
+def clean_text(text):
+    text = re.sub(r'https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
+    text = re.sub(r'\<a href', ' ', str(text))
+    text = re.sub(r'&', '', str(text))
+    text = re.sub(r'\(s+', '(', str(text))
+    text = re.sub(r's+\)', ')', str(text))
+    text = re.sub(r'\(\)', '', str(text))
+    text = re.sub(r'\s+', ' ', str(text))
+    text = re.sub(r'[_"\-;%|+&=*%!?:#$@\[\]]', ' ', str(text))
+    text = re.sub(r'<br />', ' ', str(text))
+    text = re.sub(r'\'', '', str(text))
+    text = re.sub(r'«', '', str(text))
+    text = re.sub(r'»', '', str(text))
+    text = re.sub(r'–', '-', str(text))
+    text = re.sub(r'…', '.', str(text))
+    text = re.sub(r'[^\x00-\x7F]+', ' ', str(text))
+    return text
+
+def preprocess_text(text):
+    try:
+        words = word_tokenize(text)
+        stop_words = set(stopwords.words('norwegian'))
+        words_without_stopwords = [word for word in words if word.lower() not in stop_words]
+        processed_text = ' '.join(words_without_stopwords)
+        return processed_text
+    except Exception as e:
+        print(f"Error during text preprocessing: {e}")  # report to console; st.error is a Streamlit API and st is never imported in this Gradio app
+        return None
+
+# ROUGE metric
+scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
+
+
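A quick sanity check of the two helpers (the sample string is made up). Two quirks worth knowing: the URL rule in clean_text deletes everything from the URL to the end of the line, and the final [^\x00-\x7F] rule blanks Norwegian letters like æ, ø, å:

    sample = "Referat fra møtet: se https://example.com («utkast»)"
    print(clean_text(sample))       # -> roughly "Referat fra m tet se"
    print(preprocess_text(sample))  # original text minus Norwegian stop words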
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", return_dict=True, torch_dtype=torch.float16)  # float16 weights; generation may need float32 when device is CPU
 summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
 summarization_model.to(device)
 
 def summarize_text(text):
+    preprocessed_text = preprocess_text(text)
+    if preprocessed_text is None:
+        return None
+    inputs = summarization_tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
+    inputs = inputs.to(device)
+    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
+    summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+    return summary
 
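Note that summarize_text runs preprocess_text only as a guard; the raw text, not the stop-word-filtered version, is what gets tokenized for t5-base. A minimal call (the string is illustrative):

    print(summarize_text("Dette er en lang transkripsjon som skal oppsummeres. " * 20))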
+def build_similarity_matrix(sentences, stop_words):
+    similarity_matrix = nx.Graph()
+    for i, tokens_a in enumerate(sentences):
+        for j, tokens_b in enumerate(sentences):
+            if i != j:
+                common_words = set(tokens_a) & set(tokens_b)
+                similarity_matrix.add_edge(i, j, weight=len(common_words))
+    return similarity_matrix
+
+
+def graph_based_summary(text, num_paragraphs=3):
+    sentences = text.strip().split(".")
+    if len(sentences) < num_paragraphs:
+        return sentences
+    sentence_tokens = [word_tokenize(sent) for sent in sentences]
+    stop_words = set(stopwords.words('norwegian'))
+    filtered_tokens = [[word for word in tokens if word.lower() not in stop_words] for tokens in sentence_tokens]
+    similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words)
+    scores = nx.pagerank(similarity_matrix)
+    ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
+    summary = [sent for _, sent in ranked_sentences[:num_paragraphs]]
+    return summary
 
+def lex_rank_summary(text, num_paragraphs=3):
+    sentences = nltk.sent_tokenize(text)
+    if len(sentences) < num_paragraphs:
+        return sentences
+    stop_words = set(stopwords.words('norwegian'))
+    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
+    X = vectorizer.fit_transform(sentences)
+    similarity_matrix = cosine_similarity(X, X)
     nx_graph = nx.from_numpy_array(similarity_matrix)
+    scores = nx.pagerank_numpy(nx_graph)  # deprecated in NetworkX 2.6 and removed in 3.0; nx.pagerank(nx_graph) is the modern equivalent
+    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
+    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
+    return summary
 
+def text_rank_summary(text, num_paragraphs=3):
+    sentences = nltk.sent_tokenize(text)
+    if len(sentences) < num_paragraphs:
+        return sentences
+    stop_words = set(stopwords.words('norwegian'))
+    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
+    X = vectorizer.fit_transform(sentences)
+    similarity_matrix = cosine_similarity(X, X)
+    nx_graph = nx.from_numpy_array(similarity_matrix)
+    scores = nx.pagerank(nx_graph)
     ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
+    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
     return summary
 
 
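All three extractive helpers return a list of sentences, while the Textboxes they feed below expect a string, so joining the result is a sensible guard. A sketch with made-up input; lex_rank_summary is skipped here because pagerank_numpy needs networkx < 3.0:

    text = "Første setning. Andre setning. Tredje setning. Fjerde setning."
    for fn in (graph_based_summary, text_rank_summary):
        print(fn.__name__, "->", " ".join(fn(text)))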
+import gradio as gr
+from fpdf import FPDF
+from PIL import Image
+
 def save_to_pdf(transcription, summary):
     pdf = FPDF()
     pdf.add_page()
...
     if summary:
         pdf.multi_cell(0, 10, "Summary:\n" + summary)
 
+    pdf_output_path = "transcription.pdf"
     pdf.output(pdf_output_path)
     return pdf_output_path
 
+
+banner_html = """
+<div style="text-align: center;">
+    <img src="https://huggingface.co/spaces/camparchimedes/transcription_app/raw/main/picture.png" alt="Banner" width="100%" height="auto">
+</div>
+"""
+
 iface = gr.Interface(
     fn=transcribe_audio,
     inputs=gr.Audio(type="filepath"),
     outputs="text",
+    title="SW Transcription App",
     description="Upload an audio file to get the transcription",
     theme="default",
     live=False
 )
+
 iface = gr.Blocks()  # NB: rebinds iface, so the gr.Interface above is never launched
 
 with iface:
     gr.HTML(banner_html)
+    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift :D")  # "We can now turn audio files into Norwegian text :D"
 
     with gr.Tabs():
+
         with gr.TabItem("Transcription"):
             audio_input = gr.Audio(type="filepath")
             transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic")
...
             def transcribe(audio_file):
                 transcription, result = transcribe_audio(audio_file)
                 return transcription, result
             transcribe_button.click(
                 fn=transcribe,
                 inputs=[audio_input],
                 outputs=[transcription_output, result_output]
             )
 
+
+        with gr.TabItem("Summary_t1"):
+            summary_output = gr.Textbox(label="Summary | Graph-based")
             summarize_button = gr.Button("Summarize")
 
             def summarize(transcription):
                 if not transcription:
                     return "Warning: a transcription must be available."
+                summary = graph_based_summary(transcription)
                 return summary
 
             summarize_button.click(
                 fn=summarize,
                 inputs=[transcription_output],
                 outputs=summary_output
             )
 
+        with gr.TabItem("Summary_t2"):
+            summary_output = gr.Textbox(label="Summary | LexRank")
+            summarize_button = gr.Button("Summarize")
+
+            def summarize(transcription):
+                if not transcription:
+                    return "Warning: a transcription must be available."
+                summary = lex_rank_summary(transcription)
+                return summary
+
+            summarize_button.click(
+                fn=summarize,
+                inputs=[transcription_output],
+                outputs=summary_output
+            )
+
+
+        with gr.TabItem("Summary_t3"):
+            summary_output = gr.Textbox(label="Summary | TextRank")
+            summarize_button = gr.Button("Summarize")
+
+            def summarize(transcription):
+                if not transcription:
+                    return "Warning: a transcription must be available."
+                summary = text_rank_summary(transcription)
+                return summary
+
+            summarize_button.click(
+                fn=summarize,
+                inputs=[transcription_output],
+                outputs=summary_output
+            )
+
+
         with gr.TabItem("Download PDF"):
             pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
             pdf_summary_only = gr.Button("Download PDF with Summary Only")
...
                 outputs=[pdf_output_both]
             )
 
+iface.launch(share=True, debug=True)
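For reference, the imports above imply roughly this dependency set for the Space. This is a sketch inferred from the code, not the repo's actual requirements.txt; networkx is kept below 3.0 because of pagerank_numpy, and pydub additionally needs ffmpeg on the host (on Spaces, typically via packages.txt):

    transformers
    torch
    gradio
    pydub
    fpdf
    Pillow
    nltk
    networkx<3.0
    scikit-learn
    pandas
    numpy
    rouge-score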