Update app.py

app.py CHANGED
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #---------------------------------------------------------------------------------------------------------------------------------------------
+import spaces
 import gradio as gr
 from PIL import Image
 from pydub import AudioSegment
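Note on the `+import spaces` line: this commit converts the Space to ZeroGPU, where `spaces` is imported before any CUDA work happens and GPU-bound functions are wrapped in `@spaces.GPU()` so a GPU is attached only for the duration of the call. A minimal sketch of that pattern (the function body is illustrative, not taken from this diff):

```python
import spaces  # must be imported before CUDA is initialized on ZeroGPU Spaces
import torch
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="NbAiLab/nb-whisper-large")

@spaces.GPU()  # a GPU is attached only while this function runs
def transcribe(audio_path):
    return pipe(audio_path)["text"]
```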
@@ -40,10 +41,9 @@ import spacy
 import networkx as nx
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-#---------------------------------------------------------------------------------------------------------------------------------------------
 warnings.filterwarnings("ignore")
 
-
+# ------------header section------------
 HEADER_INFO = """
 # WEB APP ✨| Norwegian WHISPER Model
 Switch Work [Transkribering av lydfiler til norsk skrift]
@@ -54,14 +54,12 @@ SIDEBAR_INFO = f"""
 <img src="{LOGO}" style="width: 100%; height: auto;"/>
 </div>
 """
-# Check if CUDA is available
-if not torch.cuda.is_available():
-    raise RuntimeError("CUDA not available. Go look for a GPU.")
 
-#
+# ------------transcribe section------------
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
+@spaces.GPU()
 def convert_to_wav(filepath):
     _, file_ending = os.path.splitext(f'{filepath}')
     audio_file = filepath.replace(file_ending, ".wav")
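(`torch.bfloat32` in the original commit is not a valid torch dtype and would raise at import; `torch.bfloat16` is the closest valid name and is used above.) The hunk also cuts off the body of `convert_to_wav`. A plausible completion with pydub, assuming Whisper's preferred 16 kHz mono output; the resampling is an assumption, since the diff only shows the path handling:

```python
def convert_to_wav(filepath):
    _, file_ending = os.path.splitext(f'{filepath}')
    audio_file = filepath.replace(file_ending, ".wav")
    # Decode whatever container/codec pydub's ffmpeg backend supports,
    # then resample to 16 kHz mono, which is what Whisper models expect.
    sound = AudioSegment.from_file(filepath)
    sound = sound.set_frame_rate(16000).set_channels(1)
    sound.export(audio_file, format="wav")
    return audio_file
```

Since pydub decoding is CPU-only, the `@spaces.GPU()` decorator added to this function looks unnecessary; decorating only the transcription function would reserve GPU time where it is actually used.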
@@ -70,6 +68,7 @@ def convert_to_wav(filepath):
 
 pipe = pipeline("automatic-speech-recognition", model="NbAiLab/nb-whisper-large", chunk_length_s=30, torch_dtype=dtype, device=device)
 
+@spaces.GPU()
 def transcribe_audio(audio_file, batch_size=16):
     start_time = time.time()
 
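(The original commit called `.to(device)` on the pipeline object; `transformers.Pipeline` has no `.to()` method, which matches the "Build error" status, so the `device=` keyword is used above instead.) The body of `transcribe_audio` is elided past `start_time`. A sketch of how the chunked pipeline is typically invoked; `return_timestamps` and the `"text"` output key are standard ASR-pipeline behavior, but the exact call and the `system_info` string here are assumptions:

```python
@spaces.GPU()
def transcribe_audio(audio_file, batch_size=16):
    start_time = time.time()
    # chunk_length_s=30 was set when the pipeline was built; batch_size
    # controls how many 30 s chunks are decoded in parallel on the GPU.
    outputs = pipe(audio_file, batch_size=batch_size, return_timestamps=True)
    text = outputs["text"]
    system_info = f"Transcribed in {time.time() - start_time:.1f}s on {device}"
    return text.strip(), system_info
```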
@@ -100,9 +99,9 @@ def transcribe_audio(audio_file, batch_size=16):
     """
 
     return text.strip(), system_info
-#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
 
-#
+# ------------summary section------------
+@spaces.GPU()
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
     text = re.sub(r'[^\w\s]', '', text)
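For reference, the two substitutions first strip URLs (plus any trailing newline), then drop every character that is not a word character or whitespace:

```python
clean_text("Se https://nb.no/whisper\nHei, verden!")
# -> 'Se Hei verden'
```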
@@ -113,6 +112,7 @@ nlp = spacy.blank("nb")  # 'nb' ==> codename = Norwegian Bokmål
 nlp.add_pipe('sentencizer')
 spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS
 
+@spaces.GPU()
 def preprocess_text(text):
     # Process the text with SpaCy
     doc = nlp(text)
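`spacy.blank("nb")` builds a tokenizer-only Norwegian Bokmål pipeline, so the `sentencizer` component is what makes `doc.sents` available. A quick check of the setup:

```python
import spacy

nlp = spacy.blank("nb")
nlp.add_pipe('sentencizer')  # rule-based sentence boundaries on punctuation
doc = nlp("Dette er en setning. Dette er en til.")
print([sent.text for sent in doc.sents])
# ['Dette er en setning.', 'Dette er en til.']
```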
@@ -123,6 +123,7 @@ def preprocess_text(text):
     return ' '.join(words)
 
 # Summarize w/T5 model
+@spaces.GPU()
 def summarize_text(text):
     preprocessed_text = preprocess_text(text)
     inputs = summarization_tokenizer(preprocessed_text, max_length=1024, return_tensors="pt", truncation=True)
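`summarization_tokenizer` and `summarization_model` are used here but never defined in the visible hunks. A hedged sketch of the seq2seq setup the `# Summarize w/T5 model` comment implies; the checkpoint name is a placeholder, not taken from this diff:

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Placeholder checkpoint: the actual model used by this Space is not shown in the diff.
SUMMARIZATION_CHECKPOINT = "t5-base"
summarization_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZATION_CHECKPOINT)
summarization_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZATION_CHECKPOINT)
```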
@@ -130,7 +131,6 @@ def summarize_text(text):
     summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
     return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
-# Builds similarity matrix
 def build_similarity_matrix(sentences, stop_words):
     similarity_matrix = nx.Graph()
     for i, tokens_a in enumerate(sentences):
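The loop body of `build_similarity_matrix` is cut off. Given that the PageRank tab labels itself "token-based similarity", a plausible completion weights each edge by the token overlap between sentence pairs, with stop words excluded; this is a reconstruction, not the committed code:

```python
def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = nx.Graph()
    for i, tokens_a in enumerate(sentences):
        for j, tokens_b in enumerate(sentences):
            if i == j:
                continue
            # Edge weight = number of shared non-stop-word tokens.
            words_a = {t.text.lower() for t in tokens_a if t.text.lower() not in stop_words}
            words_b = {t.text.lower() for t in tokens_b if t.text.lower() not in stop_words}
            similarity_matrix.add_edge(i, j, weight=len(words_a & words_b))
    return similarity_matrix
```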
@@ -142,11 +142,10 @@ def build_similarity_matrix(sentences, stop_words):
 
 # PageRank
 def graph_based_summary(text, num_paragraphs=3):
-
     doc = nlp(text)
     sentences = [sent.text for sent in doc.sents]
     if len(sentences) < num_paragraphs:
-        return ' '.join(sentences)
+        return ' '.join(sentences)
 
     sentence_tokens = [nlp(sent) for sent in sentences]
     stop_words = spacy_stop_words
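The rest of `graph_based_summary` is not shown; presumably it scores sentences with `nx.pagerank` on the token-overlap graph and keeps the top `num_paragraphs`. A sketch of the full function under that assumption:

```python
def graph_based_summary(text, num_paragraphs=3):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    if len(sentences) < num_paragraphs:
        return ' '.join(sentences)
    sentence_tokens = [nlp(sent) for sent in sentences]
    stop_words = spacy_stop_words
    # Score sentences by PageRank centrality on the token-overlap graph,
    # then emit the top scorers in their original document order.
    similarity_matrix = build_similarity_matrix(sentence_tokens, stop_words)
    scores = nx.pagerank(similarity_matrix, weight='weight')
    top = sorted(scores, key=scores.get, reverse=True)[:num_paragraphs]
    return ' '.join(sentences[i] for i in sorted(top))
```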
@@ -183,7 +182,7 @@ def text_rank_summary(text, num_paragraphs=3):
     doc = nlp(text)
     sentences = [sent.text for sent in doc.sents]
     if len(sentences) < num_paragraphs:
-        return ' '.join(sentences)
+        return ' '.join(sentences)
 
     stop_words = spacy_stop_words
     vectorizer = TfidfVectorizer(stop_words=list(stop_words))
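`text_rank_summary` also continues past the hunk; the imports at the top (TfidfVectorizer, cosine_similarity, networkx) suggest the classic TextRank recipe. A sketch of the likely remainder, assuming `nx.from_numpy_array` (NetworkX ≥ 2.6):

```python
def text_rank_summary(text, num_paragraphs=3):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    if len(sentences) < num_paragraphs:
        return ' '.join(sentences)
    stop_words = spacy_stop_words
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    tfidf = vectorizer.fit_transform(sentences)
    sim_matrix = cosine_similarity(tfidf)    # pairwise sentence similarity
    graph = nx.from_numpy_array(sim_matrix)  # weighted graph over sentences
    scores = nx.pagerank(graph)
    top = sorted(scores, key=scores.get, reverse=True)[:num_paragraphs]
    return ' '.join(sentences[i] for i in sorted(top))
```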
@@ -227,7 +226,6 @@ with iface:
     text_input_graph = gr.Textbox(label="Input Text", placeholder="txt2summarize")
     summary_output_graph = gr.Textbox(label="PageRank | token-based similarity")
 
-    # Displaying AiLab and Bjørn explanations
     gr.Markdown("""
     **token-based**: similarity-matrix edge weights represent token overlap;
     sentences are ranked by their centrality in the graph (good with dense inter-sentence relationships)
@@ -245,7 +243,6 @@ with iface:
     text_input_lex = gr.Textbox(label="Input Text", placeholder="txt2summarize")
     summary_output_lex = gr.Textbox(label="LexRank | cosine similarity")
 
-    # Displaying AiLab and Bjørn explanations
     gr.Markdown("""
     **semantic**: TF-IDF vectorization with a cosine-similarity matrix, ranked by eigenvector centrality
     (good for sparse graph structures with thresholding)
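For the LexRank tab, the blurb above describes thresholded cosine similarity ranked by eigenvector centrality. A minimal sketch of that idea; the 0.1 threshold is an illustrative assumption:

```python
import numpy as np

def lexrank_scores(sim_matrix, threshold=0.1):
    # Drop weak edges so the graph stays sparse, then rank sentences by
    # eigenvector centrality on what remains.
    adjacency = np.where(sim_matrix > threshold, sim_matrix, 0.0)
    graph = nx.from_numpy_array(adjacency)
    return nx.eigenvector_centrality_numpy(graph, weight='weight')
```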
@@ -263,7 +260,6 @@ with iface:
     text_input_text_rank = gr.Textbox(label="Input Text", placeholder="txt2summarize")
     summary_output_text_rank = gr.Textbox(label="TextRank | lexical similarity")
 
-    # Displaying AiLab and Bjørn explanations
     gr.Markdown("""
     **sentence**: graph with weighted edges based on lexical similarity (i.e. word overlap / sentence similarity)
     """)
@@ -288,4 +284,4 @@ with iface:
     pdf_summary_only.click(fn=lambda summary: save_to_pdf("", summary), inputs=[summary_output_graph, summary_output_lex, summary_output_text_rank], outputs=[pdf_output])  # Includes all summary outputs
     pdf_both.click(fn=lambda text, summary: save_to_pdf(text, summary), inputs=[text_output, summary_output_graph], outputs=[pdf_output])  # Defaulting to Graph-based summary
 
-
+iface.launch(share=True, debug=True)
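`save_to_pdf` is wired to the buttons but never defined in the visible hunks. A hypothetical implementation with fpdf2; the section headings and output path are illustrative, not from the commit:

```python
from fpdf import FPDF

def save_to_pdf(text, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Helvetica", size=11)
    if text:
        pdf.multi_cell(0, 8, "Transcription:\n" + text)
        pdf.ln(4)
    if summary:
        pdf.multi_cell(0, 8, "Summary:\n" + summary)
    output_path = "transcription.pdf"  # returned so Gradio can offer it for download
    pdf.output(output_path)
    return output_path
```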