camparchimedes committed on
Commit 1de6e28 · verified · 1 Parent(s): 23c7066

Update app.py

Files changed (1):
  app.py +12 -16
app.py CHANGED
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #---------------------------------------------------------------------------------------------------------------------------------------------
+import spaces
 import gradio as gr
 from PIL import Image
 from pydub import AudioSegment
@@ -40,10 +41,9 @@ import spacy
 import networkx as nx
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-#---------------------------------------------------------------------------------------------------------------------------------------------
 warnings.filterwarnings("ignore")
 
-
+# ------------header section------------
 HEADER_INFO = """
 # WEB APP ✨| Norwegian WHISPER Model
 Switch Work [Transkribering av lydfiler til norsk skrift]
@@ -54,14 +54,12 @@ SIDEBAR_INFO = f"""
 <img src="{LOGO}" style="width: 100%; height: auto;"/>
 </div>
 """
-# Check if CUDA is available
-if not torch.cuda.is_available():
-    raise RuntimeError("CUDA not available. Go look for a GPU.")
 
-# Set GPU
+# ------------transcribe section------------
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
+@spaces.GPU()
 def convert_to_wav(filepath):
     _, file_ending = os.path.splitext(f'{filepath}')
     audio_file = filepath.replace(file_ending, ".wav")
@@ -70,6 +68,7 @@ def convert_to_wav(filepath):
 
 pipe = pipeline("automatic-speech-recognition", model="NbAiLab/nb-whisper-large", chunk_length_s=30, torch_dtype=dtype, device=device)
 
+@spaces.GPU()
 def transcribe_audio(audio_file, batch_size=16):
     start_time = time.time()
 
@@ -100,9 +99,9 @@ def transcribe_audio(audio_file, batch_size=16):
     """
 
     return text.strip(), system_info
-#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
 
-# Clean/preprocess text
+# ------------summary section------------
+@spaces.GPU()
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
     text = re.sub(r'[^\w\s]', '', text)
@@ -113,6 +112,7 @@ nlp = spacy.blank("nb") # 'nb' ==> codename = Norwegian Bokmål
 nlp.add_pipe('sentencizer')
 spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS
 
+@spaces.GPU()
 def preprocess_text(text):
     # Process the text with SpaCy
     doc = nlp(text)
@@ -123,6 +123,7 @@ def preprocess_text(text):
     return ' '.join(words)
 
 # Summarize w/T5 model
+@spaces.GPU()
 def summarize_text(text):
     preprocessed_text = preprocess_text(text)
     inputs = summarization_tokenizer(preprocessed_text, max_length=1024, return_tensors="pt", truncation=True)
@@ -130,7 +131,6 @@ def summarize_text(text):
     summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
     return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
-# Builds similarity matrix
 def build_similarity_matrix(sentences, stop_words):
     similarity_matrix = nx.Graph()
     for i, tokens_a in enumerate(sentences):
@@ -142,11 +142,10 @@ def build_similarity_matrix(sentences, stop_words):
 
 # PageRank
 def graph_based_summary(text, num_paragraphs=3):
-
     doc = nlp(text)
     sentences = [sent.text for sent in doc.sents]
     if len(sentences) < num_paragraphs:
-        return ' '.join(sentences)  # Adjusted to return a single string
+        return ' '.join(sentences)
 
     sentence_tokens = [nlp(sent) for sent in sentences]
     stop_words = spacy_stop_words
@@ -183,7 +182,7 @@ def text_rank_summary(text, num_paragraphs=3):
     doc = nlp(text)
     sentences = [sent.text for sent in doc.sents]
     if len(sentences) < num_paragraphs:
-        return ' '.join(sentences)  # Adjusted to return a single string
+        return ' '.join(sentences)
 
     stop_words = spacy_stop_words
     vectorizer = TfidfVectorizer(stop_words=list(stop_words))
@@ -227,7 +226,6 @@ with iface:
     text_input_graph = gr.Textbox(label="Input Text", placeholder="txt2summarize")
     summary_output_graph = gr.Textbox(label="PageRank | token-based similarity")
 
-    # Displaying AiLab and Bjørn explanations
     gr.Markdown("""
     **token-based**: similarity matrix edge weights representing token overlap/
     ranked by their centrality in the graph (good with dense inter-sentence relationships)
@@ -245,7 +243,6 @@ with iface:
     text_input_lex = gr.Textbox(label="Input Text", placeholder="txt2summarize")
     summary_output_lex = gr.Textbox(label="LexRank | cosine similarity")
 
-    # Displaying AiLab and Bjørn explanations
    gr.Markdown("""
    **semantic**: TF-IDF vectorization@cosine similarity matrix, ranked by eigenvector centrality.
    (good for sparse graph structures with thresholding)
@@ -263,7 +260,6 @@ with iface:
     text_input_text_rank = gr.Textbox(label="Input Text", placeholder="txt2summarize")
     summary_output_text_rank = gr.Textbox(label="TextRank | lexical similarity")
 
-    # Displaying AiLab and Bjørn explanations
     gr.Markdown("""
     **sentence**: graph with weighted edges based on lexical similarity, i.e. word overlap between sentences
     """)
@@ -288,4 +284,4 @@ with iface:
     pdf_summary_only.click(fn=lambda summary: save_to_pdf("", summary), inputs=[summary_output_graph, summary_output_lex, summary_output_text_rank], outputs=[pdf_output])  # Includes all summary outputs
     pdf_both.click(fn=lambda text, summary: save_to_pdf(text, summary), inputs=[text_output, summary_output_graph], outputs=[pdf_output])  # Defaulting to Graph-based summary
 
-    iface.launch(share=True, debug=True)
+iface.launch(share=True, debug=True)
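The substantive change in this commit: the old revision refused to start without CUDA (the deleted RuntimeError block), while the new one targets Hugging Face ZeroGPU, where `@spaces.GPU()` attaches a GPU only for the duration of a decorated call. A minimal sketch of that pattern, assuming a Space on ZeroGPU hardware; the `duration` argument and the `transcribe` wrapper are illustrative, not taken from this repo:

    import spaces
    import torch
    from transformers import pipeline

    # Loading at import time is fine on ZeroGPU: CUDA work done at startup
    # is replayed later, and a GPU is attached only while a decorated call runs.
    asr = pipeline("automatic-speech-recognition",
                   model="NbAiLab/nb-whisper-large",
                   chunk_length_s=30,
                   torch_dtype=torch.bfloat16,
                   device="cuda")

    @spaces.GPU(duration=120)  # reserve a GPU slot for up to 120 s per call
    def transcribe(audio_path):
        return asr(audio_path, batch_size=16)["text"]

Worth noting: `clean_text`, `preprocess_text`, and the other spaCy helpers never touch the GPU, so decorating them with `@spaces.GPU()` only spends ZeroGPU quota; the decorator pays for itself on the Whisper and T5 calls.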
 
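`convert_to_wav` is cut off by the hunk boundary above. Under the same `pydub` import, a complete version of the helper would look roughly like this; the 16 kHz mono export settings are an assumption (chosen because Whisper-family models resample to 16 kHz anyway):

    import os
    from pydub import AudioSegment

    def convert_to_wav(filepath):
        # Derive a .wav sibling path from whatever extension the upload has.
        _, file_ending = os.path.splitext(f'{filepath}')
        audio_file = filepath.replace(file_ending, ".wav")
        # Decode via pydub/ffmpeg, downmix to 16 kHz mono, re-encode as WAV.
        sound = AudioSegment.from_file(filepath)
        sound = sound.set_frame_rate(16000).set_channels(1)
        sound.export(audio_file, format="wav")
        return audio_file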
 
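`summarization_tokenizer` and `summarization_model` are referenced in `summarize_text` but defined outside the visible hunks. A plausible loading block, assuming a seq2seq checkpoint as the "Summarize w/T5 model" comment suggests; the checkpoint id below is a placeholder, not taken from this commit:

    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    # Placeholder checkpoint id; the real one is set elsewhere in app.py.
    CHECKPOINT = "your-org/t5-norwegian-summarization"
    summarization_tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
    summarization_model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)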
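All three summary tabs wired up in the UI hunks follow the same extractive recipe: build a sentence-similarity graph, rank sentences by centrality, and keep the top ones, falling back to `' '.join(sentences)` when the input is shorter than the requested summary. A self-contained sketch of that recipe using TF-IDF cosine similarity plus PageRank; it mirrors the approach, not the repo's exact code:

    import networkx as nx
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    def rank_sentences(sentences, num_sentences=3):
        if len(sentences) < num_sentences:
            return ' '.join(sentences)  # same guard as in the diff
        tfidf = TfidfVectorizer().fit_transform(sentences)
        sim = cosine_similarity(tfidf)    # pairwise sentence similarity
        graph = nx.from_numpy_array(sim)  # nodes = sentences, edges weighted by similarity
        scores = nx.pagerank(graph)       # centrality score per sentence
        top = sorted(range(len(sentences)), key=scores.get, reverse=True)[:num_sentences]
        return ' '.join(sentences[i] for i in sorted(top))  # keep original order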
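One wiring detail in the unchanged lines deserves a flag: `pdf_summary_only.click` declares three input components but binds a single-argument lambda, and Gradio passes one positional argument per declared input, so the callback will raise a TypeError when clicked. A signature that matches the declared inputs; how the three summaries should be combined is an open design choice, and the join below is just one option:

    pdf_summary_only.click(
        # One parameter per declared input component; merge non-empty summaries.
        fn=lambda graph, lex, text_rank: save_to_pdf(
            "", "\n\n".join(s for s in (graph, lex, text_rank) if s)),
        inputs=[summary_output_graph, summary_output_lex, summary_output_text_rank],
        outputs=[pdf_output],
    )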