camparchimedes committed on
Commit 7ef26c1 · verified · 1 Parent(s): 4012d3e

Update app.py

Files changed (1): app.py (+21 -14)

app.py CHANGED
@@ -22,6 +22,7 @@ import warnings
 from pydub import AudioSegment
 import torch
 import torchaudio
+import torchaudio.transforms as transforms
 from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
 from huggingface_hub import model_info
 import spacy
@@ -75,9 +76,15 @@ def transcribe_audio(audio_file, chunk_length_s=30):
 
     start_time = time.time()
 
-    # Load the audio waveform using torchaudio
+    # Load waveform with torchaudio
     waveform, sample_rate = torchaudio.load(audio_file)
 
+    # Resample to 16000 Hz if needed
+    if sample_rate != 16000:
+        resampler = transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+        waveform = resampler(waveform)
+        sample_rate = 16000
+
     # Calculate the number of chunks
     chunk_size = chunk_length_s * sample_rate
     num_chunks = waveform.shape[1] // chunk_size + int(waveform.shape[1] % chunk_size != 0)
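
Note: Whisper-family models expect 16 kHz input, which is what the added resampling block normalizes to before the chunk arithmetic. A minimal standalone sketch of the same steps (the file path is a placeholder; multichannel audio would additionally need downmixing):

import torchaudio
import torchaudio.transforms as transforms

waveform, sample_rate = torchaudio.load("input.wav")  # placeholder path
if sample_rate != 16000:
    waveform = transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    sample_rate = 16000

# Ceil-division: a trailing partial chunk still counts as a chunk
chunk_size = 30 * sample_rate
num_chunks = waveform.shape[1] // chunk_size + int(waveform.shape[1] % chunk_size != 0)
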
@@ -107,12 +114,12 @@ def transcribe_audio(audio_file, chunk_length_s=30):
         chunk_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
         full_text.append(chunk_text)
 
-    # Combine the transcribed text from all chunks
+    # Combine transcribed text from all chunks
     text = " ".join(full_text)
 
    output_time = time.time() - start_time
 
-    # Audio duration (in seconds)
+    # Audio duration (secs)
     audio_duration = waveform.shape[1] / sample_rate
 
     # Real-time Factor (RTF)
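
Note: the Real-time Factor is processing time divided by audio duration; values below 1.0 mean transcription runs faster than real time. With illustrative numbers:

output_time = 12.5        # seconds spent transcribing (example value)
audio_duration = 60.0     # seconds of audio (example value)
rtf = output_time / audio_duration
print(f"RTF: {rtf:.3f}")  # 0.208, i.e. roughly 5x faster than real time
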
@@ -132,26 +139,26 @@ def transcribe_audio(audio_file, chunk_length_s=30):
     return text, result
 
 
-# Clean and preprocess text for summarization
+# Clean and preprocess (for summarization)
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
     text = re.sub(r'[^\w\s]', '', text)
     text = re.sub(r'\s+', ' ', text).strip()
     return text
 
-nlp = spacy.blank("nb")  # 'nb' is code for Norwegian Bokmål
+nlp = spacy.blank("nb")  # 'nb' = Norwegian Bokmål
 spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS
 
 def preprocess_text(text):
     # Process the text with SpaCy
     doc = nlp(text)
-    # Use SpaCy's stop words directly
+    # spaCy's stop words, used directly
     stop_words = spacy_stop_words
     # Filter out stop words
     words = [token.text for token in doc if token.text.lower() not in stop_words]
     return ' '.join(words)
 
-# Summarize text using the T5 model
+# Summarize with the T5 model
 def summarize_text(text):
     preprocessed_text = preprocess_text(text)
     inputs = summarization_tokenizer(preprocessed_text, max_length=1024, return_tensors="pt", truncation=True)
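
Note: spacy.blank("nb") builds an empty Norwegian Bokmål pipeline with only a tokenizer, so the doc.sents iteration used by the summarizers below would fail unless sentence boundaries are set somewhere else in app.py. A sketch of the usual fix, adding the rule-based sentencizer:

import spacy

nlp = spacy.blank("nb")       # tokenizer only, no sentence boundaries
nlp.add_pipe("sentencizer")   # enables doc.sents

doc = nlp("Dette er første setning. Dette er andre setning.")
print([sent.text for sent in doc.sents])
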
@@ -159,7 +166,7 @@ def summarize_text(text):
     summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
     return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
-# Build similarity matrix for graph-based summary
+# Builds the similarity matrix
 def build_similarity_matrix(sentences, stop_words):
     similarity_matrix = nx.Graph()
     for i, tokens_a in enumerate(sentences):
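
Note: the summarizer decodes with beam search (num_beams=5) capped at 150 tokens, stopping beams early once they finish. A self-contained sketch of the same generate call; "t5-small" is a stand-in checkpoint, since app.py defines its actual summarization model elsewhere:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tok = AutoTokenizer.from_pretrained("t5-small")             # stand-in checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

inputs = tok("summarize: en lang norsk tekst ...", max_length=1024, return_tensors="pt", truncation=True)
summary_ids = model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
print(tok.decode(summary_ids[0], skip_special_tokens=True))
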
@@ -169,7 +176,7 @@ def build_similarity_matrix(sentences, stop_words):
             similarity_matrix.add_edge(i, j, weight=len(common_words))
     return similarity_matrix
 
-# Graph-based summarization
+# Graph-based summary
 def graph_based_summary(text, num_paragraphs=3):
     doc = nlp(text)
     sentences = [sent.text for sent in doc.sents]
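
Note: graph_based_summary scores sentences with PageRank over the graph built above, where edge weights count words shared between sentence pairs. A toy sketch of that pipeline (the token sets stand in for the tokenized sentences the function expects):

import networkx as nx

sentences = [
    {"oslo", "er", "hovedstaden"},          # toy token sets
    {"oslo", "har", "mange", "museer"},
    {"fjellene", "er", "høye"},
]

graph = nx.Graph()
for i, tokens_a in enumerate(sentences):
    for j, tokens_b in enumerate(sentences):
        if i < j and tokens_a & tokens_b:
            graph.add_edge(i, j, weight=len(tokens_a & tokens_b))

scores = nx.pagerank(graph)
print(sorted(scores, key=scores.get, reverse=True))  # sentence indices, most central first
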
@@ -185,7 +192,7 @@ def graph_based_summary(text, num_paragraphs=3):
     ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
     return ' '.join([sent for _, sent in ranked_sentences[:num_paragraphs]])
 
-# LexRank summarization
+# LexRank
 def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
     doc = nlp(text)
     sentences = [sent.text for sent in doc.sents]
@@ -197,14 +204,14 @@ def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
     X = vectorizer.fit_transform(sentences)
     similarity_matrix = cosine_similarity(X, X)
 
-    # Apply threshold to the similarity matrix
+    # Apply threshold to similarity matrix
     similarity_matrix[similarity_matrix < threshold] = 0
     nx_graph = nx.from_numpy_array(similarity_matrix)
     scores = nx.pagerank(nx_graph)
     ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
     return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])
 
-# TextRank summarization
+# TextRank
 def text_rank_summary(text, num_paragraphs=3):
     doc = nlp(text)
     sentences = [sent.text for sent in doc.sents]
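
Note: the LexRank variant builds a TF-IDF cosine-similarity matrix, zeroes entries below the threshold, and ranks sentences with PageRank. A compact sketch with toy sentences:

import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

sentences = [
    "Oslo er hovedstaden i Norge.",
    "Hovedstaden Oslo ligger ved Oslofjorden.",
    "Brunost er en norsk spesialitet.",
]

X = TfidfVectorizer().fit_transform(sentences)
sim = cosine_similarity(X, X)
sim[sim < 0.1] = 0                           # same threshold as the app's default

scores = nx.pagerank(nx.from_numpy_array(sim))
best = max(range(len(sentences)), key=scores.get)
print(sentences[best])
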
@@ -222,7 +229,7 @@ def text_rank_summary(text, num_paragraphs=3):
     return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])
 
 
-# Save text and summary to PDF
+# Save text + summary to PDF
 def save_to_pdf(text, summary):
     pdf = FPDF()
     pdf.add_page()
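
Note: classic FPDF uses Latin-1 core fonts, so Norwegian characters (æ, ø, å) in the transcript can break cell rendering unless a Unicode TTF is registered first. A sketch assuming the fpdf2 package and a font file shipped with the Space (the font path is a placeholder):

from fpdf import FPDF  # fpdf2

pdf = FPDF()
pdf.add_page()
pdf.add_font("DejaVu", fname="DejaVuSans.ttf")   # placeholder font path
pdf.set_font("DejaVu", size=12)
pdf.multi_cell(0, 10, "Oppsummering: æ, ø og å vises riktig.")
pdf.output("transcription.pdf")
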
@@ -243,7 +250,7 @@ def save_to_pdf(text, summary):
 iface = gr.Blocks()
 
 with iface:
-    gr.Image("https://huggingface.co/spaces/camparchimedes/transcription_app/blob/main/banner_trans.png")
+    gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/blob/main/banner_trans.png" alt="Banner Image" />')
     gr.Markdown("*Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download*")
 
     with gr.Tabs():
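
Note: swapping gr.Image for gr.HTML sidesteps Gradio's image component, but the src still points at the /blob/ URL, which serves the Hub's HTML file viewer rather than the raw PNG; raw files are served under /resolve/, so the tag likely needs:

gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/resolve/main/banner_trans.png" alt="Banner Image" />')
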
 