camparchimedes committed on
Commit ef7a878 · verified · 1 Parent(s): 799fb4c

Update app.py

Files changed (1)
  1. app.py +14 -163
app.py CHANGED
@@ -1,8 +1,7 @@
- """
- Version: 5th_pruned_optimized_transcription_app.py (alias HF_modded_nb-whisper_T4)
 
- Description: webapp, transkribering (norsk), NbAiLab/nb-whisper-large, oppsummering, pdf-download.
- """
+ # app.py
+ # Version: 1.06 (08.24.24)
+
 
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
@@ -16,6 +15,7 @@ Description: webapp, transkribering (norsk), NbAiLab/nb-whisper-large, oppsummer
  # See the License for the specific language governing permissions and
  # limitations under the License.
 
+
  import time
  import os
  import re
@@ -44,80 +44,8 @@ from fpdf import FPDF
  from PIL import Image
  # from huggingface_hub import model_info
 
- #############################################################################################################################################3
  # Suppress warnings
  warnings.filterwarnings("ignore")
- """
- def generate(
- self,
- input_features: Optional[torch.Tensor] = None, # <====================== ACTIVE
- generation_config: Optional[GenerationConfig] = None, # <====================== could be ACTIVE(ed.)*
- logits_processor: Optional[LogitsProcessorList] = None,
- stopping_criteria: Optional[StoppingCriteriaList] = None,
- prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
- synced_gpus: bool = False,
- return_timestamps: Optional[bool] = None,
- task: Optional[str] = None,
- language: Optional[Union[str, List[str]]] = None, # <====================== ACTIVE
- is_multilingual: Optional[bool] = None,
- prompt_ids: Optional[torch.Tensor] = None,
- prompt_condition_type: Optional[str] = None, # first-segment, all-segments
- condition_on_prev_tokens: Optional[bool] = None,
- temperature: Optional[Union[float, Tuple[float, ...]]] = None,
- compression_ratio_threshold: Optional[float] = None,
- logprob_threshold: Optional[float] = None,
- no_speech_threshold: Optional[float] = None,
- num_segment_frames: Optional[int] = None,
- attention_mask: Optional[torch.Tensor] = None, # <====================== NOT ACTIVE by DEFAULT
- time_precision: float = 0.02,
- return_token_timestamps: Optional[bool] = None,
- return_segments: bool = False,
- return_dict_in_generate: Optional[bool] = None,
- **kwargs, # <====================== ACTIVE
- ):
- *generation_config (`~generation.GenerationConfig`, *optional*):
- The generation configuration to be used as base parametrization for the generation call. `**kwargs`
- passed to generate matching the attributes of `generation_config` will override them. If
- `generation_config` is not provided, the default will be used, which had the following loading
- priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
- configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
- default values, whose documentation should be checked to parameterize generation.
-
- from v4.39 the forced decoder ids are always None in favour of decoder input ids
- generation_config.forced_decoder_ids = None
-
- Example:
-
- - *Longform transcription*: To transcribe or translate audios longer than 30 seconds, process the audio files without truncation and pass all mel features at once to generate.
-
- ```python
- >>> import torch
- >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
- >>> from datasets import load_dataset, Audio
-
- >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
- >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
- >>> model.cuda() # doctest: +IGNORE_RESULT
-
- >>> # load audios > 30 seconds
- >>> ds = load_dataset("distil-whisper/meanwhile", "default")["test"]
- >>> # resample to 16kHz
- >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000))
- >>> # take first 8 audios and retrieve array
- >>> audio = ds[:8]["audio"]
- >>> audio = [x["array"] for x in audio]
-
- >>> # make sure to NOT truncate the input audio, to return the `attention_mask` and to pad to the longest audio
- >>> inputs = processor(audio, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True, sampling_rate=16_000)
- >>> inputs = inputs.to("cuda", torch.float32)
-
- >>> # transcribe audio to ids
- >>> generated_ids = model.generate(**inputs)
-
- >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
- >>> transcription[0]
- " Folks, if you watch the show, you know, I spent a lot of time (..)"
- """
 
  # Convert m4a audio to wav format
  def convert_to_wav(audio_file):
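Only the tail of `convert_to_wav` appears in the next hunk (the `temp.wav` export and the return). A minimal sketch of a typical pydub-based implementation follows; the `AudioSegment.from_file` load step is an assumption, since that line sits outside the diff:

```python
from pydub import AudioSegment  # pydub needs ffmpeg available on the system path

def convert_to_wav(audio_file: str) -> str:
    """Convert an .m4a upload to a temporary WAV file and return its path."""
    audio = AudioSegment.from_file(audio_file, format="m4a")  # assumed load step, not shown in the diff
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file
```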
@@ -125,36 +53,11 @@ def convert_to_wav(audio_file):
  wav_file = "temp.wav"
  audio.export(wav_file, format="wav")
  return wav_file
- #############################################################################################################################################3
- #
- #
- #
- #
  #---------------------------------------------------------------------------------------------------------------------------------------------
  processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
  model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
  model.cuda() # device = 0 if torch.cuda.is_available() else "cpu"
-
- """
- # 0. deprecate old inputs
- if "inputs" in kwargs:
- input_features = kwargs.pop("inputs")
- warnings.warn(
- "The input name `inputs` is deprecated. Please make sure to use `input_features` instead.",
- FutureWarning,
- )
-
- # 1. prepare generation config
- generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs)
-
- # 2. set global generate variables
- #input_stride = self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0]
- #num_segment_frames = input_stride * self.config.max_source_positions
- #batch_size, total_input_frames = self._retrieve_total_input_frames(
- input_features=input_features, kwargs=kwargs #input_stride=input_stride,
- )
- """
-
+ #---------------------------------------------------------------------------------------------------------------------------------------------
  generate_kwargs = {
  "num_beams": 5,
  "language": "no",
@@ -163,20 +66,14 @@ generate_kwargs = {
  }
 
  def transcribe_audio(audio_file, chunk_length_s=30):
- #---------------------------------------------------------------------------------------------------------------------------------------------
- #
- #
- #
- #
- #############################################################################################################################################3
  if audio_file.endswith(".m4a"):
  audio_file = convert_to_wav(audio_file)
 
  start_time = time.time()
- # Load waveform using torchaudio
+ # Load waveform w/ torchaudio
  waveform, sample_rate = torchaudio.load(audio_file)
 
- # Convert to mono if the audio has more than one channel
+ # Convert to mono
  if waveform.shape[0] > 1:
  waveform = torch.mean(waveform, dim=0, keepdim=True)
 
@@ -185,11 +82,11 @@ def transcribe_audio(audio_file, chunk_length_s=30):
  waveform = resampler(waveform)
  sample_rate = 16000
 
- # Calculate the number of chunks
+ # Calculate number of chunks
  chunk_size = chunk_length_s * sample_rate
  num_chunks = waveform.shape[1] // chunk_size + int(waveform.shape[1] % chunk_size != 0)
 
- # Initialize empty list@store transcribed text from ea.chunk
+ # Initialize empty list; stores transcribed text from ea.chunk
  full_text = []
 
  for i in range(num_chunks):
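The chunk bookkeeping above is plain integer arithmetic: `chunk_size` is samples per chunk, and the `int(... % ... != 0)` term rounds up so a trailing partial chunk is not dropped. A worked example with assumed figures:

```python
# Assumed figures: 16 kHz audio, 30 s chunks, a 95-second recording.
sample_rate = 16000
chunk_length_s = 30
num_samples = 95 * sample_rate                 # 1,520,000 samples

chunk_size = chunk_length_s * sample_rate      # 480,000 samples per chunk
num_chunks = num_samples // chunk_size + int(num_samples % chunk_size != 0)
print(chunk_size, num_chunks)                  # 480000 4 -> three full 30 s chunks + one 5 s remainder
```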
@@ -197,16 +94,10 @@ def transcribe_audio(audio_file, chunk_length_s=30):
  end = min((i + 1) * chunk_size, waveform.shape[1])
  chunk_waveform = waveform[:, start:end]
 
- # Check chunk waveform is properly shaped
+ # Check chunk waveform properly shaped
  if chunk_waveform.shape[0] > 1:
  chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
- #############################################################################################################################################3
- #
- #
- #
- #
  #---------------------------------------------------------------------------------------------------------------------------------------------
-
  # make sure to NOT truncate the input audio, to return the `attention_mask` and to pad to the longest audio
  inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True)
  inputs = inputs.to("cuda", torch.float32)
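The `model.generate(...)` call that consumes these prepared inputs falls between this hunk and the next and is not shown in the diff. Below is a self-contained sketch of how one chunk would typically be transcribed with this processor/model pair; the `generate_kwargs` entries beyond `num_beams` and `language` are elided in the diff, and (per the Whisper docstring quoted in the removed block above) kwargs that match `GenerationConfig` attributes override the checkpoint defaults for that call:

```python
import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim").cuda()

# Only these two keys are visible in the diff; any further entries are elided there.
generate_kwargs = {"num_beams": 5, "language": "no"}

# Hypothetical pre-cut mono chunk (<= 30 s, 16 kHz), e.g. produced by the slicing loop above.
chunk_waveform, sample_rate = torchaudio.load("chunk.wav")

inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate,
                   return_tensors="pt", truncation=False, padding="longest",
                   return_attention_mask=True)
inputs = inputs.to("cuda", torch.float32)

# The generation step itself is not part of this diff; a typical call passes the log-mel
# features, the attention mask, and the decoding options from generate_kwargs.
generated_ids = model.generate(inputs.input_features,
                               attention_mask=inputs.attention_mask,
                               **generate_kwargs)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```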
@@ -218,18 +109,12 @@ def transcribe_audio(audio_file, chunk_length_s=30):
  # transcription
  chunk_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
  #---------------------------------------------------------------------------------------------------------------------------------------------
- #
- #
- #
- #
- #############################################################################################################################################3
  full_text.append(chunk_text)
- # Combine the transcribed text from all chunks
  text = " ".join(full_text)
 
  output_time = time.time() - start_time
 
- # Audio duration (in seconds)
+ # (in seconds)
  audio_duration = waveform.shape[1] / sample_rate
  # Real-time Factor (RTF)
  rtf = output_time / audio_duration
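The real-time factor computed at the end of this hunk is just the ratio described by the result strings in the next hunk; a worked example with assumed numbers:

```python
# Assumed figures for illustration only.
output_time = 48.0      # seconds spent transcribing
audio_duration = 240.0  # seconds of audio (4 minutes)

rtf = output_time / audio_duration
print(f"RTF: {rtf:.2f}")  # RTF: 0.20 -> transcription ran five times faster than real time
```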
@@ -244,45 +129,11 @@ def transcribe_audio(audio_file, chunk_length_s=30):
  "It is the ratio of transcription time to the duration of the audio.\n\n"
  "An RTF of less than 1 means the transcription process is faster than real-time (expected)."
  )
- #############################################################################################################################################3
- #
- #
- #
- #
- #---------------------------------------------------------------------------------------------------------------------------------------------
 
  return text, result
  #---------------------------------------------------------------------------------------------------------------------------------------------
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
- # Clean and preprocess/@summarization
+
+ # Clean and preprocess text
  def clean_text(text):
  text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
  text = re.sub(r'[^\w\s]', '', text)
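The two `re.sub` calls visible at the end of this hunk strip URLs and punctuation before summarization; the rest of `clean_text` lies outside the diff. A small demonstration of their effect, with the `return` added only for the demo:

```python
import re

def clean_text(text):
    # Same two substitutions as above: drop URLs (to end of line), then drop punctuation.
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

print(clean_text("Se referatet på https://example.org\nMøtet starter kl. 09:00!"))
# -> "Se referatet på Møtet starter kl 0900"
```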
@@ -308,7 +159,7 @@ def summarize_text(text):
  inputs = inputs.to(device)
  summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
  return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
- requires updating the pre-trained model weights to match
+
  # Builds similarity matrix
  def build_similarity_matrix(sentences, stop_words):
  similarity_matrix = nx.Graph()
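`build_similarity_matrix` starts a sentence graph with `nx.Graph()`; how its edges are weighted is outside this diff. A hedged sketch of the usual TextRank-style construction follows — the `cosine_distance` helper and the bag-of-words weighting are assumptions, not necessarily what app.py does:

```python
import networkx as nx
from nltk.cluster.util import cosine_distance  # assumed helper, common in TextRank-style summarizers

def sentence_similarity(s1, s2, stop_words):
    # Bag-of-words cosine similarity between two tokenized sentences.
    w1 = [w.lower() for w in s1 if w.lower() not in stop_words]
    w2 = [w.lower() for w in s2 if w.lower() not in stop_words]
    vocab = list(set(w1 + w2))
    v1 = [w1.count(w) for w in vocab]
    v2 = [w2.count(w) for w in vocab]
    return 1 - cosine_distance(v1, v2)

def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = nx.Graph()
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            similarity_matrix.add_edge(i, j, weight=sentence_similarity(sentences[i], sentences[j], stop_words))
    return similarity_matrix

# Sentence scores for extractive summarization would then come from:
# nx.pagerank(similarity_matrix, weight="weight")
```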
 
 
 