camparchimedes committed on
Commit
04be9ad
·
verified ·
1 Parent(s): 6b25b5a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -22
app.py CHANGED
@@ -37,6 +37,14 @@ import gradio as gr
37
  from fpdf import FPDF
38
  from PIL import Image
39
 
 
 
 
 
 
 
 
 
40
  # Suppress warnings
41
  warnings.filterwarnings("ignore")
42
 
@@ -104,34 +112,25 @@ def transcribe_audio(audio_file, chunk_length_s=30):
104
  if chunk_waveform.shape[0] > 1:
105
  chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
106
 
107
- # Process chunk with tokenizer
108
- inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
109
- input_features = inputs.input_features
110
-
111
- # Create attention mask
112
- attention_mask = torch.ones(inputs.input_features.shape[:2], dtype=torch.long, device=device)
113
-
114
- # -- does not output input_ids (i.e, processor)
115
- # input_ids = inputs['input_ids']
116
- # attention_mask[input_ids == processor.tokenizer.pad_token_id] = 0
117
 
 
 
118
 
119
- # Set the attention mask to zero for padding tokens
120
- attention_mask[inputs.input_features.squeeze(0) == processor.tokenizer.pad_token_id] = 0
121
-
122
 
123
  # ASR model inference on the chunk
124
  with torch.no_grad():
125
  generated_ids = model.generate(
126
- input_features=input_features.to(device),
127
- attention_mask=attention_mask.to(device),
128
  **generate_kwargs
129
- )
 
 
 
 
130
 
131
- # new processor object with desired configuration
132
- #new_processor = processor.add_special_tokens({'eos_token': '[EOS]'})
133
- chunk_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
134
- full_text.append(chunk_text)
135
 
136
  # Combine the transcribed text from all chunks
137
  text = " ".join(full_text)
@@ -269,8 +268,8 @@ def save_to_pdf(text, summary):
269
  iface = gr.Blocks()
270
 
271
  with iface:
272
- gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt="" width="100%" height="auto"/>')
273
- gr.Markdown("**Switch Work webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download**")
274
 
275
  with gr.Tabs():
276
  with gr.TabItem("Transcription"):
 
37
  from fpdf import FPDF
38
  from PIL import Image
39
 
40
+
41
+ PLACEHOLDER = """
42
+ <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
43
+ <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt="" style="width: 100%; height: auto; opacity: 0.93; ">
44
+ <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Switch Work | Verktøysett no.1</h1>
45
+ <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">En webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download</p>
46
+ </div>
47
+ """
48
  # Suppress warnings
49
  warnings.filterwarnings("ignore")
50
 
 
112
  if chunk_waveform.shape[0] > 1:
113
  chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
114
 
 
 
 
 
 
 
 
 
 
 
115
 
116
+ # Tokenize the input batch with the processor
117
+ inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt", task="transcribe")
118
 
119
+ # Use the attention mask directly from the tokenizer output
120
+ attention_mask = inputs.attention_mask.to(device)
 
121
 
122
  # ASR model inference on the chunk
123
  with torch.no_grad():
124
  generated_ids = model.generate(
125
+ input_features=inputs.input_features.to(device),
126
+ attention_mask=attention_mask,
127
  **generate_kwargs
128
+ )
129
+
130
+ # Decode the generated IDs to text
131
+ chunk_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
132
+ full_text.append(chunk_text)
133
 
 
 
 
 
134
 
135
  # Combine the transcribed text from all chunks
136
  text = " ".join(full_text)
 
268
  iface = gr.Blocks()
269
 
270
  with iface:
271
+ #gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt="" style="width: 100%; height: auto; opacity: 0.55;">')
272
+ #gr.Markdown("**Switch Work webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download**")
273
 
274
  with gr.Tabs():
275
  with gr.TabItem("Transcription"):