Spaces:

camparchimedes
/

nb

Build error

App Files Files

camparchimedes committed on Aug 22, 2024

Commit

04be9ad

verified ·

1 Parent(s): 6b25b5a

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -22

app.py CHANGED Viewed

@@ -37,6 +37,14 @@ import gradio as gr
 from fpdf import FPDF
 from PIL import Image
 # Suppress warnings
 warnings.filterwarnings("ignore")
@@ -104,34 +112,25 @@ def transcribe_audio(audio_file, chunk_length_s=30):
         if chunk_waveform.shape[0] > 1:
             chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
-        # Process chunk with tokenizer
-        inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
-        input_features = inputs.input_features
-        # Create attention mask
-        attention_mask = torch.ones(inputs.input_features.shape[:2], dtype=torch.long, device=device)
-        # -- does not output input_ids (i.e, processor)
-        # input_ids = inputs['input_ids']
-        # attention_mask[input_ids == processor.tokenizer.pad_token_id] = 0
-        # Set the attention mask to zero for padding tokens
-        attention_mask[inputs.input_features.squeeze(0) == processor.tokenizer.pad_token_id] = 0
         # ASR model inference on the chunk
         with torch.no_grad():
             generated_ids = model.generate(
-                input_features=input_features.to(device),
-                attention_mask=attention_mask.to(device),
                 **generate_kwargs
-            )
-            # new processor object with desired configuration
-            #new_processor = processor.add_special_tokens({'eos_token': '[EOS]'})
-            chunk_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-            full_text.append(chunk_text)
     # Combine the transcribed text from all chunks
     text = " ".join(full_text)
@@ -269,8 +268,8 @@ def save_to_pdf(text, summary):
 iface = gr.Blocks()
 with iface:
-    gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt="" width="100%" height="auto"/>')
-    gr.Markdown("**Switch Work webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download**")
     with gr.Tabs():
         with gr.TabItem("Transcription"):

 from fpdf import FPDF
 from PIL import Image
+PLACEHOLDER = """
+<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
+   <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt="" style="width: 100%; height: auto; opacity: 0.93;  ">
+   <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Switch Work | Verktøysett no.1</h1>
+   <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">En webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download</p>
+</div>
+"""
 # Suppress warnings
 warnings.filterwarnings("ignore")
         if chunk_waveform.shape[0] > 1:
             chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
+        # Tokenize the input batch with the processor
+        inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt", task="transcribe")
+        # Use the attention mask directly from the tokenizer output
+        attention_mask = inputs.attention_mask.to(device)
         # ASR model inference on the chunk
         with torch.no_grad():
             generated_ids = model.generate(
+                input_features=inputs.input_features.to(device),
+                attention_mask=attention_mask,
                 **generate_kwargs
+        )
+    # Decode the generated IDs to text
+    chunk_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    full_text.append(chunk_text)
     # Combine the transcribed text from all chunks
     text = " ".join(full_text)
 iface = gr.Blocks()
 with iface:
+    #gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt="" style="width: 100%; height: auto; opacity: 0.55; >')
+    #gr.Markdown("**Switch Work webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download**")
     with gr.Tabs():
         with gr.TabItem("Transcription"):