Spaces: Update app.py
Build status: Build error

app.py CHANGED
@@ -1,5 +1,5 @@
 # app.py
-# Version: 1.
+# Version: 1.07 (08.24.24), ALPHA
 #---------------------------------------------------------------------------------------------------------------------------------------------
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@ import torch
 import torchaudio
 import torchaudio.transforms as transforms

-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+from transformers import pipeline

 import spacy
 import networkx as nx
@@ -39,100 +39,41 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import gradio as gr
 from fpdf import FPDF
 from PIL import Image
-# from huggingface_hub import model_info

 warnings.filterwarnings("ignore")

-""""
-# Convert m4a audio to wav format
 def convert_to_wav(audio_file):
     audio = AudioSegment.from_file(audio_file, format="m4a")
     wav_file = "temp.wav"
     audio.export(wav_file, format="wav")
     return wav_file
-"""
-
-#---------------------------------------------------------------------------------------------------------------------------------------------
-
-processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
-model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #model.cuda()
-model.to(device)

+#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large-semantic")

-generate_kwargs = {
+kwargs = {
     "num_beams": 5,
     "language": "no",
-    "task": "transcribe",
-    "forced_decoder_ids": None # ALT. generation_config.forced_decoder_ids = None
 }

-
-def transcribe_audio(audio_file):
-
-
+# funct.@ASR,
+def transcribe_audio(audio_file):
+    if audio_file.endswith(".m4a"):
+        audio_file = convert_to_wav(audio_file)

     start_time = time.time()

-
-    waveform, sample_rate = torchaudio.load(audio_file)
-
-    # Convert to mono
-    if waveform.shape[0] > 1:
-        waveform = torch.mean(waveform, dim=0, keepdim=True)
-
-    if sample_rate != 16000:
-        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
-        waveform = resampler(waveform)
-        sample_rate = 16000

-
-
-    num_chunks = waveform.shape[1] // chunk_size + int(waveform.shape[1] % chunk_size != 0)
+    outputs = asr(audio_file, forced_decoder_ids=None, task="transcribe", batch_size=16, return_timestamps=False, **kwargs) # chunk_length_s=30,
+    text = outputs["text"]

-    full_text = []
+    end_time = time.time()
+    output_time = end_time - start_time
+    word_count = len(text.split())

-    for i in range(num_chunks):
-        start = i * chunk_size
-        end = min((i + 1) * chunk_size, waveform.shape[1])
-        chunk_waveform = waveform[:, start:end]
+    result = f"Transcription: {text.strip()}\n\nTime taken: {output_time:.2f} seconds\nNumber of words: {word_count}"

-
-
-        #---------------------------------------------------------------------------------------------------------------------------------------------
-        # make sure to NOT truncate the input audio, to return the `attention_mask` and to pad to the longest audio
-        inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True)
-        inputs = inputs.to(device)
-        input_features = inputs.input_features # alt. input_features = inputs['input_features']
-        attention_mask = inputs.attention_mask # inputs['attention_mask']
-        # transcribe audio to ids
-        generated_ids = model.generate(inputs=input_features, attention_mask=attention_mask, **generate_kwargs) # Pass the attention mask
-
-        # transcription
-        chunk_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        #---------------------------------------------------------------------------------------------------------------------------------------------
-        full_text.append(chunk_text)
-    text = " ".join(full_text)
-
-    output_time = time.time() - start_time
-
-    # audio duration (in seconds)
-    audio_duration = waveform.shape[1] / sample_rate
-    # Real-time Factor (RTF)
-    rtf = output_time / audio_duration
-
-    # Format of the result
-    result = (
-        f"Time taken: {output_time:.2f} seconds\n"
-        f"Audio duration: {audio_duration / 60:.2f} minutes ({audio_duration:.2f} seconds)\n"
-        f"Real-time Factor (RTF): {rtf:.2f}\n"
-        f"Number of words: {len(text.split())}\n\n"
-        "Real-time Factor (RTF) is a measure used to evaluate the speed of speech recognition systems. "
-        "It is the ratio of transcription time to the duration of the audio.\n\n"
-        "An RTF of less than 1 means the transcription process is faster than real-time (expected)."
-    )
-
-    return text, result
-#---------------------------------------------------------------------------------------------------------------------------------------------
+    return text.strip(), result
+#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

 # Clean and preprocess text
 def clean_text(text):
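Note on the hunk above: the commit swaps the manual long-form loop (torchaudio load, mono downmix, 16 kHz resample, fixed-size chunks fed through processor/model.generate) for a single transformers pipeline call, and leaves chunk_length_s=30 commented out. A minimal sketch of how that pipeline handles long audio, assuming the model id from the diff; the device index, file name, and parameter values here are illustrative, not the Space's exact code:

    # Sketch: long-form ASR with the transformers pipeline (values illustrative).
    import torch
    from transformers import pipeline

    device = 0 if torch.cuda.is_available() else -1   # pipeline expects a device index
    asr = pipeline(
        "automatic-speech-recognition",
        model="NbAiLabBeta/nb-whisper-large-semantic",
        device=device,
    )

    outputs = asr(
        "audio.wav",            # hypothetical input file (ffmpeg handles decoding)
        chunk_length_s=30,      # split audio longer than 30 s into windows internally
        batch_size=16,          # decode several windows per forward pass
        return_timestamps=False,
        generate_kwargs={"num_beams": 5, "language": "no", "task": "transcribe"},
    )
    print(outputs["text"])

Routing the beam/language options through generate_kwargs, rather than spreading them into the call as the committed code does with **kwargs, tends to be more robust across transformers versions.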
@@ -244,12 +185,12 @@ def save_to_pdf(text, summary):

 iface = gr.Blocks()

-
-
-
-
-
-
+
+title = """# Velkommen til 🌟>Switch Work | Verktæysett no.1✨
+En webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download</p>
+
+Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [](https://discord.gg/GWpVpekp) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer)
+Math 🔍 [introspector](https://huggingface.co/introspector) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [SciTonic](https://github.com/Tonic-AI/scitonic)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
 """

 with iface:
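The hunk adds the title markdown string, but the line that renders it sits in the unchanged region and is not shown. Presumably it is displayed with gr.Markdown near the top of the Blocks context; a sketch of that pattern, with the title shortened and the placement assumed:

    # Sketch: rendering a markdown title string in gr.Blocks (placement assumed).
    import gradio as gr

    title = "# Velkommen til Switch Work"  # shortened stand-in for the diff's string

    iface = gr.Blocks()
    with iface:
        gr.Markdown(title)  # renders the markdown heading at the top of the page

    iface.launch()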
@@ -259,8 +200,8 @@ with iface:
     with gr.Tabs():
         with gr.TabItem("Transcription"):
             audio_input = gr.Audio(type="filepath")
-            text_output = gr.Textbox(label="
-            result_output = gr.Textbox(label="
+            text_output = gr.Textbox(label="Transcription")
+            result_output = gr.Textbox(label="Details")
             transcribe_button = gr.Button("Transcribe")

             transcribe_button.click(fn=transcribe_audio, inputs=[audio_input], outputs=[text_output, result_output])
@@ -298,3 +239,4 @@ iface.launch(share=True, debug=True)



+
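One thing the rewrite drops is the Real-time Factor report: the removed block defined RTF as transcription time divided by audio duration, with RTF < 1 meaning faster than real time. If that metric is wanted back on top of the new pipeline, a sketch follows; the function name and chunk length are illustrative, and asr is assumed to be the pipeline object from the diff:

    # Sketch: recomputing the removed RTF metric around the new pipeline call.
    import time
    import torchaudio

    def transcribe_with_rtf(asr, audio_file):
        info = torchaudio.info(audio_file)
        audio_duration = info.num_frames / info.sample_rate   # seconds of audio

        start = time.time()
        text = asr(audio_file, chunk_length_s=30)["text"]     # chunk length illustrative
        elapsed = time.time() - start

        rtf = elapsed / audio_duration   # < 1.0: transcribes faster than real time
        return text, rtf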