camparchimedes committed on
Commit 32f88c0 · verified · 1 Parent(s): 6c5695c

Update app.py

Files changed (1)
  1. app.py +26 -84
app.py CHANGED
@@ -1,5 +1,5 @@
 # app.py
-# Version: 1.06 (08.24.24)
+# Version: 1.07 (08.24.24), ALPHA
 #---------------------------------------------------------------------------------------------------------------------------------------------
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@ import torch
 import torchaudio
 import torchaudio.transforms as transforms

-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+from transformers import pipeline

 import spacy
 import networkx as nx
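The import swap above is the heart of this commit: the explicit `AutoProcessor`/`AutoModelForSpeechSeq2Seq` pair gives way to the `pipeline` factory. A minimal sketch contrasting the two loading styles; the model IDs are taken from the diff, while the pipeline's device handling is an assumption, since the new code leaves it unset:

```python
import torch
from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq

# Old path (removed): explicit processor + model, manual device placement.
processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
model.to("cuda" if torch.cuda.is_available() else "cpu")

# New path (added): one pipeline object that bundles feature extraction,
# generation, and decoding behind a single callable.
asr = pipeline(
    "automatic-speech-recognition",
    "NbAiLabBeta/nb-whisper-large-semantic",
    device=0 if torch.cuda.is_available() else -1,  # assumption: the diff keeps the default
)
```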
@@ -39,100 +39,41 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import gradio as gr
 from fpdf import FPDF
 from PIL import Image
-# from huggingface_hub import model_info

 warnings.filterwarnings("ignore")

-""""
-# Convert m4a audio to wav format
 def convert_to_wav(audio_file):
     audio = AudioSegment.from_file(audio_file, format="m4a")
     wav_file = "temp.wav"
     audio.export(wav_file, format="wav")
     return wav_file
-"""
-
-#---------------------------------------------------------------------------------------------------------------------------------------------
-
-processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
-model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # alt. model.cuda()
-model.to(device)

-generate_kwargs = {
+#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large-semantic")
+
+kwargs = {
     "num_beams": 5,
     "language": "no",
-    "task": "transcribe",
-    "forced_decoder_ids": None  # ALT. generation_config.forced_decoder_ids = None
 }

-def transcribe_audio(audio_file, chunk_length_s=30):
-    # if audio_file.endswith(".m4a"):
-    #     audio_file = convert_to_wav(audio_file)
+# funct.@ASR,
+def transcribe_audio(audio_file):
+    if audio_file.endswith(".m4a"):
+        audio_file = convert_to_wav(audio_file)

     start_time = time.time()

-    waveform, sample_rate = torchaudio.load(audio_file)
-
-    # Convert to mono
-    if waveform.shape[0] > 1:
-        waveform = torch.mean(waveform, dim=0, keepdim=True)
-
-    if sample_rate != 16000:
-        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
-        waveform = resampler(waveform)
-        sample_rate = 16000
-
-    # Calculate number of chunks
-    chunk_size = chunk_length_s * sample_rate
-    num_chunks = waveform.shape[1] // chunk_size + int(waveform.shape[1] % chunk_size != 0)
-
-    full_text = []  # stores the transcribed chunk texts
-
-    for i in range(num_chunks):
-        start = i * chunk_size
-        end = min((i + 1) * chunk_size, waveform.shape[1])
-        chunk_waveform = waveform[:, start:end]
-
-        if chunk_waveform.shape[0] > 1:
-            chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
-        # do NOT truncate the input audio, return the `attention_mask`, and pad to the longest audio
-        inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True)
-        inputs = inputs.to(device)
-        input_features = inputs.input_features  # alt. inputs['input_features']
-        attention_mask = inputs.attention_mask  # alt. inputs['attention_mask']
-        # transcribe audio to token ids, passing the attention mask
-        generated_ids = model.generate(inputs=input_features, attention_mask=attention_mask, **generate_kwargs)
-        # decode the ids back to text
-        chunk_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        full_text.append(chunk_text)
-    text = " ".join(full_text)
-
-    output_time = time.time() - start_time
-
-    # audio duration (in seconds)
-    audio_duration = waveform.shape[1] / sample_rate
-    # Real-time Factor (RTF)
-    rtf = output_time / audio_duration
-
-    # Format the result
-    result = (
-        f"Time taken: {output_time:.2f} seconds\n"
-        f"Audio duration: {audio_duration / 60:.2f} minutes ({audio_duration:.2f} seconds)\n"
-        f"Real-time Factor (RTF): {rtf:.2f}\n"
-        f"Number of words: {len(text.split())}\n\n"
-        "Real-time Factor (RTF) is a measure used to evaluate the speed of speech recognition systems. "
-        "It is the ratio of transcription time to the duration of the audio.\n\n"
-        "An RTF of less than 1 means the transcription process is faster than real-time (expected)."
-    )
-
-    return text, result
-#---------------------------------------------------------------------------------------------------------------------------------------------
+    outputs = asr(audio_file, forced_decoder_ids=None, task="transcribe", batch_size=16, return_timestamps=False, **kwargs)  # chunk_length_s=30,
+    text = outputs["text"]
+
+    end_time = time.time()
+    output_time = end_time - start_time
+    word_count = len(text.split())
+
+    result = f"Transcription: {text.strip()}\n\nTime taken: {output_time:.2f} seconds\nNumber of words: {word_count}"
+
+    return text.strip(), result
+#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

 # Clean and preprocess text
 def clean_text(text):
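The rewrite above collapses the manual resampling, mono mixdown, 30-second chunking, `generate`, and `batch_decode` loop into a single pipeline call. It also drops the real-time-factor report: the deleted code computed RTF as transcription time divided by audio duration, so a 10-minute file transcribed in 5 minutes has an RTF of 0.5, and anything below 1 is faster than real time. A minimal sketch of the new long-form path, assuming the model ID from the diff; `chunk_length_s` re-enables the internal chunking the new code leaves commented out, and the generation options go through `generate_kwargs` rather than directly as in the diff:

```python
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large-semantic")

outputs = asr(
    "meeting.wav",                 # hypothetical input file
    chunk_length_s=30,             # split long audio into 30-second windows internally
    batch_size=16,                 # decode several windows per forward pass
    return_timestamps=False,
    generate_kwargs={"num_beams": 5, "language": "no", "task": "transcribe"},
)
print(outputs["text"])             # the pipeline stitches the chunk texts back together
```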
@@ -244,12 +185,12 @@ def save_to_pdf(text, summary):

 iface = gr.Blocks()

-PLACEHOLDER = """
-<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/pic09w9678yhit.png" alt="" style="width: 100%; height: auto; opacity: 0.93;">
-    <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Switch Work | Verktæysett no.1</h1>
-    <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">En webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download</p>
-</div>
+
+title = """# Velkommen til 🌟>Switch Work | Verktæysett no.1✨
+En webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download</p>
+
+Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface: [MultiTransformer](https://huggingface.co/MultiTransformer)
+Math 🔍 [introspector](https://huggingface.co/introspector) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to 🌟 [SciTonic](https://github.com/Tonic-AI/scitonic) 🤗 Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
 """

 with iface:
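The HTML placeholder gives way to a markdown string (which still carries a stray `</p>` from the old markup). A string like `title` is presumably rendered with `gr.Markdown`; a minimal sketch, with the layout assumed since it sits outside this hunk:

```python
import gradio as gr

title = """# Velkommen til Switch Work | Verktøysett no.1
En webapp for transkribering av lydfiler til norsk skrift."""
# (English: "Welcome to Switch Work | Toolkit no.1 — a web app for
# transcribing audio files into written Norwegian.")

with gr.Blocks() as iface:
    gr.Markdown(title)  # renders the markdown heading at the top of the app
```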
@@ -259,8 +200,8 @@ with iface:
     with gr.Tabs():
         with gr.TabItem("Transcription"):
             audio_input = gr.Audio(type="filepath")
-            text_output = gr.Textbox(label="Text")
-            result_output = gr.Textbox(label="Transcription Details")
+            text_output = gr.Textbox(label="Transcription")
+            result_output = gr.Textbox(label="Details")
             transcribe_button = gr.Button("Transcribe")

             transcribe_button.click(fn=transcribe_audio, inputs=[audio_input], outputs=[text_output, result_output])
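The click wiring itself is unchanged: `transcribe_audio` returns a `(text, result)` pair, and Gradio maps the tuple positionally onto the two output components. A stripped-down sketch of the pattern, with a stubbed transcription function standing in for the real ASR call:

```python
import gradio as gr

def transcribe_audio(audio_path):
    # Stub standing in for the real ASR call.
    text = "…transcribed text…"
    details = "Time taken: 12.34 seconds\nNumber of words: 42"
    return text, details  # two return values → two output components

with gr.Blocks() as demo:
    audio_input = gr.Audio(type="filepath")
    text_output = gr.Textbox(label="Transcription")   # receives the first return value
    result_output = gr.Textbox(label="Details")       # receives the second
    transcribe_button = gr.Button("Transcribe")
    transcribe_button.click(fn=transcribe_audio,
                            inputs=[audio_input],
                            outputs=[text_output, result_output])
```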
@@ -298,3 +239,4 @@ iface.launch(share=True, debug=True)



+