Spaces: Update app.py
Build status: Build error

app.py CHANGED
@@ -1,5 +1,5 @@
 # app.py
-# Version: 1.
+# Version: 1.07 (08.24.24), ALPHA
 #---------------------------------------------------------------------------------------------------------------------------------------------
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@ import torch
 import torchaudio
 import torchaudio.transforms as transforms

-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+from transformers import pipeline

 import spacy
 import networkx as nx
@@ -39,100 +39,41 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import gradio as gr
 from fpdf import FPDF
 from PIL import Image
-# from huggingface_hub import model_info

 warnings.filterwarnings("ignore")

-""""
-# Convert m4a audio to wav format
 def convert_to_wav(audio_file):
     audio = AudioSegment.from_file(audio_file, format="m4a")
     wav_file = "temp.wav"
     audio.export(wav_file, format="wav")
     return wav_file
-"""
-
-#---------------------------------------------------------------------------------------------------------------------------------------------
-
-processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
-model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #model.cuda()
-model.to(device)

+#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+asr = pipeline("automatic-speech-recognition", "NbAiLabBeta/nb-whisper-large-semantic")

-generate_kwargs = {
+kwargs = {
     "num_beams": 5,
     "language": "no",
-    "task": "transcribe",
-    "forced_decoder_ids": None # ALT. generation_config.forced_decoder_ids = None
 }

-
-def transcribe_audio(audio_file):
-
-
+# funct.@ASR,
+def transcribe_audio(audio_file):
+    if audio_file.endswith(".m4a"):
+        audio_file = convert_to_wav(audio_file)

     start_time = time.time()

-
-    waveform, sample_rate = torchaudio.load(audio_file)
-
-    # Convert to mono
-    if waveform.shape[0] > 1:
-        waveform = torch.mean(waveform, dim=0, keepdim=True)
-
-    if sample_rate != 16000:
-        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
-        waveform = resampler(waveform)
-        sample_rate = 16000

-
-
-    num_chunks = waveform.shape[1] // chunk_size + int(waveform.shape[1] % chunk_size != 0)
+    outputs = asr(audio_file, forced_decoder_ids=None, task="transcribe", batch_size=16, return_timestamps=False, **kwargs) # chunk_length_s=30,
+    text = outputs["text"]

-    full_text = []
+    end_time = time.time()
+    output_time = end_time - start_time
+    word_count = len(text.split())

-    for i in range(num_chunks):
-        start = i * chunk_size
-        end = min((i + 1) * chunk_size, waveform.shape[1])
-        chunk_waveform = waveform[:, start:end]
+    result = f"Transcription: {text.strip()}\n\nTime taken: {output_time:.2f} seconds\nNumber of words: {word_count}"

-
-
-        #---------------------------------------------------------------------------------------------------------------------------------------------
-        # make sure to NOT truncate the input audio, to return the `attention_mask` and to pad to the longest audio
-        inputs = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True)
-        inputs = inputs.to(device)
-        input_features = inputs.input_features # alt. input_features = inputs['input_features']
-        attention_mask = inputs.attention_mask # inputs['attention_mask']
-        # transcribe audio to ids
-        generated_ids = model.generate(inputs=input_features, attention_mask=attention_mask, **generate_kwargs) # Pass the attention mask
-
-        # transcription
-        chunk_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        #---------------------------------------------------------------------------------------------------------------------------------------------
-        full_text.append(chunk_text)
-    text = " ".join(full_text)
-
-    output_time = time.time() - start_time
-
-    # audio duration (in seconds)
-    audio_duration = waveform.shape[1] / sample_rate
-    # Real-time Factor (RTF)
-    rtf = output_time / audio_duration
-
-    # Format of the result
-    result = (
-        f"Time taken: {output_time:.2f} seconds\n"
-        f"Audio duration: {audio_duration / 60:.2f} minutes ({audio_duration:.2f} seconds)\n"
-        f"Real-time Factor (RTF): {rtf:.2f}\n"
-        f"Number of words: {len(text.split())}\n\n"
-        "Real-time Factor (RTF) is a measure used to evaluate the speed of speech recognition systems. "
-        "It is the ratio of transcription time to the duration of the audio.\n\n"
-        "An RTF of less than 1 means the transcription process is faster than real-time (expected)."
-    )
-
-    return text, result
-#---------------------------------------------------------------------------------------------------------------------------------------------
+    return text.strip(), result
+#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

 # Clean and preprocess text
 def clean_text(text):
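Note on the hunk above: the commit swaps the manual long-form loop (torchaudio load, mono downmix, 16 kHz resample, fixed-size chunks fed through processor/model.generate) for a single transformers pipeline call, and leaves chunk_length_s=30 commented out. A minimal sketch of how that pipeline handles long audio, assuming the model id from the diff; the device index, file name, and parameter values here are illustrative, not the Space's exact code:

    # Sketch: long-form ASR with the transformers pipeline (values illustrative).
    import torch
    from transformers import pipeline

    device = 0 if torch.cuda.is_available() else -1   # pipeline expects a device index
    asr = pipeline(
        "automatic-speech-recognition",
        model="NbAiLabBeta/nb-whisper-large-semantic",
        device=device,
    )

    outputs = asr(
        "audio.wav",            # hypothetical input file (ffmpeg handles decoding)
        chunk_length_s=30,      # split audio longer than 30 s into windows internally
        batch_size=16,          # decode several windows per forward pass
        return_timestamps=False,
        generate_kwargs={"num_beams": 5, "language": "no", "task": "transcribe"},
    )
    print(outputs["text"])

Routing the beam/language options through generate_kwargs, rather than spreading them into the call as the committed code does with **kwargs, tends to be more robust across transformers versions.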
@@ -244,12 +185,12 @@ def save_to_pdf(text, summary):

 iface = gr.Blocks()

-
-
-
-
-
-
+
+title = """# Velkommen til 🌟>Switch Work | Verktæysett no.1✨
+En webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download</p>
+
+Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [](https://discord.gg/GWpVpekp) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer)
+Math 🔍 [introspector](https://huggingface.co/introspector) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [SciTonic](https://github.com/Tonic-AI/scitonic)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
 """

 with iface:
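The hunk adds the title markdown string, but the line that renders it sits in the unchanged region and is not shown. Presumably it is displayed with gr.Markdown near the top of the Blocks context; a sketch of that pattern, with the title shortened and the placement assumed:

    # Sketch: rendering a markdown title string in gr.Blocks (placement assumed).
    import gradio as gr

    title = "# Velkommen til Switch Work"  # shortened stand-in for the diff's string

    iface = gr.Blocks()
    with iface:
        gr.Markdown(title)  # renders the markdown heading at the top of the page

    iface.launch()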
@@ -259,8 +200,8 @@ with iface:
     with gr.Tabs():
         with gr.TabItem("Transcription"):
             audio_input = gr.Audio(type="filepath")
-            text_output = gr.Textbox(label="
-            result_output = gr.Textbox(label="
+            text_output = gr.Textbox(label="Transcription")
+            result_output = gr.Textbox(label="Details")
             transcribe_button = gr.Button("Transcribe")

             transcribe_button.click(fn=transcribe_audio, inputs=[audio_input], outputs=[text_output, result_output])
@@ -298,3 +239,4 @@ iface.launch(share=True, debug=True)



+
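One thing the rewrite drops is the Real-time Factor report: the removed block defined RTF as transcription time divided by audio duration, with RTF < 1 meaning faster than real time. If that metric is wanted back on top of the new pipeline, a sketch follows; the function name and chunk length are illustrative, and asr is assumed to be the pipeline object from the diff:

    # Sketch: recomputing the removed RTF metric around the new pipeline call.
    import time
    import torchaudio

    def transcribe_with_rtf(asr, audio_file):
        info = torchaudio.info(audio_file)
        audio_duration = info.num_frames / info.sample_rate   # seconds of audio

        start = time.time()
        text = asr(audio_file, chunk_length_s=30)["text"]     # chunk length illustrative
        elapsed = time.time() - start

        rtf = elapsed / audio_duration   # < 1.0: transcribes faster than real time
        return text, rtf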