Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -76,10 +76,14 @@ def transcribe_audio(audio_file, chunk_length_s=30):
|
|
76 |
|
77 |
start_time = time.time()
|
78 |
|
79 |
-
# Load waveform
|
80 |
waveform, sample_rate = torchaudio.load(audio_file)
|
81 |
|
82 |
-
#
|
|
|
|
|
|
|
|
|
83 |
if sample_rate != 16000:
|
84 |
resampler = transforms.Resample(orig_freq=sample_rate, new_freq=16000)
|
85 |
waveform = resampler(waveform)
|
@@ -97,8 +101,12 @@ def transcribe_audio(audio_file, chunk_length_s=30):
|
|
97 |
end = min((i + 1) * chunk_size, waveform.shape[1])
|
98 |
chunk_waveform = waveform[:, start:end]
|
99 |
|
|
|
|
|
|
|
|
|
100 |
# Process the chunk
|
101 |
-
audio_input = processor(chunk_waveform, sampling_rate=sample_rate, return_tensors="pt")
|
102 |
|
103 |
# Generate attention mask
|
104 |
input_features = audio_input.input_features
|
@@ -114,12 +122,12 @@ def transcribe_audio(audio_file, chunk_length_s=30):
|
|
114 |
chunk_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
115 |
full_text.append(chunk_text)
|
116 |
|
117 |
-
# Combine transcribed text
|
118 |
text = " ".join(full_text)
|
119 |
|
120 |
output_time = time.time() - start_time
|
121 |
|
122 |
-
#
|
123 |
audio_duration = waveform.shape[1] / sample_rate
|
124 |
|
125 |
# Real-time Factor (RTF)
|
@@ -139,6 +147,7 @@ def transcribe_audio(audio_file, chunk_length_s=30):
|
|
139 |
return text, result
|
140 |
|
141 |
|
|
|
142 |
# Clean and preprocess/@summarization
|
143 |
def clean_text(text):
|
144 |
text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
|
@@ -250,7 +259,7 @@ def save_to_pdf(text, summary):
|
|
250 |
iface = gr.Blocks()
|
251 |
|
252 |
with iface:
|
253 |
-
gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/
|
254 |
gr.Markdown("*Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download*")
|
255 |
|
256 |
with gr.Tabs():
|
|
|
76 |
|
77 |
start_time = time.time()
|
78 |
|
79 |
+
# Load the audio waveform using torchaudio
|
80 |
waveform, sample_rate = torchaudio.load(audio_file)
|
81 |
|
82 |
+
# Convert to mono if the audio has more than one channel
|
83 |
+
if waveform.shape[0] > 1:
|
84 |
+
waveform = torch.mean(waveform, dim=0, keepdim=True)
|
85 |
+
|
86 |
+
# Resample the audio to 16000 Hz if it’s not already
|
87 |
if sample_rate != 16000:
|
88 |
resampler = transforms.Resample(orig_freq=sample_rate, new_freq=16000)
|
89 |
waveform = resampler(waveform)
|
|
|
101 |
end = min((i + 1) * chunk_size, waveform.shape[1])
|
102 |
chunk_waveform = waveform[:, start:end]
|
103 |
|
104 |
+
# Ensure the chunk waveform is properly shaped
|
105 |
+
if chunk_waveform.shape[0] > 1:
|
106 |
+
chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)
|
107 |
+
|
108 |
# Process the chunk
|
109 |
+
audio_input = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")
|
110 |
|
111 |
# Generate attention mask
|
112 |
input_features = audio_input.input_features
|
|
|
122 |
chunk_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
123 |
full_text.append(chunk_text)
|
124 |
|
125 |
+
# Combine the transcribed text from all chunks
|
126 |
text = " ".join(full_text)
|
127 |
|
128 |
output_time = time.time() - start_time
|
129 |
|
130 |
+
# Audio duration (in seconds)
|
131 |
audio_duration = waveform.shape[1] / sample_rate
|
132 |
|
133 |
# Real-time Factor (RTF)
|
|
|
147 |
return text, result
|
148 |
|
149 |
|
150 |
+
|
151 |
# Clean and preprocess/@summarization
|
152 |
def clean_text(text):
|
153 |
text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
|
|
|
259 |
iface = gr.Blocks()
|
260 |
|
261 |
with iface:
|
262 |
+
gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/resolve/main/banner_trans.png" alt="Banner Image" />')
|
263 |
gr.Markdown("*Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download*")
|
264 |
|
265 |
with gr.Tabs():
|