Update app.py
app.py
CHANGED
@@ -22,6 +22,7 @@ import warnings
 from pydub import AudioSegment
 import torch
 import torchaudio
+import torchaudio.transforms as transforms
 from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
 from huggingface_hub import model_info
 import spacy
@@ -75,9 +76,15 @@ def transcribe_audio(audio_file, chunk_length_s=30):
 
     start_time = time.time()
 
-    # Load
+    # Load the waveform with torchaudio
     waveform, sample_rate = torchaudio.load(audio_file)
 
+    # Resample to 16 kHz if the source rate differs (e.g. 32 kHz)
+    if sample_rate != 16000:
+        resampler = transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+        waveform = resampler(waveform)
+        sample_rate = 16000
+
     # Calculate the number of chunks
     chunk_size = chunk_length_s * sample_rate
     num_chunks = waveform.shape[1] // chunk_size + int(waveform.shape[1] % chunk_size != 0)
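Reviewer note: the new branch fixes the sample rate, but Whisper-family checkpoints also expect mono input; if uploaded files can be stereo, a channel downmix is needed as well. A minimal sketch of a combined loader (the mono handling is an assumption, not part of this commit; the ceil-division for num_chunks above is equivalent to math.ceil):

import torch
import torchaudio
import torchaudio.transforms as transforms

def load_audio_16k(path: str) -> torch.Tensor:
    # Load at the native rate, then normalize to 16 kHz mono.
    waveform, sample_rate = torchaudio.load(path)
    if waveform.shape[0] > 1:  # assumed stereo handling: average the channels
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        waveform = transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    return waveform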
@@ -107,12 +114,12 @@ def transcribe_audio(audio_file, chunk_length_s=30):
         chunk_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
         full_text.append(chunk_text)
 
-    # Combine
+    # Combine the transcribed text from all chunks
    text = " ".join(full_text)
 
     output_time = time.time() - start_time
 
-    #
+    # Audio duration in seconds
     audio_duration = waveform.shape[1] / sample_rate
 
     # Real-time Factor (RTF)
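Reviewer note: the RTF formula itself falls outside this hunk; by the usual convention it is processing time divided by audio duration. A sketch assuming that convention, reusing the names from the surrounding code:

rtf = output_time / audio_duration  # < 1.0 means faster than real time
result = f"Processed {audio_duration:.1f}s of audio in {output_time:.1f}s (RTF {rtf:.2f})"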
@@ -132,26 +139,26 @@ def transcribe_audio(audio_file, chunk_length_s=30):
     return text, result
 
 
-# Clean and preprocess
+# Clean and preprocess text for summarization
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
     text = re.sub(r'[^\w\s]', '', text)
     text = re.sub(r'\s+', ' ', text).strip()
     return text
 
-nlp = spacy.blank("nb")  # 'nb'
+nlp = spacy.blank("nb")  # 'nb' = Norwegian Bokmål
 spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS
 
 def preprocess_text(text):
     # Process the text with SpaCy
     doc = nlp(text)
-    #
+    # Use SpaCy's stop words directly
     stop_words = spacy_stop_words
     # Filter out stop words
     words = [token.text for token in doc if token.text.lower() not in stop_words]
     return ' '.join(words)
 
-# Summarize
+# Summarize with the T5 model
 def summarize_text(text):
     preprocessed_text = preprocess_text(text)
     inputs = summarization_tokenizer(preprocessed_text, max_length=1024, return_tensors="pt", truncation=True)
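Reviewer note: the URL pattern r'https?:\/\/.*[\r\n]*' is greedy, so it deletes everything from the URL to the end of the line, not just the URL itself. A tighter pattern stops at the first whitespace (a sketch, not part of this commit):

import re

def clean_text(text):
    text = re.sub(r'https?://\S+', '', text)  # drop only the URL token
    text = re.sub(r'[^\w\s]', '', text)       # strip punctuation
    return re.sub(r'\s+', ' ', text).strip()

print(clean_text("Se https://example.com for mer info!"))  # -> "Se for mer info"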
@@ -159,7 +166,7 @@ def summarize_text(text):
     summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
     return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
-#
+# Builds the sentence similarity matrix
 def build_similarity_matrix(sentences, stop_words):
     similarity_matrix = nx.Graph()
     for i, tokens_a in enumerate(sentences):
@@ -169,7 +176,7 @@ def build_similarity_matrix(sentences, stop_words):
             similarity_matrix.add_edge(i, j, weight=len(common_words))
     return similarity_matrix
 
-# Graph-based summarization
+# Graph-based summarization
 def graph_based_summary(text, num_paragraphs=3):
     doc = nlp(text)
     sentences = [sent.text for sent in doc.sents]
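Reviewer note: these helpers feed a PageRank-style ranking over the word-overlap graph. A self-contained toy illustration of how graph scores rank sentence indices (networkx only, not the app's code):

import networkx as nx

# Toy graph over three sentences (indices 0-2); edge weight = number of
# shared words, mirroring the structure build_similarity_matrix produces.
g = nx.Graph()
g.add_edge(0, 1, weight=2)
g.add_edge(1, 2, weight=1)
scores = nx.pagerank(g)             # {sentence index: centrality score}
print(max(scores, key=scores.get))  # index of the most central sentence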
@@ -185,7 +192,7 @@ def graph_based_summary(text, num_paragraphs=3):
     ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
     return ' '.join([sent for _, sent in ranked_sentences[:num_paragraphs]])
 
-# LexRank
+# LexRank
 def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
     doc = nlp(text)
     sentences = [sent.text for sent in doc.sents]
@@ -197,14 +204,14 @@ def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
     X = vectorizer.fit_transform(sentences)
     similarity_matrix = cosine_similarity(X, X)
 
-    # Apply threshold
+    # Apply the threshold to the similarity matrix
     similarity_matrix[similarity_matrix < threshold] = 0
     nx_graph = nx.from_numpy_array(similarity_matrix)
     scores = nx.pagerank(nx_graph)
     ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
     return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])
 
-# TextRank
+# TextRank
 def text_rank_summary(text, num_paragraphs=3):
     doc = nlp(text)
     sentences = [sent.text for sent in doc.sents]
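Reviewer note: both lex_rank_summary and text_rank_summary index ranked_sentences with range(num_paragraphs), which raises IndexError whenever the text has fewer sentences than the requested paragraph count. A defensive replacement for the return statement (sketch):

top_n = min(num_paragraphs, len(ranked_sentences))  # never over-index
return ' '.join(ranked_sentences[i][1] for i in range(top_n))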
@@ -222,7 +229,7 @@ def text_rank_summary(text, num_paragraphs=3):
     return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])
 
 
-# Save text
+# Save text and summary to PDF
 def save_to_pdf(text, summary):
     pdf = FPDF()
     pdf.add_page()
@@ -243,7 +250,7 @@ def save_to_pdf(text, summary):
 iface = gr.Blocks()
 
 with iface:
-    gr.
+    gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/blob/main/banner_trans.png" alt="Banner Image" />')
     gr.Markdown("*Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download*")
 
     with gr.Tabs():
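Reviewer note: Hub /blob/main/ URLs serve the HTML file viewer, not the raw file, so this banner will not render inside an <img> tag; /resolve/main/ serves the asset itself. Suggested follow-up (same asset, raw URL):

gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/resolve/main/banner_trans.png" alt="Banner Image" />')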