Spaces:
Build error
Update app.py
app.py
CHANGED
@@ -1,49 +1,39 @@
-import networkx as nx
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-import nltk
-nltk.download('punkt')
-from nltk.tokenize import sent_tokenize
-
-import gradio as gr
-import warnings
-import torch
-from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
-from pydub import AudioSegment
-from fpdf import FPDF
-from PIL import Image
 import time
 import os
-
-
-
 warnings.filterwarnings("ignore")
 
-
-torch_dtype = torch.float32
-
-# ASR pipeline
-asr = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large-semantic", device=device, torch_dtype=torch_dtype)
-
-# Switch m4a to wav
 def convert_to_wav(audio_file):
     audio = AudioSegment.from_file(audio_file, format="m4a")
     wav_file = "temp.wav"
     audio.export(wav_file, format="wav")
     return wav_file
 
-# @spaces.GPU(queue=True)
 
 def transcribe_audio(audio_file):
     if audio_file.endswith(".m4a"):
         audio_file = convert_to_wav(audio_file)
 
     start_time = time.time()
 
     with torch.no_grad():
-        output = asr(audio_file, chunk_length_s=30, generate_kwargs={"num_beams": 8})
 
     transcription = output["text"]
     end_time = time.time()
@@ -51,44 +41,141 @@ def transcribe_audio(audio_file):
     output_time = end_time - start_time
     word_count = len(transcription.split())
 
-    result = f"
 
     return transcription.strip(), result
 
-
 summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
-
-# t5-base to device
 summarization_model.to(device)
 
-# Graph-based summarization (TextRank, method)
 def summarize_text(text):
-
-    if
-        return
 
-
-
...
     nx_graph = nx.from_numpy_array(similarity_matrix)
-    scores = nx.pagerank(nx_graph)
 
     ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
-
-    top_n = 3
-    summary = " ".join([s for _, s in ranked_sentences[:top_n]])
     return summary
 
 
-# HTML syntax for imagery
-banner_html = """
-<div style="text-align: center;">
-    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.png" alt="Banner" width="85%" height="auto">
-</div>
-"""
 
-
 def save_to_pdf(transcription, summary):
     pdf = FPDF()
     pdf.add_page()
@@ -103,28 +190,35 @@ def save_to_pdf(transcription, summary):
     if summary:
         pdf.multi_cell(0, 10, "Summary:\n" + summary)
 
-    pdf_output_path = "
     pdf.output(pdf_output_path)
     return pdf_output_path
 
-
 iface = gr.Interface(
     fn=transcribe_audio,
     inputs=gr.Audio(type="filepath"),
     outputs="text",
-    title="
     description="Upload an audio file to get the transcription",
     theme="default",
     live=False
 )
 iface = gr.Blocks()
 
 with iface:
     gr.HTML(banner_html)
-    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift
 
     with gr.Tabs():
-
         with gr.TabItem("Transcription"):
             audio_input = gr.Audio(type="filepath")
             transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic")
@@ -134,22 +228,21 @@ with iface:
             def transcribe(audio_file):
                 transcription, result = transcribe_audio(audio_file)
                 return transcription, result
-
             transcribe_button.click(
                 fn=transcribe,
                 inputs=[audio_input],
                 outputs=[transcription_output, result_output]
             )
 
-
-        with gr.TabItem("
-            summary_output = gr.Textbox(label="Summary |
             summarize_button = gr.Button("Summarize")
 
             def summarize(transcription):
                 if not transcription:
                     return "Warning: a transcription must be available."
-                summary =
                 return summary
 
             summarize_button.click(
@@ -158,7 +251,40 @@ with iface:
                 outputs=summary_output
             )
 
-
         with gr.TabItem("Download PDF"):
             pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
             pdf_summary_only = gr.Button("Download PDF with Summary Only")
@@ -195,6 +321,4 @@ with iface:
                 outputs=[pdf_output_both]
             )
 
-
-# run
-iface.launch(share=True, debug=True)
 
 import time
 import os
+import warnings
 warnings.filterwarnings("ignore")
+from pydub import AudioSegment
 
+# If m4a audio, convert to wav (Python)
 def convert_to_wav(audio_file):
     audio = AudioSegment.from_file(audio_file, format="m4a")
     wav_file = "temp.wav"
     audio.export(wav_file, format="wav")
     return wav_file
 
 
 
+import torch
+from transformers import pipeline  # AutoProcessor, AutoModelForSpeechSeq2Seq
+
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+torch_dtype = torch.float32
+
+# ASR pipeline
+asr = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large", device=device, torch_dtype=torch_dtype)
+
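The pipeline above is pinned to torch.float32 on whichever device is found. A common variant on a GPU Space (a sketch, not part of this commit) is to derive the dtype from the device instead:

    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32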
+# ASR
 def transcribe_audio(audio_file):
     if audio_file.endswith(".m4a"):
         audio_file = convert_to_wav(audio_file)
 
     start_time = time.time()
 
+
     with torch.no_grad():
+        output = asr(audio_file, chunk_length_s=30, generate_kwargs={"num_beams": 8})  # "task": "transcribe", "language": "no" + ..batch_size=10 ?
 
     transcription = output["text"]
     end_time = time.time()
 
     output_time = end_time - start_time
     word_count = len(transcription.split())
 
+    result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {word_count}"
 
     return transcription.strip(), result
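The trailing comment on the asr(...) call sketches how to pin the task and language; with a Whisper-family model those keys pass straight through generate_kwargs. A hedged example (the file name is hypothetical):

    output = asr(
        "opptak.m4a",  # hypothetical input file
        chunk_length_s=30,
        generate_kwargs={"num_beams": 8, "task": "transcribe", "language": "no"},
    )
    print(output["text"])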
+
+
+# -------------------------------------[VERSION 3: full-on t5-base + NLTK + 3 styles for summarization]-------------------------------------
+import re  # needed by WHITESPACE_HANDLER and clean_text below
+import nltk
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.corpus import stopwords
+import networkx as nx
+from sklearn.feature_extraction.text import TfidfVectorizer  # from before
+from sklearn.metrics.pairwise import cosine_similarity  # from before
+import pandas as pd
+import numpy as np
+from rouge_score import rouge_scorer
+
+nltk.download('punkt')  # from before
+nltk.download('stopwords')
+
+
+WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
+
+# Clean text using Norwegian-specific replacements
+def clean_text(text):
+    text = re.sub(r'https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
+    text = re.sub(r'\<a href', ' ', str(text))
+    text = re.sub(r'&', '', str(text))
+    text = re.sub(r'\(s+', '(', str(text))
+    text = re.sub(r's+\)', ')', str(text))
+    text = re.sub(r'\(\)', '', str(text))
+    text = re.sub(r'\s+', ' ', str(text))
+    text = re.sub(r'[_"\-;%|+&=*%!?:#$@\[\]]', ' ', str(text))
+    text = re.sub(r'<br />', ' ', str(text))
+    text = re.sub(r'\'', '', str(text))
+    text = re.sub(r'«', '', str(text))
+    text = re.sub(r'»', '', str(text))
+    text = re.sub(r'–', '-', str(text))
+    text = re.sub(r'…', '.', str(text))
+    text = re.sub(r'[^\x00-\x7F]+', ' ', str(text))
+    return text
+
+def preprocess_text(text):
+    try:
+        words = word_tokenize(text)
+        stop_words = set(stopwords.words('norwegian'))
+        words_without_stopwords = [word for word in words if word.lower() not in stop_words]
+        processed_text = ' '.join(words_without_stopwords)
+        return processed_text
+    except Exception as e:
+        print(f"Error during text preprocessing: {e}")  # report to console; st.error is a Streamlit API and st is never imported in this Gradio app
+        return None
+
+# ROUGE metric
+scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
+
+
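A quick sanity check of the two helpers (the sample string is made up). Two quirks worth knowing: the URL rule in clean_text deletes everything from the URL to the end of the line, and the final [^\x00-\x7F] rule blanks Norwegian letters like æ, ø, å:

    sample = "Referat fra møtet: se https://example.com («utkast»)"
    print(clean_text(sample))       # -> roughly "Referat fra m tet se"
    print(preprocess_text(sample))  # original text minus Norwegian stop words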
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", return_dict=True, torch_dtype=torch.float16)  # float16 weights; generation may need float32 when device is CPU
 summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
 summarization_model.to(device)
 
 def summarize_text(text):
+    preprocessed_text = preprocess_text(text)
+    if preprocessed_text is None:
+        return None
+    inputs = summarization_tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
+    inputs = inputs.to(device)
+    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
+    summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+    return summary
 
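Note that summarize_text runs preprocess_text only as a guard; the raw text, not the stop-word-filtered version, is what gets tokenized for t5-base. A minimal call (the string is illustrative):

    print(summarize_text("Dette er en lang transkripsjon som skal oppsummeres. " * 20))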
+def build_similarity_matrix(sentences, stop_words):
+    similarity_matrix = nx.Graph()
+    for i, tokens_a in enumerate(sentences):
+        for j, tokens_b in enumerate(sentences):
+            if i != j:
+                common_words = set(tokens_a) & set(tokens_b)
+                similarity_matrix.add_edge(i, j, weight=len(common_words))
+    return similarity_matrix
+
+
+def graph_based_summary(text, num_paragraphs=3):
+    sentences = text.strip().split(".")
+    if len(sentences) < num_paragraphs:
+        return sentences
+    sentence_tokens = [word_tokenize(sent) for sent in sentences]
+    stop_words = set(stopwords.words('norwegian'))
+    filtered_tokens = [[word for word in tokens if word.lower() not in stop_words] for tokens in sentence_tokens]
+    similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words)
+    scores = nx.pagerank(similarity_matrix)
+    ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
+    summary = [sent for _, sent in ranked_sentences[:num_paragraphs]]
+    return summary
 
+def lex_rank_summary(text, num_paragraphs=3):
+    sentences = nltk.sent_tokenize(text)
+    if len(sentences) < num_paragraphs:
+        return sentences
+    stop_words = set(stopwords.words('norwegian'))
+    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
+    X = vectorizer.fit_transform(sentences)
+    similarity_matrix = cosine_similarity(X, X)
     nx_graph = nx.from_numpy_array(similarity_matrix)
+    scores = nx.pagerank_numpy(nx_graph)  # deprecated in NetworkX 2.6 and removed in 3.0; nx.pagerank(nx_graph) is the modern equivalent
+    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
+    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
+    return summary
 
+def text_rank_summary(text, num_paragraphs=3):
+    sentences = nltk.sent_tokenize(text)
+    if len(sentences) < num_paragraphs:
+        return sentences
+    stop_words = set(stopwords.words('norwegian'))
+    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
+    X = vectorizer.fit_transform(sentences)
+    similarity_matrix = cosine_similarity(X, X)
+    nx_graph = nx.from_numpy_array(similarity_matrix)
+    scores = nx.pagerank(nx_graph)
     ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
+    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
     return summary
 
 
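All three extractive helpers return a list of sentences, while the Textboxes they feed below expect a string, so joining the result is a sensible guard. A sketch with made-up input; lex_rank_summary is skipped here because pagerank_numpy needs networkx < 3.0:

    text = "Første setning. Andre setning. Tredje setning. Fjerde setning."
    for fn in (graph_based_summary, text_rank_summary):
        print(fn.__name__, "->", " ".join(fn(text)))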
+import gradio as gr
+from fpdf import FPDF
+from PIL import Image
+
 def save_to_pdf(transcription, summary):
     pdf = FPDF()
     pdf.add_page()
...
     if summary:
         pdf.multi_cell(0, 10, "Summary:\n" + summary)
 
+    pdf_output_path = "transcription.pdf"
     pdf.output(pdf_output_path)
     return pdf_output_path
 
+
+banner_html = """
+<div style="text-align: center;">
+    <img src="https://huggingface.co/spaces/camparchimedes/transcription_app/raw/main/picture.png" alt="Banner" width="100%" height="auto">
+</div>
+"""
+
 iface = gr.Interface(
     fn=transcribe_audio,
     inputs=gr.Audio(type="filepath"),
     outputs="text",
+    title="SW Transcription App",
     description="Upload an audio file to get the transcription",
     theme="default",
     live=False
 )
+
 iface = gr.Blocks()  # NB: rebinds iface, so the gr.Interface above is never launched
 
 with iface:
     gr.HTML(banner_html)
+    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift :D")  # "We can now turn audio files into Norwegian text :D"
 
     with gr.Tabs():
+
         with gr.TabItem("Transcription"):
             audio_input = gr.Audio(type="filepath")
             transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic")
...
             def transcribe(audio_file):
                 transcription, result = transcribe_audio(audio_file)
                 return transcription, result
             transcribe_button.click(
                 fn=transcribe,
                 inputs=[audio_input],
                 outputs=[transcription_output, result_output]
             )
 
+
+        with gr.TabItem("Summary_t1"):
+            summary_output = gr.Textbox(label="Summary | Graph-based")
             summarize_button = gr.Button("Summarize")
 
             def summarize(transcription):
                 if not transcription:
                     return "Warning: a transcription must be available."
+                summary = graph_based_summary(transcription)
                 return summary
 
             summarize_button.click(
                 fn=summarize,
                 inputs=[transcription_output],
                 outputs=summary_output
             )
 
+        with gr.TabItem("Summary_t2"):
+            summary_output = gr.Textbox(label="Summary | LexRank")
+            summarize_button = gr.Button("Summarize")
+
+            def summarize(transcription):
+                if not transcription:
+                    return "Warning: a transcription must be available."
+                summary = lex_rank_summary(transcription)
+                return summary
+
+            summarize_button.click(
+                fn=summarize,
+                inputs=[transcription_output],
+                outputs=summary_output
+            )
+
+
+        with gr.TabItem("Summary_t3"):
+            summary_output = gr.Textbox(label="Summary | TextRank")
+            summarize_button = gr.Button("Summarize")
+
+            def summarize(transcription):
+                if not transcription:
+                    return "Warning: a transcription must be available."
+                summary = text_rank_summary(transcription)
+                return summary
+
+            summarize_button.click(
+                fn=summarize,
+                inputs=[transcription_output],
+                outputs=summary_output
+            )
+
+
         with gr.TabItem("Download PDF"):
             pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
             pdf_summary_only = gr.Button("Download PDF with Summary Only")
...
                 outputs=[pdf_output_both]
             )
 
+iface.launch(share=True, debug=True)
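For reference, the imports above imply roughly this dependency set for the Space. This is a sketch inferred from the code, not the repo's actual requirements.txt; networkx is kept below 3.0 because of pagerank_numpy, and pydub additionally needs ffmpeg on the host (on Spaces, typically via packages.txt):

    transformers
    torch
    gradio
    pydub
    fpdf
    Pillow
    nltk
    networkx<3.0
    scikit-learn
    pandas
    numpy
    rouge-score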