camparchimedes committed
Commit 9e722fb · verified · 1 Parent(s): 64e12f4

Update app.py

Files changed (1): app.py +186 -62
app.py CHANGED
@@ -1,49 +1,39 @@
- import networkx as nx
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
- import nltk
- nltk.download('punkt')
- from nltk.tokenize import sent_tokenize
-
- import gradio as gr
- import warnings
- import torch
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
- from pydub import AudioSegment
- from fpdf import FPDF
- from PIL import Image
  import time
  import os
- # import spaces
-
-
  warnings.filterwarnings("ignore")

- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- torch_dtype = torch.float32
-
- # ASR pipeline
- asr = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large-semantic", device=device, torch_dtype=torch_dtype)
-
- # Switch m4a to wav
  def convert_to_wav(audio_file):
      audio = AudioSegment.from_file(audio_file, format="m4a")
      wav_file = "temp.wav"
      audio.export(wav_file, format="wav")
      return wav_file

- # @spaces.GPU(queue=True)


- # Transcription funct.@ASR
  def transcribe_audio(audio_file):
      if audio_file.endswith(".m4a"):
          audio_file = convert_to_wav(audio_file)

      start_time = time.time()

      with torch.no_grad():
-         output = asr(audio_file, chunk_length_s=30, generate_kwargs={"num_beams": 8, "task": "transcribe", "language": "no"})

      transcription = output["text"]
      end_time = time.time()
@@ -51,44 +41,141 @@ def transcribe_audio(audio_file):
      output_time = end_time - start_time
      word_count = len(transcription.split())

-     result = f"Transcription: {transcription.strip()}\n\nTime taken: {output_time:.2f} seconds\nNumber of words: {word_count}"

      return transcription.strip(), result

- # t5-base model@summary funct.
  summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
- summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
- # t5-base to device
  summarization_model.to(device)

- # Graph-based summarization (TextRank, method)
  def summarize_text(text):
-     sentences = sent_tokenize(text)
-     if len(sentences) == 0:
-         return ""

-     tfidf_vectorizer = TfidfVectorizer()
-     tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
-     similarity_matrix = cosine_similarity(tfidf_matrix)

      nx_graph = nx.from_numpy_array(similarity_matrix)
-     scores = nx.pagerank(nx_graph)

      ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
-     # Select top N sentences (e.g., 3 sentences for the summary)
-     top_n = 3
-     summary = " ".join([s for _, s in ranked_sentences[:top_n]])
      return summary


- # HTML syntax for imagery
- banner_html = """
- <div style="text-align: center;">
-     <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.png" alt="Banner" width="85%" height="auto">
- </div>
- """

- # Transcription and summary@PDF option(s)
  def save_to_pdf(transcription, summary):
      pdf = FPDF()
      pdf.add_page()
@@ -103,28 +190,35 @@ def save_to_pdf(transcription, summary):
      if summary:
          pdf.multi_cell(0, 10, "Summary:\n" + summary)

-     pdf_output_path = "transcription_summary.pdf"
      pdf.output(pdf_output_path)
      return pdf_output_path

- # Gradio
  iface = gr.Interface(
      fn=transcribe_audio,
      inputs=gr.Audio(type="filepath"),
      outputs="text",
-     title="Audio Transcription App",
      description="Upload an audio file to get the transcription",
      theme="default",
      live=False
  )

  iface = gr.Blocks()

  with iface:
      gr.HTML(banner_html)
-     gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")

      with gr.Tabs():
-         # Transcription Tab
          with gr.TabItem("Transcription"):
              audio_input = gr.Audio(type="filepath")
              transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic")
@@ -134,22 +228,21 @@ with iface:
              def transcribe(audio_file):
                  transcription, result = transcribe_audio(audio_file)
                  return transcription, result
-
              transcribe_button.click(
                  fn=transcribe,
                  inputs=[audio_input],
                  outputs=[transcription_output, result_output]
              )

-         # Summary Tab
-         with gr.TabItem("Summary"):
-             summary_output = gr.Textbox(label="Summary | TextRank, graph-based")
              summarize_button = gr.Button("Summarize")

              def summarize(transcription):
                  if not transcription:
                      return "Warning: a transcription must be available."
-                 summary = summarize_text(transcription)
                  return summary

              summarize_button.click(
@@ -158,7 +251,40 @@ with iface:
                  outputs=summary_output
              )

-         # PDF Download Tab
          with gr.TabItem("Download PDF"):
              pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
              pdf_summary_only = gr.Button("Download PDF with Summary Only")
@@ -195,6 +321,4 @@ with iface:
              outputs=[pdf_output_both]
          )

-
- # run
- iface.launch(share=True, debug=True)
  import time
  import os
+ import warnings

  warnings.filterwarnings("ignore")
+ from pydub import AudioSegment

+ # If the input is m4a, convert it to wav
  def convert_to_wav(audio_file):
      audio = AudioSegment.from_file(audio_file, format="m4a")
      wav_file = "temp.wav"
      audio.export(wav_file, format="wav")
      return wav_file
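pydub hands decoding to ffmpeg, which can sniff most containers on its own, so the helper generalizes beyond m4a. A minimal sketch, assuming ffmpeg is available to pydub; `to_wav` and `out_path` are illustrative names, not part of this app:

    from pydub import AudioSegment

    def to_wav(path, out_path="temp.wav"):
        # format argument omitted: ffmpeg infers the container (m4a, mp3, ogg, ...)
        AudioSegment.from_file(path).export(out_path, format="wav")
        return out_path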

+ import torch
+ from transformers import pipeline  # AutoProcessor, AutoModelForSpeechSeq2Seq
+
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ torch_dtype = torch.float32
+
+ # ASR pipeline
+ asr = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large", device=device, torch_dtype=torch_dtype)
+
+ # ASR
  def transcribe_audio(audio_file):
      if audio_file.endswith(".m4a"):
          audio_file = convert_to_wav(audio_file)

      start_time = time.time()

      with torch.no_grad():
+         output = asr(audio_file, chunk_length_s=30, generate_kwargs={"num_beams": 8})  # "task": "transcribe", "language": "no" + ..batch_size=10 ?

      transcription = output["text"]
      end_time = time.time()

      output_time = end_time - start_time
      word_count = len(transcription.split())

+     result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {word_count}"

      return transcription.strip(), result
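A quick smoke test of the wiring, assuming some local recording exists ("sample.m4a" is a placeholder):

    text, stats = transcribe_audio("sample.m4a")  # m4a is converted to wav first
    print(stats)  # "Time taken: ... seconds\nNumber of words: ..."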

+ # -------------------------------------[VERSION 3: full-on t5-base + NLTK + 3 styles for summarization]-------------------------------------
+ import re  # needed by WHITESPACE_HANDLER and clean_text below
+ import nltk
+ from nltk.tokenize import word_tokenize, sent_tokenize
+ from nltk.corpus import stopwords
+ import networkx as nx
+ from sklearn.feature_extraction.text import TfidfVectorizer  # from before
+ from sklearn.metrics.pairwise import cosine_similarity  # from before
+ import pandas as pd
+ import numpy as np
+ from rouge_score import rouge_scorer
+
+ nltk.download('punkt')  # from before
+ nltk.download('stopwords')
+ WHITESPACE_HANDLER = lambda k: re.sub(r'\s+', ' ', re.sub(r'\n+', ' ', k.strip()))
+
+ # Clean text using Norwegian-specific replacements
+ def clean_text(text):
+     text = re.sub(r'https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
+     text = re.sub(r'\<a href', ' ', str(text))
+     text = re.sub(r'&amp;', '', str(text))
+     text = re.sub(r'\(\s+', '(', str(text))
+     text = re.sub(r'\s+\)', ')', str(text))
+     text = re.sub(r'\(\)', '', str(text))
+     text = re.sub(r'\s+', ' ', str(text))
+     text = re.sub(r'[_"\-;%|+&=*%!?:#$@\[\]]', ' ', str(text))
+     text = re.sub(r'<br />', ' ', str(text))
+     text = re.sub(r'\'', '', str(text))
+     text = re.sub(r'«', '', str(text))
+     text = re.sub(r'»', '', str(text))
+     text = re.sub(r'–', '-', str(text))
+     text = re.sub(r'…', '.', str(text))
+     text = re.sub(r'[^\x00-\x7F]+', ' ', str(text))
+     return text
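One caveat worth flagging: the final `[^\x00-\x7F]+` pass replaces every non-ASCII character with a space, which includes the Norwegian letters æ, ø and å, so cleaned text loses them. A quick check (the sample sentence is illustrative):

    print(clean_text("Vi kjøper «blåbær» på torget"))
    # æ/ø/å come out as spaces because of the trailing ASCII-only filter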
+ def preprocess_text(text):
+     try:
+         words = word_tokenize(text)
+         stop_words = set(stopwords.words('norwegian'))
+         words_without_stopwords = [word for word in words if word.lower() not in stop_words]
+         processed_text = ' '.join(words_without_stopwords)
+         return processed_text
+     except Exception as e:
+         print(f"Error during text preprocessing: {e}")  # st.error is Streamlit; this is a Gradio app
+         return None
+
+ # ROUGE metric
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)

+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+ summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", return_dict=True, torch_dtype=torch.float16)
  summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
  summarization_model.to(device)

  def summarize_text(text):
+     preprocessed_text = preprocess_text(text)
+     if preprocessed_text is None:
+         return None
+     inputs = summarization_tokenizer([preprocessed_text], max_length=1024, return_tensors="pt", truncation=True)
+     inputs = inputs.to(device)
+     summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
+     summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+     return summary
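Two hedged tweaks worth considering here: t5-base was trained with task prefixes, so prepending "summarize: " to the input usually improves output quality, and float16 weights generally fail at generate time on CPU, so matching the dtype to the device avoids a crash on CPU-only Spaces. A sketch, reusing the names defined above:

    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", torch_dtype=dtype).to(device)

    inputs = summarization_tokenizer(["summarize: " + preprocessed_text],
                                     max_length=1024, return_tensors="pt", truncation=True).to(device)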

+ def build_similarity_matrix(sentences, stop_words):
+     # Graph whose nodes are sentence indices; edge weights count shared words
+     similarity_matrix = nx.Graph()
+     for i, tokens_a in enumerate(sentences):
+         for j, tokens_b in enumerate(sentences):
+             if i != j:
+                 common_words = set(tokens_a) & set(tokens_b)
+                 similarity_matrix.add_edge(i, j, weight=len(common_words))
+     return similarity_matrix
+
+
+ def graph_based_summary(text, num_paragraphs=3):
+     sentences = text.strip().split(".")
+     if len(sentences) < num_paragraphs:
+         return " ".join(sentences)
+     sentence_tokens = [word_tokenize(sent) for sent in sentences]
+     stop_words = set(stopwords.words('norwegian'))
+     filtered_tokens = [[word for word in tokens if word.lower() not in stop_words] for tokens in sentence_tokens]
+     similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words)
+     scores = nx.pagerank(similarity_matrix)
+     ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
+     summary = " ".join(sent for _, sent in ranked_sentences[:num_paragraphs])  # join so the Textbox gets a string
+     return summary

+ def lex_rank_summary(text, num_paragraphs=3):
+     sentences = nltk.sent_tokenize(text)
+     if len(sentences) < num_paragraphs:
+         return " ".join(sentences)
+     stop_words = set(stopwords.words('norwegian'))
+     vectorizer = TfidfVectorizer(stop_words=list(stop_words))
+     X = vectorizer.fit_transform(sentences)
+     similarity_matrix = cosine_similarity(X, X)
      nx_graph = nx.from_numpy_array(similarity_matrix)
+     scores = nx.pagerank(nx_graph)  # pagerank_numpy was removed in networkx 3.0
+     ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
+     summary = " ".join(ranked_sentences[i][1] for i in range(num_paragraphs))
+     return summary

+ def text_rank_summary(text, num_paragraphs=3):
+     sentences = nltk.sent_tokenize(text)
+     if len(sentences) < num_paragraphs:
+         return " ".join(sentences)
+     stop_words = set(stopwords.words('norwegian'))
+     vectorizer = TfidfVectorizer(stop_words=list(stop_words))
+     X = vectorizer.fit_transform(sentences)
+     similarity_matrix = cosine_similarity(X, X)
+     nx_graph = nx.from_numpy_array(similarity_matrix)
+     scores = nx.pagerank(nx_graph)
      ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
+     summary = " ".join(ranked_sentences[i][1] for i in range(num_paragraphs))
      return summary
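As written, lex_rank_summary and text_rank_summary run the same pipeline (TF-IDF cosine graph + PageRank); a textbook LexRank would additionally threshold the cosine matrix before ranking. Comparing the three styles side by side (the input text is a placeholder):

    text = "..."  # any transcription
    for name, fn in [("graph", graph_based_summary), ("lexrank", lex_rank_summary), ("textrank", text_rank_summary)]:
        print(name, "->", fn(text, num_paragraphs=3))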


+ import gradio as gr
+ from fpdf import FPDF
+ from PIL import Image
+
  def save_to_pdf(transcription, summary):
      pdf = FPDF()
      pdf.add_page()

      if summary:
          pdf.multi_cell(0, 10, "Summary:\n" + summary)

+     pdf_output_path = "transcription.pdf"
      pdf.output(pdf_output_path)
      return pdf_output_path
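Classic PyFPDF encodes cell text as latin-1, so æ/ø/å fit, but characters outside latin-1 (curly quotes, emoji) raise a UnicodeEncodeError. A hedged guard before writing, if raw transcriptions are passed in:

    safe = transcription.encode("latin-1", "replace").decode("latin-1")
    pdf.multi_cell(0, 10, "Transcription:\n" + safe)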

+ banner_html = """
+ <div style="text-align: center;">
+     <img src="https://huggingface.co/spaces/camparchimedes/transcription_app/raw/main/picture.png" alt="Banner" width="100%" height="auto">
+ </div>
+ """
+
  iface = gr.Interface(
      fn=transcribe_audio,
      inputs=gr.Audio(type="filepath"),
      outputs="text",
+     title="SW Transcription App",
      description="Upload an audio file to get the transcription",
      theme="default",
      live=False
  )
+
  iface = gr.Blocks()

  with iface:
      gr.HTML(banner_html)
+     gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift :D")

      with gr.Tabs():
+
          with gr.TabItem("Transcription"):
              audio_input = gr.Audio(type="filepath")
              transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic")

              def transcribe(audio_file):
                  transcription, result = transcribe_audio(audio_file)
                  return transcription, result

              transcribe_button.click(
                  fn=transcribe,
                  inputs=[audio_input],
                  outputs=[transcription_output, result_output]
              )

+
+         with gr.TabItem("Summary_t1"):
+             summary_output = gr.Textbox(label="Summary | Graph-based")
              summarize_button = gr.Button("Summarize")

              def summarize(transcription):
                  if not transcription:
                      return "Warning: a transcription must be available."
+                 summary = graph_based_summary(transcription)
                  return summary

              summarize_button.click(
                  outputs=summary_output
              )

+         with gr.TabItem("Summary_t2"):
+             summary_output = gr.Textbox(label="Summary | LexRank")
+             summarize_button = gr.Button("Summarize")
+
+             def summarize(transcription):
+                 if not transcription:
+                     return "Warning: a transcription must be available."
+                 summary = lex_rank_summary(transcription)
+                 return summary
+
+             summarize_button.click(
+                 fn=summarize,
+                 inputs=[transcription_output],
+                 outputs=summary_output
+             )
+
+
+         with gr.TabItem("Summary_t3"):
+             summary_output = gr.Textbox(label="Summary | TextRank")
+             summarize_button = gr.Button("Summarize")
+
+             def summarize(transcription):
+                 if not transcription:
+                     return "Warning: a transcription must be available."
+                 summary = text_rank_summary(transcription)
+                 return summary
+
+             summarize_button.click(
+                 fn=summarize,
+                 inputs=[transcription_output],
+                 outputs=summary_output
+             )
+
+
          with gr.TabItem("Download PDF"):
              pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
              pdf_summary_only = gr.Button("Download PDF with Summary Only")

              outputs=[pdf_output_both]
          )

+ iface.launch(share=True, debug=True)