Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -54,7 +54,12 @@ SIDEBAR_INFO = f"""
|
|
54 |
<img src="{LOGO}" style="width: 100%; height: auto;"/>
|
55 |
</div>
|
56 |
"""
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
def convert_to_wav(filepath):
|
60 |
_,file_ending = os.path.splitext(f'{filepath}')
|
@@ -134,12 +139,18 @@ def build_similarity_matrix(sentences, stop_words):
|
|
134 |
similarity_matrix.add_edge(i, j, weight=len(common_words))
|
135 |
return similarity_matrix
|
136 |
|
137 |
-
#
|
138 |
def graph_based_summary(text, num_paragraphs=3):
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
doc = nlp(text)
|
140 |
sentences = [sent.text for sent in doc.sents]
|
141 |
if len(sentences) < num_paragraphs:
|
142 |
-
return sentences
|
143 |
|
144 |
sentence_tokens = [nlp(sent) for sent in sentences]
|
145 |
stop_words = spacy_stop_words
|
@@ -152,10 +163,16 @@ def graph_based_summary(text, num_paragraphs=3):
|
|
152 |
|
153 |
# LexRank
|
154 |
def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
doc = nlp(text)
|
156 |
sentences = [sent.text for sent in doc.sents]
|
157 |
if len(sentences) < num_paragraphs:
|
158 |
-
return sentences
|
159 |
|
160 |
stop_words = spacy_stop_words
|
161 |
vectorizer = TfidfVectorizer(stop_words=list(stop_words))
|
@@ -171,10 +188,18 @@ def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
|
|
171 |
|
172 |
# TextRank
|
173 |
def text_rank_summary(text, num_paragraphs=3):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
doc = nlp(text)
|
175 |
sentences = [sent.text for sent in doc.sents]
|
176 |
if len(sentences) < num_paragraphs:
|
177 |
-
return sentences
|
178 |
|
179 |
stop_words = spacy_stop_words
|
180 |
vectorizer = TfidfVectorizer(stop_words=list(stop_words))
|
@@ -213,31 +238,60 @@ with iface:
|
|
213 |
gr.Markdown(HEADER_INFO)
|
214 |
|
215 |
with gr.Tabs():
|
216 |
-
with gr.TabItem("Transcription"):
|
217 |
-
audio_input = gr.Audio(type="filepath")
|
218 |
-
text_output = gr.Textbox(label="Text")
|
219 |
-
result_output = gr.Textbox(label="Transcription Details")
|
220 |
-
transcribe_button = gr.Button("Transcribe")
|
221 |
-
|
222 |
-
transcribe_button.click(fn=transcribe_audio, inputs=[audio_input], outputs=[text_output, result_output])
|
223 |
|
224 |
-
with gr.TabItem("Summary |
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
|
230 |
with gr.TabItem("Summary | LexRank"):
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
|
236 |
with gr.TabItem("Summary | TextRank"):
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
|
242 |
with gr.TabItem("Download PDF"):
|
243 |
pdf_text_only = gr.Button("Download PDF with Text Only")
|
@@ -247,7 +301,10 @@ with iface:
|
|
247 |
pdf_output = gr.File(label="Download PDF")
|
248 |
|
249 |
pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[text_output], outputs=[pdf_output])
|
250 |
-
pdf_summary_only.click(fn=lambda summary: save_to_pdf("", summary), inputs=[
|
251 |
-
pdf_both.click(fn=lambda text, summary: save_to_pdf(text, summary), inputs=[text_output,
|
|
|
|
|
|
|
252 |
|
253 |
-
iface.launch(share=True, debug=True)
|
|
|
54 |
<img src="{LOGO}" style="width: 100%; height: auto;"/>
|
55 |
</div>
|
56 |
"""
|
57 |
+
# Check if CUDA is available
|
58 |
+
if not torch.cuda.is_available():
|
59 |
+
raise RuntimeError("CUDA not available. Go look for a GPU.")
|
60 |
+
|
61 |
+
# Set GPU
|
62 |
+
device = "cuda"
|
63 |
|
64 |
def convert_to_wav(filepath):
|
65 |
_,file_ending = os.path.splitext(f'{filepath}')
|
|
|
139 |
similarity_matrix.add_edge(i, j, weight=len(common_words))
|
140 |
return similarity_matrix
|
141 |
|
142 |
+
# PageRank
|
143 |
def graph_based_summary(text, num_paragraphs=3):
|
144 |
+
"""
|
145 |
+
1. Constructs a token-based similarity matrix where sentences are nodes, with edge weights representing token overlap.
|
146 |
+
Applies PageRank to determine sentence importance, ranking nodes by their centrality in the graph.
|
147 |
+
Optimized for scenarios with dense inter-sentence relationships.
|
148 |
+
2. Bjørn: gir sammendrag av det som er mest relevant i teksten.
|
149 |
+
"""
|
150 |
doc = nlp(text)
|
151 |
sentences = [sent.text for sent in doc.sents]
|
152 |
if len(sentences) < num_paragraphs:
|
153 |
+
return ' '.join(sentences) # Adjusted to return a single string
|
154 |
|
155 |
sentence_tokens = [nlp(sent) for sent in sentences]
|
156 |
stop_words = spacy_stop_words
|
|
|
163 |
|
164 |
# LexRank
|
165 |
def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
|
166 |
+
"""
|
167 |
+
1. Uses TF-IDF vectorization to compute cosine similarity and builds a graph with sentences as nodes. PageRank then ranks sentences
|
168 |
+
by their eigenvector centrality, identifying the most representative ones. Good for sparse graph structures with thresholding.
|
169 |
+
2. Bjørn: gir et sammendrag som best representerer hele teksten.
|
170 |
+
|
171 |
+
"""
|
172 |
doc = nlp(text)
|
173 |
sentences = [sent.text for sent in doc.sents]
|
174 |
if len(sentences) < num_paragraphs:
|
175 |
+
return ' '.join(sentences) # Adjusted to return a single string
|
176 |
|
177 |
stop_words = spacy_stop_words
|
178 |
vectorizer = TfidfVectorizer(stop_words=list(stop_words))
|
|
|
188 |
|
189 |
# TextRank
|
190 |
def text_rank_summary(text, num_paragraphs=3):
|
191 |
+
"""
|
192 |
+
1. Constructs a graph with sentences as nodes (does not use TF-IDF), connected by weighted edges based on lexical similarity
|
193 |
+
(counts the number of shared words between two sentences and divides by the total number of words to get a similarity score).
|
194 |
+
|
195 |
+
|
196 |
+
Suitable for general-purpose text summarization.
|
197 |
+
2. Bjørn: gir et sammendrag som fanger opp de viktigste delene av teksten.
|
198 |
+
"""
|
199 |
doc = nlp(text)
|
200 |
sentences = [sent.text for sent in doc.sents]
|
201 |
if len(sentences) < num_paragraphs:
|
202 |
+
return ' '.join(sentences) # Adjusted to return a single string
|
203 |
|
204 |
stop_words = spacy_stop_words
|
205 |
vectorizer = TfidfVectorizer(stop_words=list(stop_words))
|
|
|
238 |
gr.Markdown(HEADER_INFO)
|
239 |
|
240 |
with gr.Tabs():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
|
242 |
+
with gr.TabItem("Summary | PageRank"):
|
243 |
+
text_input_graph = gr.Textbox(label="Input Text", placeholder="txt2summarize")
|
244 |
+
summary_output_graph = gr.Textbox(label="PageRank | token-based similarity")
|
245 |
+
|
246 |
+
# Displaying AiLab and Bjørn explanations
|
247 |
+
gr.Markdown("""
|
248 |
+
**token-based**: similarity matrix edge weights representing token overlap/
|
249 |
+
ranked by their centrality in the graph (good with dense inter-sentence relationships)
|
250 |
+
""")
|
251 |
+
gr.Markdown("""
|
252 |
+
*Bjørn*: **gir sammendrag som fanger opp de mest relevante setninger i teksten**
|
253 |
+
""")
|
254 |
+
|
255 |
+
summarize_transcribed_button_graph = gr.Button("Summary of Transcribed Text, Click Here")
|
256 |
+
summarize_transcribed_button_graph.click(fn=lambda text: graph_based_summary(text), inputs=[text_output], outputs=[summary_output_graph])
|
257 |
+
summarize_uploaded_button_graph = gr.Button("Upload Text to Summarize, Click Here")
|
258 |
+
summarize_uploaded_button_graph.click(fn=graph_based_summary, inputs=[text_input_graph], outputs=[summary_output_graph])
|
259 |
|
260 |
with gr.TabItem("Summary | LexRank"):
|
261 |
+
text_input_lex = gr.Textbox(label="Input Text", placeholder="txt2summarize")
|
262 |
+
summary_output_lex = gr.Textbox(label="LexRank | cosine similarity")
|
263 |
+
|
264 |
+
# Displaying AiLab and Bjørn explanations
|
265 |
+
gr.Markdown("""
|
266 |
+
**semantic**: TF-IDF vectorization@cosine similarity matrix, ranked by eigenvector centrality.
|
267 |
+
(good for sparse graph structures with thresholding)
|
268 |
+
""")
|
269 |
+
gr.Markdown("""
|
270 |
+
*Bjørn*: **gir sammendrag som best fanger opp betydningen av hele teksten**
|
271 |
+
""")
|
272 |
+
|
273 |
+
summarize_transcribed_button_lex = gr.Button("Summary of Transcribed Text, Click Here")
|
274 |
+
summarize_transcribed_button_lex.click(fn=lambda text: lex_rank_summary(text), inputs=[text_output], outputs=[summary_output_lex])
|
275 |
+
summarize_uploaded_button_lex = gr.Button("Upload Text to Summarize, Click Here")
|
276 |
+
summarize_uploaded_button_lex.click(fn=lex_rank_summary, inputs=[text_input_lex], outputs=[summary_output_lex])
|
277 |
|
278 |
with gr.TabItem("Summary | TextRank"):
|
279 |
+
text_input_text_rank = gr.Textbox(label="Input Text", placeholder="txt2summarize")
|
280 |
+
summary_output_text_rank = gr.Textbox(label="TextRank | lexical similarity")
|
281 |
+
|
282 |
+
# Displaying AiLab and Bjørn explanations
|
283 |
+
gr.Markdown("""
|
284 |
+
**sentence**: graph with weighted edges based on lexical similarity. (i.e" "sentence similarity"word overlap)/sentence similarity
|
285 |
+
""")
|
286 |
+
gr.Markdown("""
|
287 |
+
*Bjørn*: **sammendrag basert på i de setningene som ligner mest på hverandre fra teksten**
|
288 |
+
|
289 |
+
""")
|
290 |
+
|
291 |
+
summarize_transcribed_button_text_rank = gr.Button("Summary of Transcribed Text, Click Here")
|
292 |
+
summarize_transcribed_button_text_rank.click(fn=lambda text: text_rank_summary(text), inputs=[text_output], outputs=[summary_output_text_rank])
|
293 |
+
summarize_uploaded_button_text_rank = gr.Button("Upload Text to Summarize, Click Here")
|
294 |
+
summarize_uploaded_button_text_rank.click(fn=text_rank_summary, inputs=[text_input_text_rank], outputs=[summary_output_text_rank])
|
295 |
|
296 |
with gr.TabItem("Download PDF"):
|
297 |
pdf_text_only = gr.Button("Download PDF with Text Only")
|
|
|
301 |
pdf_output = gr.File(label="Download PDF")
|
302 |
|
303 |
pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[text_output], outputs=[pdf_output])
|
304 |
+
# Gradio calls the handler with one positional argument per input component,
# so the callback must accept all three summaries (a one-parameter lambda
# raises TypeError on click). Summaries that are still empty are skipped and
# the rest are joined into a single text for the PDF.
pdf_summary_only.click(
    fn=lambda *summaries: save_to_pdf("", "\n\n".join(s for s in summaries if s)),
    inputs=[summary_output_graph, summary_output_lex, summary_output_text_rank],
    outputs=[pdf_output],
)
|
305 |
+
pdf_both.click(fn=lambda text, summary: save_to_pdf(text, summary), inputs=[text_output, summary_output_graph], outputs=[pdf_output]) # Defaulting to Graph-based summary
|
306 |
+
|
307 |
+
|
308 |
+
|
309 |
|
310 |
+
iface.launch(share=True, debug=True)
|