Duplicate from pleonova/multi-label-summary-text
Browse filesCo-authored-by: Paula Leonova <[email protected]>
- .github/workflows/file_size.yml +16 -0
- .github/workflows/main.yml +19 -0
- .gitignore +4 -0
- README.md +22 -0
- app.py +366 -0
- app_output/20211215_output_example.pdf +0 -0
- app_output/20211223_output_example.pdf +0 -0
- app_output/20220105_output_example.pdf +0 -0
- example_long_text.txt +18 -0
- examples.json +6 -0
- models.py +96 -0
- requirements.txt +6 -0
- utils.py +85 -0
.github/workflows/file_size.yml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Workflow: fail a pull request when it adds files too large to sync to HF Spaces.
name: Check file size

on: # or directly `on: [push]` to run the action on every push on any branch
  pull_request:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  # NOTE(review): job id "sync-to-hub" appears copy-pasted from the sync
  # workflow — this job only checks file sizes. The id is not referenced
  # elsewhere, so renaming it would be behavior-neutral; confirm before doing so.
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - name: Check large files
        uses: ActionsDesk/[email protected]
        with:
          filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
|
.github/workflows/main.yml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Workflow: mirror the main branch to the Hugging Face Spaces remote on every push.
name: Sync to Hugging Face hub
on:
  push:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      # NOTE(review): actions/checkout@v2 is an old major version — confirm
      # whether bumping is safe for this repo before changing it.
      - uses: actions/checkout@v2
        with:
          # full history so the push below is not rejected as a shallow push
          fetch-depth: 0
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_MULTI_LABEL }}
        run: git push https://pleonova:[email protected]/spaces/pleonova/multi-label-summary-text main
|
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
.DS_Store
|
3 |
+
.gitattributes
|
4 |
+
.idea/
|
README.md
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Multi Label Summary Text
|
3 |
+
emoji: 📚
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: gray
|
6 |
+
sdk: streamlit
|
7 |
+
app_file: app.py
|
8 |
+
pinned: false
|
9 |
+
duplicated_from: pleonova/multi-label-summary-text
|
10 |
+
---
|
11 |
+
|
12 |
+
#### Interactive version
|
13 |
+
This app is hosted on HuggingFace spaces: https://huggingface.co/spaces/pleonova/multi-label-summary-text
|
14 |
+
|
15 |
+
#### Objective
|
16 |
+
The goal of this app is to identify multiple relevant labels for long text.
|
17 |
+
|
18 |
+
#### Model
|
19 |
+
facebook/bart-large-mnli zero-shot transfer-learning summarizer and classifier
|
20 |
+
|
21 |
+
#### Approach
|
22 |
+
Updating the head of the neural network, we can use the same pretrained bart model to first summarize our long text by splitting it into chunks of 1024 tokens and then generating a summary for each of the text chunks. Next, all the summaries are concatenated and the bart model is used to classify the summarized text. Alternatively, one can also classify the whole text as is.
|
app.py
ADDED
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from os import write
|
3 |
+
import time
|
4 |
+
import pandas as pd
|
5 |
+
import base64
|
6 |
+
from typing import Sequence
|
7 |
+
import streamlit as st
|
8 |
+
from sklearn.metrics import classification_report
|
9 |
+
|
10 |
+
|
11 |
+
# from models import create_nest_sentences, load_summary_model, summarizer_gen, load_model, classifier_zero
|
12 |
+
import models as md
|
13 |
+
from utils import examples_load, example_long_text_load
|
14 |
+
import json
|
15 |
+
|
16 |
+
# Load the bundled example text, its license, and the example (and ground-truth)
# label strings shipped with the app.
ex_text, ex_license, ex_labels, ex_glabels = examples_load()
ex_long_text = example_long_text_load()


# if __name__ == '__main__':
###################################
######## App Description ##########
###################################
# Static page header: explains the pipeline (summarize -> classify; keywords
# are generated independently) and what the user supplies and receives.
st.markdown("### Long Text Summarization & Multi-Label Classification")
st.write("This app summarizes and then classifies your long text(s) with multiple labels using [BART Large CNN](https://huggingface.co/facebook/bart-large-cnn) for the summarization task and [BART Large MNLI](https://huggingface.co/facebook/bart-large-mnli) for the multi-labels matching. The keywords are independently generated using [KeyBERT](https://github.com/MaartenGr/KeyBERT) and not used in any downstream tasks.")
st.write("__Inputs__: User enters their own custom text(s) and labels.")
st.write("__Outputs__: A summary of the text, likelihood match score for each label and a downloadable csv of the results. \
Includes additional options to generate a list of keywords and/or evaluate results against a list of ground truth labels, if available.")



###################################
######## Example Input ##########
###################################
# Pre-fills the form with a Frankenstein excerpt plus example labels when the
# user clicks "See Example"; otherwise every input starts empty.
example_button = st.button(label='See Example')
if example_button:
    example_text = ex_long_text #ex_text
    # NOTE(review): the lone '"' concatenated below produces an unmatched
    # quotation mark in the displayed text — confirm whether it is intentional.
    display_text = 'Excerpt from Frankenstein:' + example_text + '"\n\n' + "[This is an excerpt from Project Gutenberg's Frankenstein. " + ex_license + "]"
    input_labels = ex_labels
    input_glabels = ex_glabels
    title_name = 'Frankenstein, Chapter 3'
else:
    # Empty defaults: the user supplies their own text and labels.
    display_text = ''
    input_labels = ''
    input_glabels = ''
    title_name = 'Submitted Text'
|
47 |
+
|
48 |
+
|
49 |
+
|
50 |
+
with st.form(key='my_form'):
    ###################################
    ######## Form: Step 1 ##########
    ###################################
    # Step 1: free-text area plus optional bulk upload (.txt files or a
    # two-column CSV), and Yes/No toggles for the keyword and summary stages.
    st.markdown("##### Step 1: Upload Text")
    text_input = st.text_area("Input any text you want to summarize & classify here (keep in mind very long text will take a while to process):", display_text)

    text_csv_expander = st.expander(label=f'Want to upload multiple texts at once? Expand to upload your text files below.', expanded=False)
    with text_csv_expander:
        st.markdown('##### Choose one of the options below:')
        st.write("__Option A:__")
        uploaded_text_files = st.file_uploader(label="Upload file(s) that end with the .txt suffix",
                                               accept_multiple_files=True, key = 'text_uploader',
                                               type='txt')
        st.write("__Option B:__")
        uploaded_csv_text_files = st.file_uploader(label='Upload a CSV file with two columns: "title" and "text"',
                                                   accept_multiple_files=False, key = 'csv_text_uploader',
                                                   type='csv')

    # If the displayed example text is still untouched in the text area, swap
    # in the raw example (without the display-only framing/license text).
    if text_input == display_text and display_text != '':
        text_input = example_text

    gen_keywords = st.radio(
        "Generate keywords from text? (independent from the input labels below)",
        ('Yes', 'No')
    )

    gen_summary = st.radio(
        "Generate summary from text? (recommended for label matching below, but will take longer)",
        ('Yes', 'No')
    )

    ###################################
    ######## Form: Step 2 ##########
    ###################################
    # Step 2: candidate labels, typed comma-separated or uploaded as a
    # one-column CSV; duplicates and surrounding whitespace are dropped.
    st.write('\n')
    st.markdown("##### Step 2: Enter Labels")
    labels = st.text_input('Enter possible topic labels, which can be either keywords and/or general themes (comma-separated):',input_labels, max_chars=2000)
    labels = list(set([x.strip() for x in labels.strip().split(',') if len(x.strip()) > 0]))

    labels_csv_expander = st.expander(label=f'Prefer to upload a list of labels instead? Click here to upload your CSV file.',expanded=False)
    with labels_csv_expander:
        uploaded_labels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
                                                key='labels_uploader')

    ###################################
    ######## Form: Step 3 ##########
    ###################################
    # Step 3 (optional): ground-truth labels for evaluating the predictions,
    # typed or uploaded (single text: one column; multi-text: title/label).
    st.write('\n')
    st.markdown("##### Step 3: Provide Ground Truth Labels (_Optional_)")
    glabels = st.text_input('If available, enter ground truth topic labels to evaluate results, otherwise leave blank (comma-separated):',input_glabels, max_chars=2000)
    glabels = list(set([x.strip() for x in glabels.strip().split(',') if len(x.strip()) > 0]))


    glabels_csv_expander = st.expander(label=f'Have a file with labels for the text? Click here to upload your CSV file.', expanded=False)
    with glabels_csv_expander:
        st.markdown('##### Choose one of the options below:')
        st.write("__Option A:__")
        uploaded_onetext_glabels_file = st.file_uploader("Single Text: Choose a CSV file with one column and no header, where each cell is a separate label",
                                                         key = 'onetext_glabels_uploader')
        st.write("__Option B:__")
        uploaded_multitext_glabels_file = st.file_uploader('Multiple Text: Choose a CSV file with two columns "title" and "label", with the cells in the title column matching the name of the files uploaded in step #1.',
                                                           key = 'multitext_glabels_uploader')


    # threshold_value = st.slider(
    #     'Select a threshold cutoff for matching percentage (used for ground truth label evaluation)',
    #     0.0, 1.0, (0.5))

    submit_button = st.form_submit_button(label='Submit')

    st.write("_For improvments/suggestions, please file an issue here: https://github.com/pleonova/multi-label-summary-text_")
|
122 |
+
|
123 |
+
|
124 |
+
###################################
####### Model Load Time #########
###################################
# Load (and, via the st.cache decorators in models.py, memoize) the three
# models up front, timing each load individually.
with st.spinner('Loading pretrained models...'):
    start = time.time()
    summarizer = md.load_summary_model()
    s_time = round(time.time() - start,4)

    start = time.time()
    classifier = md.load_model()
    c_time = round(time.time() - start,4)

    start = time.time()
    kw_model = md.load_keyword_model()
    k_time = round(time.time() - start,4)

    # NOTE(review): st.spinner() called outside a `with` block is a no-op —
    # this timing message is never shown. Presumably st.success/st.write was
    # intended (see the commented-out line below); confirm before changing.
    st.spinner(f'Time taken to load various models: {k_time}s for KeyBERT model & {s_time}s for BART summarizer mnli model & {c_time}s for BART classifier mnli model.')
    # st.success(None)
|
142 |
+
|
143 |
+
|
144 |
+
if submit_button or example_button:
    ###################################
    ######## Load Text Data #######
    ###################################
    # Nothing to process unless the user typed text or uploaded file(s).
    if len(text_input) == 0 and len(uploaded_text_files) == 0 and uploaded_csv_text_files is None:
        st.error("Enter some text to generate a summary")
    else:

        # Build a (title, text) dataframe from whichever input was provided;
        # the typed text area wins over either upload option.
        if len(text_input) != 0:
            text_df = pd.DataFrame.from_dict({'title': [title_name], 'text': [text_input]})

        # OPTION A: one row per uploaded .txt file, titled by filename.
        elif len(uploaded_text_files) != 0:
            st.markdown("### Text Inputs")
            st.write('Files concatenated into a dataframe:')
            file_names = []
            raw_texts = []
            for uploaded_file in uploaded_text_files:
                text = str(uploaded_file.read(), "utf-8")
                raw_texts.append(text)
                title_file_name = uploaded_file.name.replace('.txt','')
                file_names.append(title_file_name)
            text_df = pd.DataFrame({'title': file_names,
                'text': raw_texts})
            st.dataframe(text_df.head())
            st.download_button(
                label="Download data as CSV",
                data=text_df.to_csv().encode('utf-8'),
                file_name='title_text.csv',
                mime='text/csv',  # FIX: 'title_text/csv' is not a valid MIME type
            )
        # OPTION B: a ready-made title/text CSV.
        elif uploaded_csv_text_files is not None:
            text_df = pd.read_csv(uploaded_csv_text_files)

        # Which input was used? If the text area was used, ignore the 'title'
        # column in downstream groupings/sorts.
        if len(text_input) != 0:
            title_element = []
        else:
            title_element = ['title']


        ###################################
        ######## Text Chunks ##########
        ###################################
        with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
            # For each body of text, create text chunks of a certain token
            # size required for the transformer.
            text_chunks_lib = dict()
            # FIX: chunk count is accumulated here (unconditionally) so the
            # summary section below no longer NameErrors when the keyword
            # stage — which previously defined the counter — is skipped.
            total_text_chunks = 0
            for i in range(0, len(text_df)):
                nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)

                # For each chunk of sentences (within the token max)
                text_chunks = []
                for n in range(0, len(nested_sentences)):
                    tc = " ".join(map(str, nested_sentences[n]))
                    text_chunks.append(tc)
                title_entry = text_df['title'][i]
                text_chunks_lib[title_entry] = text_chunks
                total_text_chunks += len(text_chunks)


        ################################
        ######## Keywords ##########
        ################################
        if gen_keywords == 'Yes':
            st.markdown("### Top Keywords")
            with st.spinner("Generating keywords from text..."):

                kw_dict = dict()
                for key in text_chunks_lib:
                    keywords_list = []
                    for text_chunk in text_chunks_lib[key]:
                        keywords_list += md.keyword_gen(kw_model, text_chunk)
                    kw_dict[key] = dict(keywords_list)
                # Display as a dataframe
                kw_df0 = pd.DataFrame.from_dict(kw_dict).reset_index()
                kw_df0.rename(columns={'index': 'keyword'}, inplace=True)
                kw_df = pd.melt(kw_df0, id_vars=['keyword'], var_name='title', value_name='score').dropna()

                # Keep only reasonably confident keywords (score > 0.25).
                kw_column_list = ['keyword', 'score']
                kw_df = kw_df[kw_df['score'] > 0.25][title_element + kw_column_list].sort_values(title_element + ['score'], ascending=False).reset_index().drop(columns='index')

                st.dataframe(kw_df)
                st.download_button(
                    label="Download data as CSV",
                    data=kw_df.to_csv().encode('utf-8'),
                    file_name='title_keywords.csv',
                    mime='text/csv',  # FIX: valid MIME type
                )


        ###################################
        ########## Summarize ##########
        ###################################
        if gen_summary == 'Yes':
            st.markdown("### Summary")
            # FIX: uses total_text_chunks (computed during chunking) instead of
            # text_chunk_counter, which only existed when keywords were generated.
            with st.spinner(f'Generating summaries for {len(text_df)} texts consisting of a total of {total_text_chunks} chunks (this may take a minute)...'):
                sum_dict = dict()
                for i, key in enumerate(text_chunks_lib):
                    with st.expander(label=f'({i+1}/{len(text_df)}) Expand to see intermediate summary generation details for: {key}', expanded=False):
                        summary = []
                        for num_chunk, text_chunk in enumerate(text_chunks_lib[key]):
                            chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens=400, minimum_tokens=100)
                            summary.append(chunk_summary)

                            # FIX: denominators now reference *this* title's
                            # chunk list; len(text_chunks) was a stale leftover
                            # holding the last title processed during chunking.
                            st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks_lib[key])}" )
                            st.markdown(text_chunk)
                            st.markdown(f"###### Partial Summary {num_chunk+1}/{len(text_chunks_lib[key])}")
                            st.markdown(chunk_summary)

                        # Combine all the summaries into a list and compress into one document, again
                        final_summary = "\n\n".join(list(summary))
                        sum_dict[key] = [final_summary]

                sum_df = pd.DataFrame.from_dict(sum_dict).T.reset_index()
                sum_df.columns = ['title', 'summary_text']
                # TO DO: Make sure summary_text does not exceed the token length

                st.dataframe(sum_df)
                st.download_button(
                    label="Download data as CSV",
                    data=sum_df.to_csv().encode('utf-8'),
                    file_name='title_summary.csv',
                    mime='text/csv',  # FIX: valid MIME type
                )

        ###################################
        ########## Classifier #########
        ###################################
        if ((len(text_input) == 0 and uploaded_text_files is None and uploaded_csv_text_files is None)
            or (len(labels) == 0 and uploaded_labels_file is None)):
            st.error('Enter some text and at least one possible topic to see label predictions.')
        else:
            if gen_summary == 'Yes':
                st.markdown("### Top Label Predictions on Summary vs Full Text")
            else:
                st.markdown("### Top Label Predictions on Full Text")

            # Labels come from the uploaded CSV when provided, otherwise from
            # the parsed comma-separated text input.
            if uploaded_labels_file is not None:
                labels_df = pd.read_csv(uploaded_labels_file, header=None)
                label_list = labels_df.iloc[:, 0]
            else:
                label_list = labels

            with st.spinner('Matching labels...(may take some time)'):
                if gen_summary == 'Yes':
                    labels_sum_col_list = ['title', 'label', 'scores_from_summary']
                    labels_sum_df = pd.DataFrame(columns=labels_sum_col_list)

                labels_full_col_list = ['title', 'label', 'scores_from_full_text']
                labels_full_df = pd.DataFrame(columns=labels_full_col_list)

                for i in range(0, len(text_df)):
                    # Zero-shot classify the generated summary (if any)...
                    if gen_summary == 'Yes':
                        s_topics, s_scores = md.classifier_zero(classifier, sequence=sum_df['summary_text'][i], labels=label_list, multi_class=True)
                        ls_df = pd.DataFrame({'label': s_topics, 'scores_from_summary': s_scores})
                        ls_df['title'] = text_df['title'][i]
                        labels_sum_df = pd.concat([labels_sum_df, ls_df[labels_sum_col_list]])

                    # ...and always classify the full original text.
                    f_topics, f_scores = md.classifier_zero(classifier, sequence=text_df['text'][i], labels=label_list, multi_class=True)
                    lf_df = pd.DataFrame({'label': f_topics, 'scores_from_full_text': f_scores})
                    lf_df['title'] = text_df['title'][i]
                    labels_full_df = pd.concat([labels_full_df, lf_df[labels_full_col_list]])

                    with st.expander(f'({i+1}/{len(text_df)}) See intermediate label matching results for: {text_df["title"][i]}'):
                        if gen_summary == 'Yes':
                            st.dataframe(pd.merge(ls_df, lf_df, on=['title','label']))
                        else:
                            st.dataframe(lf_df)

                if gen_summary == 'Yes':
                    label_match_df = pd.merge(labels_sum_df, labels_full_df, on=['title', 'label'])
                else:
                    label_match_df = labels_full_df.copy()

                ###################################
                ####### Ground Truth Labels ######
                ###################################
                # Ground truth may come from the typed input, a one-column CSV
                # (single text), or a title/label CSV (multiple texts).
                if len(glabels) > 0:
                    gdata = pd.DataFrame({'label': glabels})
                    join_list = ['label']
                elif uploaded_onetext_glabels_file is not None:
                    gdata = pd.read_csv(uploaded_onetext_glabels_file, header=None)
                    join_list = ['label']
                    gdata.columns = join_list
                elif uploaded_multitext_glabels_file is not None:
                    gdata = pd.read_csv(uploaded_multitext_glabels_file)
                    join_list = ['title', 'label']
                    gdata.columns = join_list

                if len(glabels) > 0 or uploaded_onetext_glabels_file is not None or uploaded_multitext_glabels_file is not None:
                    # Left-join so every predicted label survives; unmatched
                    # predictions get correct_match = False.
                    gdata['correct_match'] = True
                    label_match_df = pd.merge(label_match_df, gdata, how='left', on=join_list)
                    label_match_df['correct_match'].fillna(False, inplace=True)

                st.dataframe(label_match_df) #.sort_values(['title', 'label'], ascending=[False, False]))
                st.download_button(
                    label="Download data as CSV",
                    data=label_match_df.to_csv().encode('utf-8'),
                    file_name='title_label_sum_full.csv',
                    mime='text/csv',  # FIX: valid MIME type
                )

            # (A commented-out classification_report evaluation section using
            # threshold_value was removed here as dead code.)
            st.success('All done!')
            st.balloons()
|
app_output/20211215_output_example.pdf
ADDED
Binary file (124 kB). View file
|
|
app_output/20211223_output_example.pdf
ADDED
Binary file (139 kB). View file
|
|
app_output/20220105_output_example.pdf
ADDED
Binary file (156 kB). View file
|
|
example_long_text.txt
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
I returned home not disappointed, for I have said that I had long considered those authors useless whom the professor reprobated; but I returned not at all the more inclined to recur to these studies in any shape. M. Krempe was a little squat man with a gruff voice and a repulsive countenance; the teacher, therefore, did not prepossess me in favour of his pursuits. In rather a too philosophical and connected a strain, perhaps, I have given an account of the conclusions I had come to concerning them in my early years. As a child I had not been content with the results promised by the modern professors of natural science. With a confusion of ideas only to be accounted for by my extreme youth and my want of a guide on such matters, I had retrod the steps of knowledge along the paths of time and exchanged the discoveries of recent inquirers for the dreams of forgotten alchemists. Besides, I had a contempt for the uses of modern natural philosophy. It was very different when the masters of the science sought immortality and power; such views, although futile, were grand; but now the scene was changed. The ambition of the inquirer seemed to limit itself to the annihilation of those visions on which my interest in science was chiefly founded. I was required to exchange chimeras of boundless grandeur for realities of little worth.
|
3 |
+
|
4 |
+
Such were my reflections during the first two or three days of my residence at Ingolstadt, which were chiefly spent in becoming acquainted with the localities and the principal residents in my new abode. But as the ensuing week commenced, I thought of the information which M. Krempe had given me concerning the lectures. And although I could not consent to go and hear that little conceited fellow deliver sentences out of a pulpit, I recollected what he had said of M. Waldman, whom I had never seen, as he had hitherto been out of town.
|
5 |
+
|
6 |
+
Partly from curiosity and partly from idleness, I went into the lecturing room, which M. Waldman entered shortly after. This professor was very unlike his colleague. He appeared about fifty years of age, but with an aspect expressive of the greatest benevolence; a few grey hairs covered his temples, but those at the back of his head were nearly black. His person was short but remarkably erect and his voice the sweetest I had ever heard. He began his lecture by a recapitulation of the history of chemistry and the various improvements made by different men of learning, pronouncing with fervour the names of the most distinguished discoverers. He then took a cursory view of the present state of the science and explained many of its elementary terms. After having made a few preparatory experiments, he concluded with a panegyric upon modern chemistry, the terms of which I shall never forget:
|
7 |
+
|
8 |
+
“The ancient teachers of this science,” said he, “promised impossibilities and performed nothing. The modern masters promise very little; they know that metals cannot be transmuted and that the elixir of life is a chimera but these philosophers, whose hands seem only made to dabble in dirt, and their eyes to pore over the microscope or crucible, have indeed performed miracles. They penetrate into the recesses of nature and show how she works in her hiding-places. They ascend into the heavens; they have discovered how the blood circulates, and the nature of the air we breathe. They have acquired new and almost unlimited powers; they can command the thunders of heaven, mimic the earthquake, and even mock the invisible world with its own shadows.”
|
9 |
+
|
10 |
+
Such were the professor’s words—rather let me say such the words of the fate—enounced to destroy me. As he went on I felt as if my soul were grappling with a palpable enemy; one by one the various keys were touched which formed the mechanism of my being; chord after chord was sounded, and soon my mind was filled with one thought, one conception, one purpose. So much has been done, exclaimed the soul of Frankenstein—more, far more, will I achieve; treading in the steps already marked, I will pioneer a new way, explore unknown powers, and unfold to the world the deepest mysteries of creation.
|
11 |
+
|
12 |
+
I closed not my eyes that night. My internal being was in a state of insurrection and turmoil; I felt that order would thence arise, but I had no power to produce it. By degrees, after the morning’s dawn, sleep came. I awoke, and my yesternight’s thoughts were as a dream. There only remained a resolution to return to my ancient studies and to devote myself to a science for which I believed myself to possess a natural talent. On the same day I paid M. Waldman a visit. His manners in private were even more mild and attractive than in public, for there was a certain dignity in his mien during his lecture which in his own house was replaced by the greatest affability and kindness. I gave him pretty nearly the same account of my former pursuits as I had given to his fellow professor. He heard with attention the little narration concerning my studies and smiled at the names of Cornelius Agrippa and Paracelsus, but without the contempt that M. Krempe had exhibited. He said that “These were men to whose indefatigable zeal modern philosophers were indebted for most of the foundations of their knowledge. They had left to us, as an easier task, to give new names and arrange in connected classifications the facts which they in a great degree had been the instruments of bringing to light. The labours of men of genius, however erroneously directed, scarcely ever fail in ultimately turning to the solid advantage of mankind.” I listened to his statement, which was delivered without any presumption or affectation, and then added that his lecture had removed my prejudices against modern chemists; I expressed myself in measured terms, with the modesty and deference due from a youth to his instructor, without letting escape (inexperience in life would have made me ashamed) any of the enthusiasm which stimulated my intended labours. I requested his advice concerning the books I ought to procure.
|
13 |
+
|
14 |
+
“I am happy,” said M. Waldman, “to have gained a disciple; and if your application equals your ability, I have no doubt of your success. Chemistry is that branch of natural philosophy in which the greatest improvements have been and may be made; it is on that account that I have made it my peculiar study; but at the same time, I have not neglected the other branches of science. A man would make but a very sorry chemist if he attended to that department of human knowledge alone. If your wish is to become really a man of science and not merely a petty experimentalist, I should advise you to apply to every branch of natural philosophy, including mathematics.”
|
15 |
+
|
16 |
+
He then took me into his laboratory and explained to me the uses of his various machines, instructing me as to what I ought to procure and promising me the use of his own when I should have advanced far enough in the science not to derange their mechanism. He also gave me the list of books which I had requested, and I took my leave.
|
17 |
+
|
18 |
+
Thus ended a day memorable to me; it decided my future destiny.
|
examples.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"text": "Such were the professor’s words—rather let me say such the words of the fate—enounced to destroy me. As he went on I felt as if my soul were grappling with a palpable enemy; one by one the various keys were touched which formed the mechanism of my being; chord after chord was sounded, and soon my mind was filled with one thought, one conception, one purpose. So much has been done, exclaimed the soul of Frankenstein—more, far more, will I achieve; treading in the steps already marked, I will pioneer a new way, explore unknown powers, and unfold to the world the deepest mysteries of creation.",
|
3 |
+
"long_text_license": "This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook.",
|
4 |
+
"labels":"Batman,Science,Sound,Light,Creation,Optics",
|
5 |
+
"ground_labels":"Science,Sound,Light"
|
6 |
+
}
|
models.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
from functools import lru_cache

import torch
import streamlit as st
from keybert import KeyBERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
6 |
+
|
7 |
+
|
8 |
+
# Reference: https://discuss.huggingface.co/t/summarization-on-long-documents/920/7
@lru_cache(maxsize=1)
def _nesting_tokenizer():
    # Load the BART-MNLI tokenizer once; subsequent calls reuse the cached instance
    # instead of re-instantiating it on every create_nest_sentences() call.
    return AutoTokenizer.from_pretrained('facebook/bart-large-mnli')


def create_nest_sentences(document: str, token_max_length: int = 1024):
    """Split *document* into lists ("nests") of sentences, each staying under
    a token budget so every nest fits a BART-sized model context window.

    Parameters
    ----------
    document : str
        Raw text; newlines are flattened to spaces before sentence splitting.
    token_max_length : int
        Per-nest token budget (default 1024, BART's maximum input length).

    Returns
    -------
    list[list[str]]
        Sentences grouped so each group's cumulative token count stays
        below ``token_max_length``.
    """
    nested = []
    sent = []
    length = 0
    tokenizer = _nesting_tokenizer()  # hoisted + cached: was reloaded per call

    # Heuristic sentence splitter: break on ".", "?" followed by whitespace and
    # a capital letter; the [^A-Z]. look-behind skips abbreviations like "U.S.".
    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
        # truncation/padding disabled: we need the sentence's true token count.
        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)['input_ids']
        length += len(tokens_in_sentence)

        if length < token_max_length:
            sent.append(sentence)
        else:
            # Guard against appending an empty nest when the very first
            # sentence alone exceeds the budget.
            if sent:
                nested.append(sent)
            sent = [sentence]
            # Bug fix: carry the new sentence's token count into the fresh
            # nest; resetting to 0 let subsequent nests overshoot the budget.
            length = len(tokens_in_sentence)

    if sent:
        nested.append(sent)
    return nested
|
29 |
+
|
30 |
+
# Reference: https://github.com/MaartenGr/KeyBERT
@st.cache(allow_output_mutation=True)
def load_keyword_model():
    """Build the KeyBERT keyword-extraction model (cached across Streamlit reruns)."""
    return KeyBERT()
|
35 |
+
|
36 |
+
def keyword_gen(kw_model, sequence: str):
    """Extract up to 10 diverse single-word keywords from *sequence*.

    Uses Maximal Marginal Relevance (use_mmr, diversity=0.5) so the returned
    keywords are spread out rather than near-duplicates; English stop words
    are excluded.  Returns KeyBERT's (keyword, score) pairs.
    """
    extraction_options = dict(
        keyphrase_ngram_range=(1, 1),
        stop_words='english',
        use_mmr=True,
        diversity=0.5,
        top_n=10,
    )
    return kw_model.extract_keywords(sequence, **extraction_options)
|
44 |
+
|
45 |
+
|
46 |
+
|
47 |
+
# Reference: https://huggingface.co/facebook/bart-large-mnli
@st.cache(allow_output_mutation=True)
def load_summary_model():
    """Build the BART-CNN summarization pipeline (cached across Streamlit reruns)."""
    return pipeline(task='summarization', model="facebook/bart-large-cnn")
|
53 |
+
|
54 |
+
# def load_summary_model():
|
55 |
+
# model_name = "facebook/bart-large-mnli"
|
56 |
+
# tokenizer = BartTokenizer.from_pretrained(model_name)
|
57 |
+
# model = BartForConditionalGeneration.from_pretrained(model_name)
|
58 |
+
# summarizer = pipeline(task='summarization', model=model, tokenizer=tokenizer, framework='pt')
|
59 |
+
# return summarizer
|
60 |
+
|
61 |
+
def summarizer_gen(summarizer, sequence: str, maximum_tokens: int, minimum_tokens: int):
    """Summarize *sequence* with the given HF summarization pipeline.

    Beam search (4 beams, length_penalty=2.0) with sampling disabled gives
    deterministic output; no_repeat_ngram_size=3 suppresses repeated phrases.
    Returns the generated summary text for the first (only) result.
    """
    generation_kwargs = {
        'num_beams': 4,
        'length_penalty': 2.0,
        'max_length': maximum_tokens,
        'min_length': minimum_tokens,
        'do_sample': False,
        'early_stopping': True,
        'no_repeat_ngram_size': 3,
    }
    results = summarizer(sequence, **generation_kwargs)
    first_result = results[0]
    return first_result.get('summary_text')
|
71 |
+
|
72 |
+
|
73 |
+
# # Reference: https://www.datatrigger.org/post/nlp_hugging_face/
|
74 |
+
# # Custom summarization pipeline (to handle long articles)
|
75 |
+
# def summarize(text, minimum_length_of_summary = 100):
|
76 |
+
# # Tokenize and truncate
|
77 |
+
# inputs = tokenizer_bart([text], truncation=True, max_length=1024, return_tensors='pt').to('cuda')
|
78 |
+
# # Generate summary
|
79 |
+
# summary_ids = model_bart.generate(inputs['input_ids'], num_beams=4, min_length = minimum_length_of_summary, max_length=400, early_stopping=True)
|
80 |
+
# # Untokenize
|
81 |
+
# return([tokenizer_bart.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0])
|
82 |
+
|
83 |
+
|
84 |
+
# Reference: https://huggingface.co/spaces/team-zero-shot-nli/zero-shot-nli/blob/main/utils.py
@st.cache(allow_output_mutation=True)
def load_model():
    """Build the BART-MNLI zero-shot classification pipeline (cached across reruns)."""
    checkpoint = "facebook/bart-large-mnli"
    mnli_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    mnli_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return pipeline(
        task='zero-shot-classification',
        model=mnli_model,
        tokenizer=mnli_tokenizer,
        framework='pt',
    )
|
92 |
+
|
93 |
+
def classifier_zero(classifier, sequence: str, labels: list, multi_class: bool):
    """Classify *sequence* against candidate *labels* with a zero-shot pipeline.

    multi_class=True lets labels score independently (multi-label) instead of
    competing in a softmax.  Returns (labels, scores) sorted by the pipeline,
    highest-confidence first.
    """
    result = classifier(sequence, labels, multi_label=multi_class)
    return result['labels'], result['scores']
|
96 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers[sentencepiece]==4.11.0
|
2 |
+
pandas==1.4.3
|
3 |
+
streamlit==1.10.0
|
4 |
+
torch==1.12.0
|
5 |
+
scikit-learn==1.1.1
|
6 |
+
KeyBERT==0.5.1
|
utils.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
# import plotly.express as px
|
5 |
+
# from plotly.subplots import make_subplots
|
6 |
+
import json
|
7 |
+
|
8 |
+
# Reference: https://huggingface.co/spaces/team-zero-shot-nli/zero-shot-nli/blob/main/utils.py
|
9 |
+
# def plot_result(top_topics, scores):
|
10 |
+
# top_topics = np.array(top_topics)
|
11 |
+
# scores = np.array(scores)
|
12 |
+
# scores *= 100
|
13 |
+
# fig = px.bar(x=np.around(scores,2), y=top_topics, orientation='h',
|
14 |
+
# labels={'x': 'Confidence Score', 'y': 'Label'},
|
15 |
+
# text=scores,
|
16 |
+
# range_x=(0,115),
|
17 |
+
# title='Predictions',
|
18 |
+
# color=np.linspace(0,1,len(scores)),
|
19 |
+
# color_continuous_scale='GnBu')
|
20 |
+
# fig.update(layout_coloraxis_showscale=False)
|
21 |
+
# fig.update_traces(texttemplate='%{text:0.1f}%', textposition='outside')
|
22 |
+
# st.plotly_chart(fig)
|
23 |
+
|
24 |
+
|
25 |
+
# def plot_dual_bar_chart(topics_summary, scores_summary, topics_text, scores_text):
|
26 |
+
# data1 = pd.DataFrame({'label': topics_summary, 'scores on summary': scores_summary})
|
27 |
+
# data2 = pd.DataFrame({'label': topics_text, 'scores on full text': scores_text})
|
28 |
+
# data = pd.merge(data1, data2, on = ['label'])
|
29 |
+
# data.sort_values('scores on summary', ascending = True, inplace = True)
|
30 |
+
|
31 |
+
# fig = make_subplots(rows=1, cols=2,
|
32 |
+
# subplot_titles=("Predictions on Summary", "Predictions on Full Text"),
|
33 |
+
# )
|
34 |
+
|
35 |
+
# fig1 = px.bar(x=round(data['scores on summary']*100, 2), y=data['label'], orientation='h',
|
36 |
+
# text=round(data['scores on summary']*100, 2),
|
37 |
+
# )
|
38 |
+
|
39 |
+
# fig2 = px.bar(x=round(data['scores on full text']*100,2), y=data['label'], orientation='h',
|
40 |
+
# text=round(data['scores on full text']*100,2),
|
41 |
+
# )
|
42 |
+
|
43 |
+
# fig.add_trace(fig1['data'][0], row=1, col=1)
|
44 |
+
# fig.add_trace(fig2['data'][0], row=1, col=2)
|
45 |
+
|
46 |
+
# fig.update_traces(texttemplate='%{text:0.1f}%', textposition='outside')
|
47 |
+
# fig.update_layout(height=600, width=700) #, title_text="Predictions for")
|
48 |
+
# fig.update_xaxes(range=[0,115])
|
49 |
+
# fig.update_xaxes(matches='x')
|
50 |
+
# fig.update_yaxes(showticklabels=False) # hide all the xticks
|
51 |
+
# fig.update_yaxes(showticklabels=True, row=1, col=1)
|
52 |
+
|
53 |
+
# st.plotly_chart(fig)
|
54 |
+
|
55 |
+
# def plot_dual_bar_chart(topics_summary, scores_summary, topics_text, scores_text):
|
56 |
+
# data1 = pd.DataFrame({'label': topics_summary, 'scores': scores_summary})
|
57 |
+
# data1['classification_on'] = 'summary'
|
58 |
+
# data2 = pd.DataFrame({'label': topics_text, 'scores': scores_text})
|
59 |
+
# data2['classification_on'] = 'full text'
|
60 |
+
# data = pd.concat([data1, data2])
|
61 |
+
# data['scores'] = round(data['scores']*100,2)
|
62 |
+
|
63 |
+
# fig = px.bar(
|
64 |
+
# data, x="scores", y="label", #orientation = 'h',
|
65 |
+
# labels={'x': 'Confidence Score', 'y': 'Label'},
|
66 |
+
# text=data['scores'],
|
67 |
+
# range_x=(0,115),
|
68 |
+
# color="label", barmode="group",
|
69 |
+
# facet_col="classification_on",
|
70 |
+
# category_orders={"classification_on": ["summary", "full text"]}
|
71 |
+
# )
|
72 |
+
# fig.update_traces(texttemplate='%{text:0.1f}%', textposition='outside')
|
73 |
+
|
74 |
+
# st.plotly_chart(fig)
|
75 |
+
|
76 |
+
|
77 |
+
def examples_load():
    """Load the bundled demo inputs from examples.json.

    Returns a 4-tuple: (example text, license blurb for the long-text example,
    comma-separated candidate labels, comma-separated ground-truth labels).
    """
    with open("examples.json") as handle:
        example = json.load(handle)
    return (
        example['text'],
        example['long_text_license'],
        example['labels'],
        example['ground_labels'],
    )
|
81 |
+
|
82 |
+
def example_long_text_load():
    """Return the full contents of the bundled example_long_text.txt file."""
    with open("example_long_text.txt", "r") as handle:
        return handle.read()
|