multi error clusters
app.py CHANGED
@@ -3,7 +3,7 @@
 import numpy as np
 import pandas as pd
 import torch
-import 
+import math
 from tqdm import tqdm
 from math import floor
 from datasets import load_dataset
@@ -104,7 +104,8 @@ def quant_panel(embedding_df):
     st.markdown("* Each **point** is an input example.")
     st.markdown("* Gray points have low-loss and the colored have high-loss. High-loss instances are clustered using **kmeans** and each color represents a cluster.")
     st.markdown("* The **shape** of each point reflects the label category -- positive (diamond) or negative sentiment (circle).")
-    st.altair_chart(data_comparison(down_samp(embedding_df)), use_container_width=True)
+    #st.altair_chart(data_comparison(down_samp(embedding_df)), use_container_width=True)
+    st.altair_chart(data_comparison(embedding_df), use_container_width=True)
 
 
 def frequent_tokens(data, tokenizer, loss_quantile=0.95, top_k=200, smoothing=0.005):
@@ -136,7 +137,7 @@ def frequent_tokens(data, tokenizer, loss_quantile=0.95, top_k=200, smoothing=0.
     for i, (token) in enumerate(tokens_sorted[:top_k]):
         top_tokens.append(['%10s' % (tokenizer.decode(token)), '%.4f' % (token_frequencies[token]), '%.4f' % (
             token_frequencies_error[token]), '%4.2f' % (token_lrs[token])])
-    return pd.DataFrame(top_tokens, columns=['Token', 'Freq', 'Freq error slice', '
+    return pd.DataFrame(top_tokens, columns=['Token', 'Freq', 'Freq error slice', 'Ratio w/ smoothing'])
 
 
 @st.cache(ttl=600)
@@ -160,12 +161,12 @@ def clustering(data,num_clusters):
     return data, assigned_clusters
 
 def kmeans(df, num_clusters=3):
-    data_hl = df.loc[df['slice'] == 'high-loss']
-    data_kmeans,clusters = clustering(
-    merged = pd.merge(df, data_kmeans, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
-    merged.drop(merged.filter(regex='_y$').columns.tolist(),axis=1,inplace=True)
-    merged['cluster'] = merged['cluster'].fillna(num_clusters).astype('int')
-    return 
+    #data_hl = df.loc[df['slice'] == 'high-loss']
+    data_kmeans,clusters = clustering(df,num_clusters)
+    #merged = pd.merge(df, data_kmeans, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
+    #merged.drop(merged.filter(regex='_y$').columns.tolist(),axis=1,inplace=True)
+    #merged['cluster'] = merged['cluster'].fillna(num_clusters).astype('int')
+    return data_kmeans
 
 def distance_from_centroid(row):
     return sdist.norm(row['embedding'] - row['centroid'].tolist())
@@ -173,16 +174,16 @@ def distance_from_centroid(row):
 @st.cache(ttl=600)
 def topic_distribution(weights, smoothing=0.01):
     topic_frequencies = defaultdict(float)
-    
+    topic_frequencies_error= defaultdict(float)
     weights_uniform = np.full_like(weights, 1 / len(weights))
     num_examples = len(weights)
     for i in range(num_examples):
         example = dataset[i]
         category = example['title']
         topic_frequencies[category] += weights_uniform[i]
-    
+        topic_frequencies_error[category] += weights[i]
 
-    topic_ratios = {c: (smoothing + 
+    topic_ratios = {c: (smoothing + topic_frequencies_error[c]) / (
         smoothing + topic_frequencies[c]) for c in topic_frequencies}
 
     categories_sorted = map(lambda x: x[0], sorted(
@@ -191,11 +192,9 @@ def topic_distribution(weights, smoothing=0.01):
     topic_distr = []
     for category in categories_sorted:
         topic_distr.append(['%.3f' % topic_frequencies[category], '%.3f' %
-        
+                            topic_frequencies_error[category], '%.2f' % topic_ratios[category], '%s' % category])
 
     return pd.DataFrame(topic_distr, columns=['Overall frequency', 'Error frequency', 'Ratio', 'Category'])
-    # for category in categories_sorted:
-    #     return(topic_frequencies[category], topic_frequencies_spotlight[category], topic_ratios[category], category)
 
 def populate_session(dataset,model):
     data_df = read_file_to_df('./assets/data/'+dataset+ '_'+ model+'.parquet')
@@ -239,13 +238,17 @@ if __name__ == "__main__":
     #populate_session(dataset, model)
     data_df = read_file_to_df('./assets/data/'+dataset+ '_'+ model+'.parquet')
     loss_quantile = st.sidebar.slider(
-        "Loss Quantile", min_value=0.5, max_value=1.0,step=0.01,value=0.
+        "Loss Quantile", min_value=0.5, max_value=1.0,step=0.01,value=0.99
     )
+    data_df = data_df.drop(data_df[data_df.pred == data_df.label].index) #drop rows that are not errors
     data_df['loss'] = data_df['loss'].astype(float)
     losses = data_df['loss']
     high_loss = losses.quantile(loss_quantile)
     data_df['slice'] = 'high-loss'
     data_df['slice'] = data_df['slice'].where(data_df['loss'] > high_loss, 'low-loss')
+    data_hl = data_df.drop(data_df[data_df['slice'] == 'low-loss'].index) #drop rows that are not hl
+    data_ll = data_df.drop(data_df[data_df['slice'] == 'high-loss'].index)
+    df_list = [d for _, d in data_hl.groupby(['label'])] # this is to allow clustering over each error type. fp, fn for binary classification
 
     with lcol:
         st.markdown('<h3>Error Slices</h3>',unsafe_allow_html=True)
@@ -279,7 +282,16 @@ if __name__ == "__main__":
 
     if run_kmeans == 'True':
         with st.spinner(text='running kmeans...'):
-            merged = 
+            merged = pd.DataFrame()
+            ind=0
+            for df in df_list:
+                #num_clusters= int(math.sqrt(len(df)/2))
+                kmeans_df = kmeans(df,num_clusters=num_clusters)
+                #print(kmeans_df.loc[kmeans_df['cluster'].idxmax()])
+                kmeans_df['cluster'] = kmeans_df['cluster'] + ind*num_clusters
+                ind = ind+1
+                merged = pd.concat([merged, kmeans_df])
+            merged = pd.concat([merged, data_ll])
 
     with st.spinner(text='loading visualization...'):
         quant_panel(merged)