Ashoka74 committed on
Commit
c6253d0
·
verified ·
1 Parent(s): e2ecb84

Update uap_analyzer.py

Browse files
Files changed (1) hide show
  1. uap_analyzer.py +1012 -1010
uap_analyzer.py CHANGED
@@ -1,1010 +1,1012 @@
1
- import pandas as pd
2
- import numpy as np
3
- from sklearn.decomposition import PCA
4
- from sklearn.cluster import KMeans
5
- from cuml.manifold import umap
6
- from cuml.cluster import hdbscan
7
- import plotly.graph_objects as go
8
- from sentence_transformers import SentenceTransformer
9
- import torch
10
- with torch.no_grad():
11
- embed_model = SentenceTransformer('embaas/sentence-transformers-e5-large-v2')
12
- embed_model.to('cuda')
13
- from sentence_transformers.util import pytorch_cos_sim, pairwise_cos_sim
14
- #from stqdm.notebook import stqdm
15
- #stqdm.pandas()
16
- import logging
17
- import pandas as pd
18
- import numpy as np
19
- from sklearn.decomposition import PCA
20
- from sklearn.cluster import KMeans
21
- import plotly.graph_objects as go
22
- import plotly.express as px
23
- from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
24
- import numpy as np
25
- from Levenshtein import distance
26
- import logging
27
- from sklearn.metrics import confusion_matrix
28
- import seaborn as sns
29
- import matplotlib.pyplot as plt
30
- import xgboost as xgb
31
- from xgboost import plot_importance
32
- import matplotlib.pyplot as plt
33
- from sklearn.metrics import accuracy_score, confusion_matrix
34
- from scipy.stats import chi2_contingency
35
- import matplotlib.pyplot as plt
36
- import seaborn as sns
37
- from statsmodels.graphics.mosaicplot import mosaic
38
- import pickle
39
- import pandas as pd
40
- from sklearn.model_selection import train_test_split
41
- from sklearn.metrics import confusion_matrix
42
- import seaborn as sns
43
- import matplotlib.pyplot as plt
44
- import xgboost as xgb
45
- from xgboost import plot_importance
46
- import matplotlib.pyplot as plt
47
- from sklearn.metrics import accuracy_score, confusion_matrix
48
- from scipy.stats import chi2_contingency
49
- import matplotlib.pyplot as plt
50
- import seaborn as sns
51
- from statsmodels.graphics.mosaicplot import mosaic
52
- from statsmodels.api import stats
53
- import os
54
- import time
55
- import concurrent.futures
56
- from requests.exceptions import HTTPError
57
- from stqdm import stqdm
58
- stqdm.pandas()
59
- import json
60
- import pandas as pd
61
- from openai import OpenAI
62
- import numpy as np
63
- import matplotlib.pyplot as plt
64
- import squarify
65
- import matplotlib.colors as mcolors
66
- import textwrap
67
- import pandas as pd
68
- import streamlit as st
69
- st.set_option('deprecation.showPyplotGlobalUse', False)
70
-
71
-
72
- # Configure logging
73
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
74
-
75
class UAPAnalyzer:
    """
    Analyze and cluster one text column of a pandas DataFrame using sentence
    embeddings, dimensionality reduction, clustering and TF-IDF cluster naming.

    Attributes:
        data (pd.DataFrame): The dataset containing textual data for analysis.
        column (str): The name of the column in the DataFrame to be analyzed.
        embeddings (np.ndarray): The vector representations of textual data.
        reduced_embeddings (np.ndarray): The dimensionality-reduced embeddings.
        cluster_labels (np.ndarray): The labels assigned to each data point after clustering.
        cluster_terms (list): The list of terms associated with each cluster.
        tfidf_matrix (sparse matrix): The Term Frequency-Inverse Document Frequency (TF-IDF) matrix.
        models (dict): A dictionary to store trained machine learning models.
        evaluations (dict): A dictionary to store evaluation results of models.
        data_nums (pd.DataFrame): The DataFrame with numerical encoding of categorical data.
    """

    def __init__(self, data, column, has_embeddings=False):
        """
        Initializes the UAPAnalyzer with a dataset and a specified column for analysis.

        Args:
            data (pd.DataFrame): The dataset for analysis.
            column (str): The column within the dataset to analyze.
            has_embeddings (bool): Stored on the instance as-is. Note that
                preprocess_data() takes its own `has_embeddings` argument and
                does not read this attribute.
        """
        assert isinstance(data, pd.DataFrame), "Data must be a pandas DataFrame"
        assert column in data.columns, f"Column '{column}' not found in DataFrame"
        self.has_embeddings = has_embeddings
        self.data = data
        self.column = column
        self.embeddings = None
        self.reduced_embeddings = None
        self.cluster_labels = None
        self.cluster_names = None
        self.cluster_terms = None
        self.cluster_terms_embeddings = None
        self.tfidf_matrix = None
        self.models = {}  # To store trained models
        self.evaluations = {}  # To store evaluation results
        self.data_nums = None  # Encoded numerical data
        self.x_train = None
        self.y_train = None
        self.x_test = None
        self.y_test = None
        self.preds = None
        self.new_dataset = None
        # Module-level sentence-embedding model, moved to the GPU.
        self.model = embed_model
        self.model = self.model.to('cuda')

        logging.info("UAPAnalyzer initialized")

    def preprocess_data(self, trim=False, has_embeddings=False, top_n=32):
        """
        Optionally adds a trimmed copy of the column, then loads or computes embeddings.

        Args:
            trim (bool): When True, add a '<column>_revised' column in which
                every value outside the `top_n` most frequent ones is 'Other'.
            has_embeddings (bool): When True, read embeddings from the
                'embeddings' column of the data instead of computing them.
            top_n (int): The number of top labels to retain if trimming is enabled.
        """
        logging.info("Preprocessing data")

        if trim:
            # Identify the top labels based on value counts
            top_labels = self.data[self.column].value_counts().nlargest(top_n).index.tolist()
            # Keep top labels, collapse everything else to 'Other'
            self.data[f'{self.column}_revised'] = np.where(
                self.data[self.column].isin(top_labels), self.data[self.column], 'Other'
            )

        # Embeddings are computed from the ORIGINAL column (cast to str), not
        # from the '<column>_revised' column created above.
        string_data = self.data[self.column].astype(str)
        if has_embeddings:
            self.embeddings = self.data['embeddings'].to_list()
        else:
            self.embeddings = self._extract_embeddings(string_data)
        logging.info("Data preprocessing complete")

    def _extract_embeddings(self, data_column):
        """
        Extracts embeddings from the given data column.

        Args:
            data_column (pd.Series): The column from which to extract embeddings.

        Returns:
            The result of `embed_model.encode` on the column values.
        """
        logging.info("Extracting embeddings")
        return embed_model.encode(data_column.tolist(), show_progress_bar=True)

    def reduce_dimensionality(self, method='UMAP', n_components=2, **kwargs):
        """
        Reduces the dimensionality of embeddings using specified method.

        Args:
            method (str): The dimensionality reduction method to use ('UMAP' or 'PCA').
            n_components (int): The number of dimensions to reduce to.
            **kwargs: Additional keyword arguments; forwarded to UMAP only
                (the PCA branch does not receive them).

        Raises:
            ValueError: If `method` is neither 'UMAP' nor 'PCA'.
        """
        logging.info(f"Reducing dimensionality using {method}")
        if method == 'UMAP':
            reducer = umap.UMAP(n_components=n_components, **kwargs)
        elif method == 'PCA':
            reducer = PCA(n_components=n_components)
        else:
            raise ValueError("Unsupported dimensionality reduction method")

        self.reduced_embeddings = reducer.fit_transform(self.embeddings)
        logging.info(f"Dimensionality reduced using {method}")

    def cluster_data(self, method='HDBSCAN', **kwargs):
        """
        Clusters the reduced dimensionality data using the specified clustering method.

        Args:
            method (str): The clustering method to use ('HDBSCAN' or 'KMeans').
            **kwargs: Additional keyword arguments for the clustering method.

        Raises:
            ValueError: If `method` is neither 'HDBSCAN' nor 'KMeans'.
        """
        logging.info(f"Clustering data using {method}")
        if method == 'HDBSCAN':
            clusterer = hdbscan.HDBSCAN(**kwargs)
        elif method == 'KMeans':
            clusterer = KMeans(**kwargs)
        else:
            raise ValueError("Unsupported clustering method")

        clusterer.fit(self.reduced_embeddings)
        self.cluster_labels = clusterer.labels_
        logging.info(f"Data clustering complete using {method}")

    def get_tf_idf_clusters(self, top_n=2):
        """
        Names clusters using the highest mean-TF-IDF terms of their documents.

        One name is produced per non-noise cluster id (label -1 is skipped),
        in ascending cluster-id order, and the names are stored in
        `self.cluster_terms` as a pandas Categorical. The fitted TF-IDF matrix
        is stored in `self.tfidf_matrix`.

        Args:
            top_n (int): The number of top terms to consider for naming each cluster.
        """
        logging.info("Naming clusters based on top TF-IDF terms.")

        # Ensure data has been clustered
        assert self.cluster_labels is not None, "Data has not been clustered yet."
        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

        # Fit the vectorizer to the text data and transform it into a TF-IDF matrix
        tfidf_matrix = vectorizer.fit_transform(self.data[self.column].astype(str))
        self.tfidf_matrix = tfidf_matrix
        # The vocabulary is the same for every cluster; fetch it once.
        terms = vectorizer.get_feature_names_out()

        cluster_names = []
        for cluster_id in np.unique(self.cluster_labels):
            # Skip noise if present (-1 in HDBSCAN)
            if cluster_id == -1:
                continue

            # Rows (documents) belonging to the current cluster
            indices = np.where(self.cluster_labels == cluster_id)[0]

            # Mean TF-IDF score of each term over the cluster's documents
            cluster_tfidf_mean = np.mean(tfidf_matrix[indices], axis=0)
            if hasattr(cluster_tfidf_mean, "toarray"):
                dense_mean = cluster_tfidf_mean.toarray().flatten()
            else:
                dense_mean = np.asarray(cluster_tfidf_mean).flatten()

            # argsort is ascending, so the strongest term comes last in the name
            top_n_indices = np.argsort(dense_mean)[-top_n:]
            cluster_names.append('-'.join(terms[i] for i in top_n_indices))

        # Convert the list of cluster terms to a categorical data type
        self.cluster_terms = pd.Categorical(cluster_names)
        logging.info("Cluster naming completed.")

    def merge_similar_clusters2(self, distance='cosine', char_diff_threshold=3, similarity_threshold=0.92):
        """
        Finds similar entries of `self.cluster_terms` and rewrites terms and labels in place.

        Args:
            distance (str): 'levenshtein' (edit distance between names) or
                'cosine' (cosine similarity of the names' embeddings).
            char_diff_threshold (int): Maximum edit distance for a Levenshtein merge.
            similarity_threshold (float): Cosine similarity above which two names merge.
        """
        from collections import defaultdict
        logging.info("Merging similar clusters based on distance: {}".format(distance))
        merge_mapping = defaultdict(set)

        if distance == 'levenshtein':
            # The `distance` argument is a string here, so the edit-distance
            # function must be imported under a different name.
            from Levenshtein import distance as levenshtein_distance
            for i, name1 in enumerate(self.cluster_terms):
                for name2 in self.cluster_terms[i + 1:]:
                    if levenshtein_distance(name1, name2) <= char_diff_threshold:
                        merge_mapping[name1].add(name2)
                        logging.info(f"Merging '{name2}' into '{name1}' based on Levenshtein distance")

        elif distance == 'cosine':
            # NOTE(review): the cached embeddings are reused even if
            # cluster_terms changed since they were computed - confirm intent.
            if self.cluster_terms_embeddings is None:
                self.cluster_terms_embeddings = embed_model.encode(self.cluster_terms)
            cos_sim_matrix = pytorch_cos_sim(self.cluster_terms_embeddings, self.cluster_terms_embeddings)
            n_terms = len(self.cluster_terms)
            for i in range(n_terms):
                for j in range(i + 1, n_terms):
                    if cos_sim_matrix[i][j] > similarity_threshold:
                        merge_mapping[self.cluster_terms[i]].add(self.cluster_terms[j])
                        logging.info(f"Merging cluster '{self.cluster_terms[j]}' into cluster '{self.cluster_terms[i]}'")

        self._update_cluster_terms_and_labels(merge_mapping)

    def _update_cluster_terms_and_labels(self, merge_mapping):
        """
        Applies a merge mapping to `self.cluster_terms` and rebuilds `self.cluster_labels`.

        Args:
            merge_mapping (dict): Maps a surviving name to the set of names
                merged into it.

        After the call, every merged-away name in `cluster_terms` is replaced
        by its surviving name, and `cluster_labels` is an integer array with
        one entry per element of `cluster_terms`: the index of the surviving
        name (numbered in `merge_mapping` insertion order), or -1 for terms
        that took part in no merge.
        """
        # Flatten the merge mapping to a simple name change mapping
        name_change_mapping = {old: new for new, olds in merge_mapping.items() for old in olds}

        # Number the surviving names deterministically (insertion order of the
        # mapping) instead of relying on set iteration order.
        surviving = set(name_change_mapping.values())
        surviving_ids = {}
        for name in merge_mapping:
            if name in surviving and name not in surviving_ids:
                surviving_ids[name] = len(surviving_ids)

        self.cluster_terms = [name_change_mapping.get(term, term) for term in self.cluster_terms]
        # An ndarray (not a list) so that `labels == cluster_id` comparisons
        # used by the plotting methods work element-wise.
        self.cluster_labels = np.array(
            [surviving_ids.get(term, -1) for term in self.cluster_terms], dtype=int
        )

        logging.info(f"Total clusters merged: {len(merge_mapping)}")

    @staticmethod
    def _labels_to_merged_terms(cluster_terms, cluster_labels, merge_map):
        """Translate cluster ids into (merged) cluster names; negative ids become 'Noise'."""
        merged_names = []
        for label in cluster_labels:
            label = int(label)
            if label < 0:
                # Noise has no entry in cluster_terms; negative indexing would
                # silently pick the last cluster's name.
                merged_names.append('Noise')
            else:
                merged_names.append(cluster_terms[merge_map.get(label, label)])
        return merged_names

    def cluster_levenshtein(self, cluster_terms, cluster_labels, char_diff_threshold=3):
        """
        Merges clusters whose names are within an edit distance and returns names per label.

        Args:
            cluster_terms: Cluster names, indexed by cluster id.
            cluster_labels: Cluster ids (e.g. one per data point); negative ids are noise.
            char_diff_threshold (int): Maximum Levenshtein distance for a merge.

        Returns:
            list: One cluster name per entry of `cluster_labels`, after merging.
        """
        from Levenshtein import distance  # Make sure to import the correct distance function

        merge_map = {}  # cluster id -> id of the cluster it is merged into
        n_terms = len(cluster_terms)
        for idx in range(n_terms):
            term1 = cluster_terms[idx]
            for jdx in range(idx + 1, n_terms):
                term2 = cluster_terms[jdx]
                if distance(term1, term2) <= char_diff_threshold:
                    # Point at the final target if idx was itself merged earlier.
                    merge_map[jdx] = merge_map.get(idx, idx)
                    logging.info(f"Merging '{term2}' into '{term1}'")
                    st.write(f"Merging '{term2}' into '{term1}'")
        return self._labels_to_merged_terms(cluster_terms, cluster_labels, merge_map)

    def cluster_cosine(self, cluster_terms, cluster_labels, similarity_threshold):
        """
        Merges clusters whose name embeddings are similar and returns names per label.

        Args:
            cluster_terms: Cluster names, indexed by cluster id.
            cluster_labels: Cluster ids (e.g. one per data point); negative ids are noise.
            similarity_threshold (float): Cosine similarity at or above which two names merge.

        Returns:
            list: One cluster name per entry of `cluster_labels`, after merging.
        """
        from sklearn.metrics.pairwise import cosine_similarity
        cluster_terms_embeddings = embed_model.encode(cluster_terms)
        # Pairwise cosine similarity between all cluster names
        cos_sim_matrix = cosine_similarity(cluster_terms_embeddings, cluster_terms_embeddings)

        merge_map = {}  # cluster id -> id of the cluster it is merged into
        n_terms = len(cluster_terms)
        # Upper triangle only: skips self-comparison and mirrored pairs
        for idx in range(n_terms):
            for jdx in range(idx + 1, n_terms):
                if cos_sim_matrix[idx, jdx] >= similarity_threshold:
                    # Point at the final target if idx was itself merged earlier.
                    merge_map[jdx] = merge_map.get(idx, idx)
                    st.write(f"Merging '{cluster_terms[jdx]}' into '{cluster_terms[idx]}'")
                    logging.info(f"Merging '{cluster_terms[jdx]}' into '{cluster_terms[idx]}'")
        return self._labels_to_merged_terms(cluster_terms, cluster_labels, merge_map)

    # NOTE: an earlier in-place definition of merge_similar_clusters with a
    # different signature used to sit above and was shadowed by this one;
    # merge_similar_clusters2 provides the in-place variant.
    def merge_similar_clusters(self, cluster_terms, cluster_labels, distance_type='cosine', char_diff_threshold=3, similarity_threshold=0.92):
        """
        Dispatches to cluster_levenshtein or cluster_cosine.

        Args:
            cluster_terms: Cluster names, indexed by cluster id.
            cluster_labels: Cluster ids to translate into merged names.
            distance_type (str): 'levenshtein' or 'cosine'.
            char_diff_threshold (int): Threshold for the Levenshtein variant.
            similarity_threshold (float): Threshold for the cosine variant.

        Returns:
            list: One cluster name per entry of `cluster_labels`.

        Raises:
            ValueError: If `distance_type` is not supported.
        """
        if distance_type == 'levenshtein':
            return self.cluster_levenshtein(cluster_terms, cluster_labels, char_diff_threshold)
        if distance_type == 'cosine':
            return self.cluster_cosine(cluster_terms, cluster_labels, similarity_threshold)
        raise ValueError("Unsupported distance type")

    def plot_embeddings2(self, title=None):
        """
        Plots one scatter trace per distinct value of `self.cluster_terms`.

        `self.cluster_terms` is compared element-wise against each distinct
        term and the matching positions index `self.reduced_embeddings`, so it
        is expected to hold one term per data point. The 'Noise' term is drawn
        in grey with lower opacity.

        Args:
            title (str): The title of the plot.
        """
        assert self.reduced_embeddings is not None, "Dimensionality reduction has not been performed yet."
        assert self.cluster_terms is not None, "Cluster TF-IDF analysis has not been performed yet."

        logging.info("Plotting embeddings with TF-IDF colors")

        fig = go.Figure()
        term_array = np.array(self.cluster_terms)
        hover_text = self.data[self.column]

        for cluster_term in np.unique(self.cluster_terms):
            # Computed for every term, including 'Noise'
            indices = np.where(term_array == cluster_term)[0]
            if cluster_term != 'Noise':
                marker = dict(size=5, opacity=0.8)
            else:
                marker = dict(size=5, opacity=0.5, color='grey')

            fig.add_trace(
                go.Scatter(
                    x=self.reduced_embeddings[indices, 0],
                    y=self.reduced_embeddings[indices, 1],
                    mode='markers',
                    marker=marker,
                    name=cluster_term,
                    # positional lookup: `indices` are row positions, not index labels
                    text=hover_text.iloc[indices],
                    hoverinfo='text',
                )
            )

        fig.update_layout(title=title, showlegend=True, legend_title_text='Top TF-IDF Terms')
        st.plotly_chart(fig, use_container_width=True)

    def plot_embeddings3(self, title=None):
        """
        Like plot_embeddings2, but with explicit colors from Plotly's qualitative palette.

        Non-noise terms cycle through `px.colors.qualitative.Plotly`; the
        'Noise' term is grey with lower opacity. Traces are ordered by the
        sorted order of the distinct terms.

        Args:
            title (str): The title of the plot; defaults to "Embeddings Visualized".
        """
        assert self.reduced_embeddings is not None, "Dimensionality reduction has not been performed yet."
        assert self.cluster_terms is not None, "Cluster TF-IDF analysis has not been performed yet."

        logging.info("Plotting embeddings with TF-IDF colors")

        fig = go.Figure()
        term_array = np.array(self.cluster_terms)
        unique_cluster_terms = np.unique(self.cluster_terms)
        terms_order = {term: i for i, term in enumerate(unique_cluster_terms)}
        color_map = px.colors.qualitative.Plotly  # Default color map from Plotly Express for consistency

        color_idx = 0
        for cluster_term in unique_cluster_terms:
            indices = np.where(term_array == cluster_term)[0]
            is_noise = cluster_term == 'Noise'
            if is_noise:
                marker_color = 'grey'
            else:
                marker_color = color_map[color_idx % len(color_map)]
                color_idx += 1

            fig.add_trace(
                go.Scatter(
                    x=self.reduced_embeddings[indices, 0],
                    y=self.reduced_embeddings[indices, 1],
                    mode='markers',
                    marker=dict(
                        size=5,
                        opacity=(0.5 if is_noise else 0.8),
                        color=marker_color
                    ),
                    name=cluster_term,
                    text=self.data[self.column].iloc[indices],
                    hoverinfo='text'
                )
            )
        fig.data = sorted(fig.data, key=lambda trace: terms_order[trace.name])
        fig.update_layout(title=title if title else "Embeddings Visualized", showlegend=True, legend_title_text='Top TF-IDF Terms')
        st.plotly_chart(fig, use_container_width=True)

    def plot_embeddings(self, title=None):
        """
        Plots the reduced dimensionality embeddings with clusters indicated.

        Args:
            title (str): The title of the plot.
        """
        # Ensure dimensionality reduction and TF-IDF based cluster naming have been performed
        assert self.reduced_embeddings is not None, "Dimensionality reduction has not been performed yet."
        assert self.cluster_terms is not None, "Cluster TF-IDF analysis has not been performed yet."

        logging.info("Plotting embeddings with TF-IDF colors")

        fig = go.Figure()
        labels = np.asarray(self.cluster_labels)

        # NOTE(review): sorted unique labels are paired positionally with
        # sorted unique terms; this only names clusters correctly when both
        # sort orders agree (and no noise label is present) - confirm intent.
        unique_cluster_terms = np.unique(self.cluster_terms)
        unique_cluster_labels = np.unique(labels)

        for cluster_id, cluster_term in zip(unique_cluster_labels, unique_cluster_terms):
            indices = np.where(labels == cluster_id)[0]

            # Plot points in the current cluster
            fig.add_trace(
                go.Scatter(
                    x=self.reduced_embeddings[indices, 0],
                    y=self.reduced_embeddings[indices, 1],
                    mode='markers',
                    marker=dict(
                        size=5,
                        opacity=0.8,
                    ),
                    name=cluster_term,
                    text=self.data[self.column].iloc[indices],
                    hoverinfo='text',
                )
            )

        fig.update_layout(title=title, showlegend=True, legend_title_text='Top TF-IDF Terms')
        st.plotly_chart(fig, use_container_width=True)
        logging.info("Embeddings plotted with TF-IDF colors")

    def plot_embeddings4(self, title=None, cluster_terms=None, cluster_labels=None, reduced_embeddings=None, column=None, data=None):
        """
        Plots explicitly supplied embeddings; does not read the instance's own state.

        Args:
            title (str): The title of the plot; defaults to "Embeddings Visualized".
            cluster_terms: Cluster names, indexed by cluster id.
            cluster_labels: Cluster id of every data point.
            reduced_embeddings: 2-D coordinates; columns 0 and 1 are plotted.
            column (str): Column of `data` used as hover text.
            data (pd.DataFrame): Frame providing the hover text.
        """
        # Ensure dimensionality reduction and TF-IDF based cluster naming have been performed
        assert reduced_embeddings is not None, "Dimensionality reduction has not been performed yet."
        assert cluster_terms is not None, "Cluster TF-IDF analysis has not been performed yet."

        logging.info("Plotting embeddings with TF-IDF colors")

        fig = go.Figure()

        # Element-wise `== cluster_id` below needs an array, not a plain list
        labels = np.asarray(cluster_labels)
        unique_cluster_ids = np.unique(labels)
        # NOTE(review): a noise id of -1 picks the LAST term via negative
        # indexing; noise is not treated specially here - confirm intent.
        unique_cluster_terms = [cluster_terms[i] for i in unique_cluster_ids]

        # Using Plotly Express's qualitative colors for consistency
        color_map = px.colors.qualitative.Plotly
        cluster_colors = {
            cid: color_map[position % len(color_map)]
            for position, cid in enumerate(unique_cluster_ids)
        }

        for cluster_id, cluster_term in zip(unique_cluster_ids, unique_cluster_terms):
            indices = np.where(labels == cluster_id)[0]
            fig.add_trace(
                go.Scatter(
                    x=reduced_embeddings[indices, 0],
                    y=reduced_embeddings[indices, 1],
                    mode='markers',
                    marker=dict(
                        color=cluster_colors[cluster_id],
                        size=5,
                        opacity=0.8
                    ),
                    name=cluster_term,
                    text=data[column].iloc[indices],  # Use the original column for hover text
                    hoverinfo='text',
                )
            )

        fig.update_layout(
            title=title if title else "Embeddings Visualized",
            showlegend=True,
            legend_title_text='Top TF-IDF Terms',
            legend=dict(
                traceorder='normal',  # traces appear in the order they are added
                itemsizing='constant'
            )
        )
        st.plotly_chart(fig, use_container_width=True)
        logging.info("Embeddings plotted with TF-IDF colors")
649
-
650
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
651
-
652
def analyze_and_predict(data, analyzers, col_names):
    """
    Collects each analyzer's cluster terms and tries to predict every column from the others.

    For every (column name, analyzer) pair an 'Analyzer_<column>' column is
    built from the analyzer's `cluster_terms`. Missing values become 'null',
    all columns are converted to categoricals, and for each column an XGBoost
    model is trained on the category codes of the remaining columns and its
    results are plotted. A failure while processing one column is logged and
    does not stop the others.

    Args:
        data (pd.DataFrame): The dataset for analysis (not used by this function).
        analyzers (list): A list of UAPAnalyzer instances.
        col_names (list): Column names to be analyzed and predicted.

    Returns:
        pd.DataFrame: The categorical frame of cluster terms, one column per analyzer.
    """
    new_data = pd.DataFrame()
    for column, analyzer in zip(col_names, analyzers):
        new_data[f'Analyzer_{column}'] = vars(analyzer)['cluster_terms']
        logging.info(f"Cluster terms extracted for {column}")

    new_data = new_data.fillna('null').astype('category')
    # Integer category codes are what the model is trained on
    data_nums = new_data.apply(lambda series: series.cat.codes)

    for col in data_nums.columns:
        try:
            categories = new_data[col].cat.categories
            features = data_nums.drop(columns=[col])
            target = data_nums[col]
            x_train, x_test, y_train, y_test = train_test_split(
                features, target, test_size=0.2, random_state=42
            )
            bst, accuracy, preds = train_xgboost(x_train, y_train, x_test, y_test, len(categories))
            plot_results(new_data, bst, x_test, y_test, preds, categories, accuracy, col)
        except Exception as e:
            # Best effort: report the failing column and continue with the next one
            logging.error(f"Error processing {col}: {e}")
    return new_data
678
-
679
def train_xgboost(x_train, y_train, x_test, y_test, num_classes, device='cuda', num_boost_round=100):
    """
    Trains a multi-class XGBoost model and evaluates its performance.

    Args:
        x_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training labels.
        x_test (pd.DataFrame): Test features.
        y_test (pd.Series): Test labels.
        num_classes (int): The number of unique classes in the target variable.
        device (str): Value of the XGBoost 'device' parameter; defaults to
            'cuda' as before.
        num_boost_round (int): Number of boosting rounds; defaults to 100 as before.

    Returns:
        bst (Booster): The trained XGBoost model.
        accuracy (float): The accuracy of the model on the test set.
        preds (np.ndarray): The predicted class for every test row.
    """
    # Build both matrices with the same categorical setting so train and test
    # features are interpreted identically.
    dtrain = xgb.DMatrix(x_train, label=y_train, enable_categorical=True)
    dtest = xgb.DMatrix(x_test, label=y_test, enable_categorical=True)

    params = {
        'device': device,
        'objective': 'multi:softmax',  # predict() returns class ids directly
        'num_class': num_classes,
        'max_depth': 6,
        'eta': 0.3,
    }
    bst = xgb.train(dtrain=dtrain, params=params, num_boost_round=num_boost_round)
    preds = bst.predict(dtest)
    accuracy = accuracy_score(y_test, preds)

    logging.info(f"XGBoost trained with accuracy: {accuracy:.2f}")
    return bst, accuracy, preds
705
-
706
def plot_results(new_data, bst, x_test, y_test, preds, categories, accuracy, col):
    """
    Plots the feature importance, confusion matrix, and contingency residuals.

    Three panels are drawn side by side: XGBoost gain importance, the
    confusion matrix of the predictions, and the Pearson residuals of the
    contingency table between the target column and the feature with the
    highest gain. The chi-squared p-value for that pair is printed and shown
    in the third panel's title.

    Args:
        new_data (pd.DataFrame): Categorical frame holding `col` and the feature columns.
        bst (Booster): The trained XGBoost model.
        x_test (pd.DataFrame): Test features (not used by this function).
        y_test (pd.Series): Test labels (category codes).
        preds (np.array): Predictions made by the model.
        categories (Index): Category names for the target variable.
        accuracy (float): The accuracy of the model on the test set.
        col (str): The target column name being analyzed and predicted.

    Returns:
        The `matplotlib.pyplot` module, with the figure as the current figure.
    """
    target_name = col.split(sep=".")[-1]
    fig, axs = plt.subplots(1, 3, figsize=(25, 5), dpi=300)
    fig.suptitle(f'{target_name} prediction', fontsize=35)

    plot_importance(bst, ax=axs[0], importance_type='gain', show_values=False)

    # Pin the matrix to ALL category codes: a test split may miss some
    # classes, and the tick labels below always list every category.
    class_codes = np.arange(len(categories))
    conf_matrix = confusion_matrix(
        np.asarray(y_test).astype(int),
        np.asarray(preds).astype(int),
        labels=class_codes,
    )
    sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', xticklabels=categories, yticklabels=categories, ax=axs[1])
    axs[1].set_title(f'Confusion Matrix\nAccuracy: {accuracy * 100:.2f}%')
    axs[1].set_yticklabels(axs[1].get_yticklabels(), rotation=30, ha='right')

    # Feature with the highest gain (first one wins on ties)
    gain_scores = bst.get_score(importance_type="gain")
    most_important_feature = max(gain_scores, key=gain_scores.get)

    # Contingency table between the target and its most important feature
    contingency_table = pd.crosstab(new_data[col], new_data[most_important_feature])

    # Pearson residuals of the contingency table
    table = stats.Table(contingency_table).resid_pearson
    # Perform the chi-squared test
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi-squared test for {col} and {most_important_feature}: p-value = {p}")

    sns.heatmap(table, annot=True, cmap='Greens', ax=axs[2])
    axs[2].set_yticklabels(axs[2].get_yticklabels(), rotation=30, ha='right')
    axs[2].set_title(f'Contingency Table between {target_name} and {most_important_feature.split(sep=".")[-1]}\np-value = {p}')

    plt.tight_layout()
    return plt
750
-
751
def cramers_v(confusion_matrix):
    """
    Calculate the bias-corrected Cramer's V statistic for a contingency table.

    Args:
        confusion_matrix: Two-dimensional table of counts exposing `.shape` and
            `.sum().sum()` (for example a `pd.crosstab` result).

    Returns:
        The square root of the corrected phi-squared divided by the smaller of
        the corrected (rows - 1) and (columns - 1).
    """
    chi2_stat = chi2_contingency(confusion_matrix)[0]
    total = confusion_matrix.sum().sum()
    phi2 = chi2_stat / total
    n_rows, n_cols = confusion_matrix.shape

    # Bias correction: shrink phi^2 and the table dimensions by terms in 1/(n-1).
    bias = ((n_cols-1)*(n_rows-1))/(total-1)
    phi2_corrected = max(0, phi2 - bias)
    rows_corrected = n_rows - ((n_rows-1)**2)/(total-1)
    cols_corrected = n_cols - ((n_cols-1)**2)/(total-1)

    denominator = min((cols_corrected-1), (rows_corrected-1))
    return np.sqrt(phi2_corrected / denominator)
761
-
762
def plot_cramers_v_heatmap(data, significance_level=0.05):
    """
    Plot a lower-triangle heatmap of Cramer's V for every pair of columns.

    Every off-diagonal pair is filled in; `significance_level` is only used in
    the figure title and does not filter any pair.

    Args:
        data (pd.DataFrame): Frame whose columns are treated as categorical.
        significance_level (float): Value shown in the plot title.

    Returns:
        The `matplotlib.pyplot` module, with the heatmap as current figure.
    """
    columns = data.columns
    cramers_v_df = pd.DataFrame(index=columns, columns=columns, data=np.nan)

    for row_name in columns:
        for col_name in columns:
            if row_name == col_name:
                # The diagonal stays NaN: a column is not compared with itself.
                continue
            confusion_matrix = pd.crosstab(data[row_name], data[col_name])
            # The chi-squared result is computed but not used to filter pairs.
            _chi2, _p, _dof, _expected = chi2_contingency(confusion_matrix)
            cramers_v_df.at[row_name, col_name] = cramers_v(confusion_matrix)

    plt.figure(figsize=(12, 10), dpi=200)
    # Hide the upper triangle (and diagonal); the matrix is filled on both sides.
    upper_triangle = np.triu(np.ones_like(cramers_v_df, dtype=bool))
    sns.heatmap(
        cramers_v_df,
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        cbar=True,
        mask=upper_triangle,
        square=True,
    )
    plt.title(f"Heatmap of Cramér's V (p < {significance_level})")
    return plt
786
-
787
-
788
class UAPVisualizer:
    """
    Modelling and plotting helpers built on XGBoost, seaborn and matplotlib.

    The class keeps no state; every method works on the arguments it is given.
    """

    def __init__(self, data=None):
        # `data` is accepted for API compatibility but nothing is stored.
        pass

    def analyze_and_predict(self, data, analyzers, col_names):
        """
        Predict each analyzer's cluster terms from the other analyzers' terms.

        One categorical column named `Analyzer_<column>` is built per analyzer
        from its `cluster_terms` attribute. Each column in turn is then used as
        the target of an XGBoost classifier trained on the remaining columns,
        and the result is plotted. A failure on one column is printed and the
        loop moves on to the next column.

        Args:
            data: Not used by this method.
            analyzers: Objects whose instance dict holds `cluster_terms`.
            col_names: Names paired one-to-one with `analyzers`.
        """
        new_data = pd.DataFrame()
        for column, analyzer in zip(col_names, analyzers):
            new_data[f'Analyzer_{column}'] = analyzer.__dict__['cluster_terms']
            print(f"Cluster terms extracted for {column}")

        new_data = new_data.fillna('null').astype('category')
        # Integer category codes are what the classifier is trained on.
        data_nums = new_data.apply(lambda series: series.cat.codes)

        for col in data_nums.columns:
            try:
                categories = new_data[col].cat.categories
                features = data_nums.drop(columns=[col])
                target = data_nums[col]
                x_train, x_test, y_train, y_test = train_test_split(
                    features, target, test_size=0.2, random_state=42
                )
                bst, accuracy, preds = self.train_xgboost(
                    x_train, y_train, x_test, y_test, len(categories)
                )
                self.plot_results(new_data, bst, x_test, y_test, preds, categories, accuracy, col)
            except Exception as e:
                print(f"Error processing {col}: {e}")

    def train_xgboost(self, x_train, y_train, x_test, y_test, num_classes):
        """
        Train a multi-class XGBoost model and score it on the test split.

        Args:
            x_train, y_train: Training features and labels.
            x_test, y_test: Test features and labels.
            num_classes (int): Number of target classes.

        Returns:
            tuple: (trained booster, accuracy on the test split, predictions).
        """
        train_matrix = xgb.DMatrix(x_train, label=y_train, enable_categorical=True)
        test_matrix = xgb.DMatrix(x_test, label=y_test)

        params = {
            'objective': 'multi:softmax',
            'num_class': num_classes,
            'max_depth': 6,
            'eta': 0.3,
        }
        bst = xgb.train(dtrain=train_matrix, params=params, num_boost_round=100)
        preds = bst.predict(test_matrix)
        accuracy = accuracy_score(y_test, preds)

        print(f"XGBoost trained with accuracy: {accuracy:.2f}")
        return bst, accuracy, preds

    def plot_results(self, new_data, bst, x_test, y_test, preds, categories, accuracy, col):
        """
        Plot feature importance, confusion matrix and a contingency table.

        The figure is saved as `<col>_<accuracy>_prediction_XGB.jpeg` in the
        current working directory and then shown.

        Args:
            new_data (pd.DataFrame): Data holding `col` and the feature columns.
            bst (Booster): The trained XGBoost model.
            x_test: Test features (not used by the plots themselves).
            y_test: Test labels.
            preds: Predictions made by the model.
            categories: Category names for the target variable.
            accuracy (float): Accuracy of the model on the test set.
            col (str): The target column name.
        """
        target_label = col.split(sep=".")[-1]
        fig, (ax_importance, ax_confusion, ax_contingency) = plt.subplots(1, 3, figsize=(25, 5))
        fig.suptitle(f'{target_label} prediction', fontsize=35)

        plot_importance(bst, ax=ax_importance, importance_type='gain', show_values=False)

        sns.heatmap(
            confusion_matrix(y_test, preds),
            annot=True,
            fmt='g',
            cmap='Blues',
            xticklabels=categories,
            yticklabels=categories,
            ax=ax_confusion,
        )
        ax_confusion.set_title(f'Confusion Matrix\nAccuracy: {accuracy * 100:.2f}%')

        # The feature with the highest gain drives the third panel.
        gain_by_feature = bst.get_score(importance_type="gain")
        ranked = sorted(gain_by_feature.items(), key=lambda item: item[1], reverse=True)
        most_important_feature = ranked[0][0]
        feature_label = most_important_feature.split(sep=".")[-1]

        contingency_table = pd.crosstab(new_data[col], new_data[most_important_feature])
        p = chi2_contingency(contingency_table)[1]
        print(f"Chi-squared test for {col} and {most_important_feature}: p-value = {p}")

        sns.heatmap(contingency_table, annot=True, cmap='Greens', ax=ax_contingency)
        ax_contingency.set_title(f'Contingency Table between {target_label} and {feature_label}\np-value = {p}')

        plt.tight_layout()
        plt.savefig(f"{col}_{accuracy:.2f}_prediction_XGB.jpeg", dpi=300)
        plt.show()

    @staticmethod
    def cramers_v(confusion_matrix):
        """
        Calculate the bias-corrected Cramer's V statistic for a contingency table.

        Args:
            confusion_matrix: Two-dimensional table of counts exposing `.shape`
                and `.sum().sum()`.

        Returns:
            The square root of the corrected phi-squared divided by the smaller
            of the corrected (rows - 1) and (columns - 1).
        """
        chi2_stat = chi2_contingency(confusion_matrix)[0]
        total = confusion_matrix.sum().sum()
        phi2 = chi2_stat / total
        n_rows, n_cols = confusion_matrix.shape

        bias = ((n_cols-1)*(n_rows-1))/(total-1)
        phi2_corrected = max(0, phi2 - bias)
        rows_corrected = n_rows - ((n_rows-1)**2)/(total-1)
        cols_corrected = n_cols - ((n_cols-1)**2)/(total-1)
        return np.sqrt(phi2_corrected / min((cols_corrected-1), (rows_corrected-1)))

    def plot_cramers_v_heatmap(self, data, significance_level=0.05):
        """
        Show a lower-triangle heatmap of Cramer's V for significant column pairs.

        A cell is filled only when the chi-squared p-value of the pair is below
        `significance_level`; all other cells stay NaN.

        Args:
            data (pd.DataFrame): Frame whose columns are treated as categorical.
            significance_level (float): p-value cutoff, also shown in the title.
        """
        columns = data.columns
        cramers_v_df = pd.DataFrame(index=columns, columns=columns, data=np.nan)

        for row_name in columns:
            for col_name in columns:
                if row_name == col_name:
                    continue
                confusion_matrix = pd.crosstab(data[row_name], data[col_name])
                p = chi2_contingency(confusion_matrix)[1]
                if p < significance_level:
                    cramers_v_df.at[row_name, col_name] = UAPVisualizer.cramers_v(confusion_matrix)

        plt.figure(figsize=(10, 8))
        upper_triangle = np.triu(np.ones_like(cramers_v_df, dtype=bool))
        sns.heatmap(
            cramers_v_df,
            annot=True,
            fmt=".2f",
            cmap='coolwarm',
            cbar=True,
            mask=upper_triangle,
            square=True,
        )
        plt.title(f"Heatmap of Cramér's V (p < {significance_level})")
        plt.show()

    def plot_treemap(self, df, column, top_n=32):
        """
        Show a treemap of the most frequent values of `column`.

        Values outside the `top_n` most frequent are grouped as 'Other'. The
        grouped values are written back to `df` as a `<column>_revised` column.

        Args:
            df (pd.DataFrame): Source data; gains the `<column>_revised` column.
            column (str): Column whose value frequencies are plotted.
            top_n (int): Number of most frequent values kept as their own tile.
        """
        top_labels = df[column].value_counts().iloc[:top_n].index

        revised_column = f'{column}_revised'
        df[revised_column] = np.where(df[column].isin(top_labels), df[column], 'Other')

        # Tile sizes and names, including the 'Other' bucket.
        revised_counts = df[revised_column].value_counts()
        sizes = revised_counts.values
        shares = sizes / sizes.sum()
        labels = [
            f'{label}\n {percent:.1%}'
            for label, percent in zip(revised_counts.index, shares)
        ]

        colors = list(mcolors.TABLEAU_COLORS.values())
        squarify.plot(
            sizes=sizes,
            label=labels,
            alpha=0.7,
            pad=True,
            color=colors,
            text_kwargs={'fontsize': 10},
        )

        ax = plt.gca()
        font_scale = 0.8
        for text, rect in zip(ax.texts, ax.patches):
            # White text on dark tiles, black text on light ones.
            r, g, b, _ = mcolors.to_rgba(rect.get_facecolor())
            brightness = np.average([r, g, b])
            if brightness < 0.5:
                text.set_color('white')
            else:
                text.set_color('black')

            # Font size follows the tile area; long labels wrap at 20 characters.
            text.set_fontsize(np.sqrt(rect.get_width() * rect.get_height()) * font_scale)
            text.set_text(textwrap.fill(text.get_text(), width=20))

        plt.axis('off')
        plt.gca().invert_yaxis()
        plt.gcf().set_size_inches(20, 12)
        plt.show()
925
-
926
-
927
-
928
-
929
class UAPParser:
    """
    Parse free-text reports into JSON structures with the OpenAI chat API.

    Attributes:
        client: The OpenAI client used for chat completions.
        model (str): Name of the chat model.
        responses (dict): Raw response text, keyed by report description.
        col: Optional column name supplied at construction.
        format_long: Optional JSON template supplied at construction.
    """

    def __init__(self, api_key, model="gpt-3.5-turbo-0125", col=None, format_long=None):
        """
        Create the OpenAI client and remember the parsing configuration.

        Args:
            api_key (str): OpenAI API key; exported as `OPENAI_API_KEY`.
            model (str): Chat model name.
            col: Optional column name, stored on the instance.
            format_long: Optional JSON template, stored on the instance.
        """
        os.environ['OPENAI_API_KEY'] = api_key
        self.client = OpenAI()
        self.model = model
        self.responses = {}
        # Previously both arguments were silently discarded.
        self.col = col
        self.format_long = format_long

    @staticmethod
    def _is_rate_limited(exc):
        """Return True when `exc` looks like an HTTP 429 / rate-limit error."""
        # OpenAI status errors expose `status_code`; requests' HTTPError exposes
        # it on `.response`. The text check keeps the original heuristic.
        status = getattr(exc, "status_code", None)
        if status is None:
            status = getattr(getattr(exc, "response", None), "status_code", None)
        return status == 429 or 'TooManyRequests' in str(exc)

    def fetch_response(self, description, format_long):
        """
        Ask the model to parse one report, retrying on rate limits.

        Rate-limit errors are retried with exponential backoff (5 s doubling up
        to 600 s, at most 10 attempts). Any other `HTTPError` is re-raised; any
        other exception is printed and ends the attempts.

        Args:
            description (str): The report text.
            format_long: JSON structure the model is asked to follow.

        Returns:
            The chat completion response, or None if no attempt succeeded.
        """
        INITIAL_WAIT_TIME = 5
        MAX_WAIT_TIME = 600
        MAX_RETRIES = 10

        wait_time = INITIAL_WAIT_TIME
        for _ in range(MAX_RETRIES):
            try:
                return self.client.chat.completions.create(
                    model=self.model,
                    response_format={"type": "json_object"},
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant which is tasked to help parse data."},
                        {"role": "user", "content": f'Input report: {description}\n\n Parse data following this json structure; leave missing data empty: {format_long} Output:'}
                    ]
                )
            except Exception as e:
                if self._is_rate_limited(e):
                    time.sleep(wait_time)
                    wait_time = min(wait_time * 2, MAX_WAIT_TIME)  # Exponential backoff
                    continue
                if isinstance(e, HTTPError):
                    raise
                print(f"Unexpected error: {e}")
                break

        return None  # Return None if all retries fail

    def process_descriptions(self, descriptions, format_long, max_workers=32):
        """
        Fetch responses for many descriptions concurrently.

        Non-empty response texts are stored in `self.responses` under their
        description. Errors for individual descriptions are printed.

        Args:
            descriptions: Iterable of report texts with a length.
            format_long: JSON structure the model is asked to follow.
            max_workers (int): Size of the thread pool.
        """
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_desc = {
                executor.submit(self.fetch_response, desc, format_long): desc
                for desc in descriptions
            }

            for future in stqdm(concurrent.futures.as_completed(future_to_desc), total=len(descriptions)):
                desc = future_to_desc[future]
                try:
                    response = future.result()
                    response_text = response.choices[0].message.content if response else None
                    if response_text:
                        self.responses[desc] = response_text
                except Exception as exc:
                    print(f'Error occurred for description {desc}: {exc}')

    def parse_responses(self):
        """
        Decode the stored response texts as JSON.

        Each text is decoded as is; if that fails it is retried with single
        quotes replaced by double quotes. Texts that still fail are counted.

        Returns:
            dict: Decoded objects keyed by description (failures are omitted).
        """
        parsed_responses = {}
        not_parsed = 0
        for key, raw in self.responses.items():
            try:
                parsed_responses[key] = json.loads(raw)
                continue
            except (ValueError, TypeError):
                # json.JSONDecodeError is a ValueError; non-text input raises
                # TypeError. Fall through to the quote-repair attempt.
                pass
            try:
                parsed_responses[key] = json.loads(raw.replace("'", '"'))
            except (ValueError, TypeError, AttributeError):
                not_parsed += 1

        print(f"Number of unparsed responses: {not_parsed}")
        print(f"Number of parsed responses: {len(parsed_responses)}")
        return parsed_responses

    def responses_to_df(self, col, parsed_responses):
        """
        Flatten parsed responses into a DataFrame indexed by description.

        Args:
            col: Top-level key whose nested object is flattened, or None to
                flatten each whole response.
            parsed_responses (dict): Output of `parse_responses`.

        Returns:
            pd.DataFrame: One row per response, nested keys joined with '.'.
        """
        parsed_df = pd.DataFrame(parsed_responses).T
        if col is not None:
            flattened = pd.json_normalize(parsed_df[col])
        else:
            # json_normalize needs a list of records; handing it the DataFrame
            # makes it iterate column names instead of rows.
            records = [row.dropna().to_dict() for _, row in parsed_df.iterrows()]
            flattened = pd.json_normalize(records)
        flattened.index = parsed_df.index
        return flattened
1008
-
1009
-
1010
-
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.decomposition import PCA
4
+ from sklearn.cluster import KMeans
5
+ # from cuml.manifold import umap
6
+ # from cuml.cluster import hdbscan
7
+ import umap
8
+ import fast_hdbscan as hdbscan
9
+ import plotly.graph_objects as go
10
+ from sentence_transformers import SentenceTransformer
11
+ import torch
12
# Load the sentence-embedding model once, at import time, and move it to the GPU.
# NOTE(review): `.to('cuda')` is unconditional, so importing this module
# presumably requires a CUDA device — confirm before running on CPU-only hosts.
with torch.no_grad():
    embed_model = SentenceTransformer('embaas/sentence-transformers-e5-large-v2')
    embed_model.to('cuda')
15
+ from sentence_transformers.util import pytorch_cos_sim, pairwise_cos_sim
16
+ #from stqdm.notebook import stqdm
17
+ #stqdm.pandas()
18
+ import logging
19
+ import pandas as pd
20
+ import numpy as np
21
+ from sklearn.decomposition import PCA
22
+ from sklearn.cluster import KMeans
23
+ import plotly.graph_objects as go
24
+ import plotly.express as px
25
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
26
+ import numpy as np
27
+ from Levenshtein import distance
28
+ import logging
29
+ from sklearn.metrics import confusion_matrix
30
+ import seaborn as sns
31
+ import matplotlib.pyplot as plt
32
+ import xgboost as xgb
33
+ from xgboost import plot_importance
34
+ import matplotlib.pyplot as plt
35
+ from sklearn.metrics import accuracy_score, confusion_matrix
36
+ from scipy.stats import chi2_contingency
37
+ import matplotlib.pyplot as plt
38
+ import seaborn as sns
39
+ from statsmodels.graphics.mosaicplot import mosaic
40
+ import pickle
41
+ import pandas as pd
42
+ from sklearn.model_selection import train_test_split
43
+ from sklearn.metrics import confusion_matrix
44
+ import seaborn as sns
45
+ import matplotlib.pyplot as plt
46
+ import xgboost as xgb
47
+ from xgboost import plot_importance
48
+ import matplotlib.pyplot as plt
49
+ from sklearn.metrics import accuracy_score, confusion_matrix
50
+ from scipy.stats import chi2_contingency
51
+ import matplotlib.pyplot as plt
52
+ import seaborn as sns
53
+ from statsmodels.graphics.mosaicplot import mosaic
54
+ from statsmodels.api import stats
55
+ import os
56
+ import time
57
+ import concurrent.futures
58
+ from requests.exceptions import HTTPError
59
+ from stqdm import stqdm
60
+ stqdm.pandas()
61
+ import json
62
+ import pandas as pd
63
+ from openai import OpenAI
64
+ import numpy as np
65
+ import matplotlib.pyplot as plt
66
+ import squarify
67
+ import matplotlib.colors as mcolors
68
+ import textwrap
69
+ import pandas as pd
70
+ import streamlit as st
71
+ st.set_option('deprecation.showPyplotGlobalUse', False)
72
+
73
+
74
+ # Configure logging
75
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
76
+
77
+ class UAPAnalyzer:
78
+ """
79
+ A class for analyzing and clustering textual data within a pandas DataFrame using
80
+ Natural Language Processing (NLP) techniques and machine learning models.
81
+
82
+ Attributes:
83
+ data (pd.DataFrame): The dataset containing textual data for analysis.
84
+ column (str): The name of the column in the DataFrame to be analyzed.
85
+ embeddings (np.ndarray): The vector representations of textual data.
86
+ reduced_embeddings (np.ndarray): The dimensionality-reduced embeddings.
87
+ cluster_labels (np.ndarray): The labels assigned to each data point after clustering.
88
+ cluster_terms (list): The list of terms associated with each cluster.
89
+ tfidf_matrix (sparse matrix): The Term Frequency-Inverse Document Frequency (TF-IDF) matrix.
90
+ models (dict): A dictionary to store trained machine learning models.
91
+ evaluations (dict): A dictionary to store evaluation results of models.
92
+ data_nums (pd.DataFrame): The DataFrame with numerical encoding of categorical data.
93
+ """
94
+
95
def __init__(self, data, column, has_embeddings=False):
    """
    Set up the analyzer for one text column of a DataFrame.

    Args:
        data (pd.DataFrame): The dataset for analysis.
        column (str): The column within the dataset to analyze.
        has_embeddings (bool): Stored on the instance as `has_embeddings`.
    """
    assert isinstance(data, pd.DataFrame), "Data must be a pandas DataFrame"
    assert column in data.columns, f"Column '{column}' not found in DataFrame"

    self.has_embeddings = has_embeddings
    self.data = data
    self.column = column

    # Results of the individual pipeline steps; filled in by later calls.
    for pending in (
        'embeddings',
        'reduced_embeddings',
        'cluster_labels',
        'cluster_names',
        'cluster_terms',
        'cluster_terms_embeddings',
        'tfidf_matrix',
    ):
        setattr(self, pending, None)

    self.models = {}  # To store trained models
    self.evaluations = {}  # To store evaluation results

    # Encoded data, train/test splits and predictions; also filled in later.
    for pending in ('data_nums', 'x_train', 'y_train', 'x_test', 'y_test', 'preds', 'new_dataset'):
        setattr(self, pending, None)

    # Use the module-level embedding model, placed on the GPU.
    self.model = embed_model.to('cuda')

    logging.info("UAPAnalyzer initialized")
129
+
130
def preprocess_data(self, trim=False, has_embeddings=False, top_n=32,):
    """
    Optionally add a trimmed label column, then load or compute embeddings.

    Args:
        trim (bool): When True, add a `<column>_revised` column in which every
            value outside the `top_n` most frequent ones is replaced by 'Other'.
        has_embeddings (bool): When True, take the embeddings from the
            'embeddings' column of the data instead of computing them.
        top_n (int): The number of top labels to retain if trimming is enabled.
    """
    logging.info("Preprocessing data")

    source = self.data[self.column]
    if trim:
        # Keep the most frequent labels and bucket the rest as 'Other'.
        top_labels = source.value_counts().nlargest(top_n).index.tolist()
        keep = self.data[self.column].isin(top_labels)
        self.data[f'{self.column}_revised'] = np.where(keep, self.data[self.column], 'Other')

    # Embeddings are computed from the original column, cast to strings.
    string_data = self.data[f'{self.column}'].astype(str)

    if not has_embeddings:
        self.embeddings = self._extract_embeddings(string_data)
    else:
        self.embeddings = self.data['embeddings'].to_list()

    logging.info("Data preprocessing complete")
155
+
156
+
157
def _extract_embeddings(self, data_column):
    """
    Encode a column of texts with the module-level embedding model.

    Args:
        data_column (pd.Series): The column from which to extract embeddings.

    Returns:
        Whatever `embed_model.encode` returns for the list of texts.
    """
    logging.info("Extracting embeddings")
    texts = data_column.tolist()
    return embed_model.encode(texts, show_progress_bar=True)
170
+
171
def reduce_dimensionality(self, method='UMAP', n_components=2, **kwargs):
    """
    Project `self.embeddings` to fewer dimensions and store the result.

    Args:
        method (str): The dimensionality reduction method ('UMAP' or 'PCA').
        n_components (int): The number of dimensions to reduce to.
        **kwargs: Extra keyword arguments, forwarded to UMAP only.

    Raises:
        ValueError: If `method` is neither 'UMAP' nor 'PCA'.
    """
    logging.info(f"Reducing dimensionality using {method}")

    if method not in ('UMAP', 'PCA'):
        raise ValueError("Unsupported dimensionality reduction method")

    if method == 'PCA':
        reducer = PCA(n_components=n_components)
    else:
        reducer = umap.UMAP(n_components=n_components, **kwargs)

    self.reduced_embeddings = reducer.fit_transform(self.embeddings)
    logging.info(f"Dimensionality reduced using {method}")
190
+
191
def cluster_data(self, method='HDBSCAN', **kwargs):
    """
    Cluster `self.reduced_embeddings` and store the labels.

    Args:
        method (str): The clustering method to use ('HDBSCAN' or 'KMeans').
        **kwargs: Keyword arguments forwarded to the chosen clusterer.

    Raises:
        ValueError: If `method` is neither 'HDBSCAN' nor 'KMeans'.
    """
    logging.info(f"Clustering data using {method}")

    if method not in ('HDBSCAN', 'KMeans'):
        raise ValueError("Unsupported clustering method")

    if method == 'KMeans':
        clusterer = KMeans(**kwargs)
    else:
        clusterer = hdbscan.HDBSCAN(**kwargs)

    clusterer.fit(self.reduced_embeddings)
    self.cluster_labels = clusterer.labels_
    logging.info(f"Data clustering complete using {method}")
210
+
211
+
212
def get_tf_idf_clusters(self, top_n=2):
    """
    Name each cluster after its highest-scoring TF-IDF terms.

    A TF-IDF matrix is fitted on the analyzed column (at most 1000 features,
    English stop words removed). For every cluster label except -1, the mean
    TF-IDF score per term is computed and the `top_n` best terms are joined
    with hyphens, in ascending score order. The names, one per cluster, are
    stored in `self.cluster_terms` as a `pd.Categorical`.

    Args:
        top_n (int): The number of top terms used for each cluster name.
    """
    logging.info("Naming clusters based on top TF-IDF terms.")

    # Clustering must have been run first.
    assert self.cluster_labels is not None, "Data has not been clustered yet."

    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(self.data[f'{self.column}'].astype(str))
    vocabulary = vectorizer.get_feature_names_out()

    self.cluster_terms = []
    # -1 is the noise label (HDBSCAN) and gets no name.
    cluster_ids = [label for label in np.unique(self.cluster_labels) if label != -1]

    for cluster_id in cluster_ids:
        members = np.where(self.cluster_labels == cluster_id)[0]
        mean_scores = np.mean(tfidf_matrix[members], axis=0)

        # The mean may or may not expose .toarray(); flatten either way.
        if hasattr(mean_scores, "toarray"):
            flat_scores = mean_scores.toarray().flatten()
        else:
            flat_scores = np.asarray(mean_scores).flatten()

        best = np.argsort(flat_scores)[-top_n:]
        self.cluster_terms.append('-'.join(vocabulary[position] for position in best))

    self.cluster_terms = pd.Categorical(self.cluster_terms)
    logging.info("Cluster naming completed.")
265
+
266
def merge_similar_clusters(self, distance='cosine', char_diff_threshold = 3, similarity_threshold = 0.92, embeddings = 'SBERT'):
    """
    Merges similar clusters based on the similarity of their associated terms.

    Pairs of cluster names are compared either by Levenshtein distance or by
    cosine similarity of their embeddings; matching names are merged into the
    earlier name and `self.cluster_terms` / `self.cluster_labels` are rewritten.

    NOTE(review): a second method with the same name is defined later in this
    file; if both live in the same class, the later one replaces this one —
    confirm which definition is meant to be used.

    Args:
        distance (str): 'levenshtein' or 'cosine'; any other value merges nothing.
        char_diff_threshold (int): Maximum edit distance for a Levenshtein merge.
        similarity_threshold (float): The similarity threshold above which clusters are considered similar enough to merge.
        embeddings (str): Not used in the body.
    """
    from collections import defaultdict
    logging.info("Merging similar clusters")

    # A mapping from cluster names to a set of cluster names to be merged
    merge_mapping = defaultdict(set)
    merge_labels = defaultdict(set)  # not used below

    if distance == 'levenshtein':
        distances = {}  # not used below
        for i, name1 in enumerate(self.cluster_terms):
            for j, name2 in enumerate(self.cluster_terms[i + 1:], start=i + 1):
                # NOTE(review): inside this method `distance` is the string
                # parameter, not the imported Levenshtein function, so this
                # call looks like it would fail — confirm.
                dist = distance(name1, name2)
                if dist <= char_diff_threshold:
                    logging.info(f"Merging '{name2}' into '{name1}'")
                    merge_mapping[name1].add(name2)

    elif distance == 'cosine':
        # Embed every cluster name and compare all pairs (upper triangle only).
        self.cluster_terms_embeddings = embed_model.encode(self.cluster_terms)
        cos_sim_matrix = pytorch_cos_sim(self.cluster_terms_embeddings, self.cluster_terms_embeddings)
        for i, name1 in enumerate(self.cluster_terms):
            for j, name2 in enumerate(self.cluster_terms[i + 1:], start=i + 1):
                if cos_sim_matrix[i][j] > similarity_threshold:
                    #st.write(f"Merging cluster '{name2}' into cluster '{name1}' based on cosine similarity")
                    logging.info(f"Merging cluster '{name2}' into cluster '{name1}' based on cosine similarity")
                    merge_mapping[name1].add(name2)


    # Flatten the merge mapping to a simple name change mapping
    name_change_mapping = {}
    for cluster_name, merges in merge_mapping.items():
        for merge_name in merges:
            name_change_mapping[merge_name] = cluster_name

    # Update cluster labels based on name changes:
    # build the de-duplicated list of merged names and remember, for each
    # original position, the position of its merged name in that list.
    updated_cluster_terms = []
    original_to_updated_index = {}
    for i, name in enumerate(self.cluster_terms):
        updated_name = name_change_mapping.get(name, name)
        if updated_name not in updated_cluster_terms:
            updated_cluster_terms.append(updated_name)
            original_to_updated_index[i] = len(updated_cluster_terms) - 1
        else:
            updated_index = updated_cluster_terms.index(updated_name)
            original_to_updated_index[i] = updated_index

    self.cluster_terms = updated_cluster_terms # Update cluster terms with merged names
    # NOTE(review): this writes `clusters_labels` (with an extra "s"), which is
    # a different attribute from `cluster_labels` used everywhere else — confirm.
    self.clusters_labels = np.array([original_to_updated_index[label] for label in self.cluster_labels])


    # Update cluster labels according to the new index mapping
    # self.cluster_labels = np.array([original_to_updated_index[label] if label in original_to_updated_index else -1 for label in self.cluster_labels])
    # self.cluster_terms = [self.cluster_terms[original_to_updated_index[label]] if label != -1 else 'Noise' for label in self.cluster_labels]

    # Log the total number of merges
    total_merges = sum(len(merges) for merges in merge_mapping.values())
    logging.info(f"Total clusters merged: {total_merges}")

    # Renumber the labels to 0..k-1 in sorted order of the existing labels.
    unique_labels = np.unique(self.cluster_labels)
    label_to_index = {label: index for index, label in enumerate(unique_labels)}
    self.cluster_labels = np.array([label_to_index[label] for label in self.cluster_labels])
    # NOTE(review): `label_to_index` is keyed by the labels from before the
    # renumbering but is indexed here with the renumbered ones — confirm this
    # is intended when the original labels are not already 0..k-1.
    self.cluster_terms = [self.cluster_terms[label_to_index[label]] for label in self.cluster_labels]
334
+
335
def merge_similar_clusters2(self, distance='cosine', char_diff_threshold=3, similarity_threshold=0.92):
    """
    Find similar cluster names and hand the merges to the label updater.

    Every pair of names in `self.cluster_terms` is compared; when a pair is
    similar enough, the later name is recorded as merged into the earlier
    one. The resulting mapping is passed to
    `self._update_cluster_terms_and_labels`.

    Args:
        distance (str): 'levenshtein' (edit distance on the names) or
            'cosine' (cosine similarity of the name embeddings). Any other
            value records no merges.
        char_diff_threshold (int): Maximum edit distance for a merge.
        similarity_threshold (float): Cosine similarity above which two
            names are merged.
    """
    logging.info("Merging similar clusters based on distance: {}".format(distance))
    from collections import defaultdict
    merge_mapping = defaultdict(set)

    if distance == 'levenshtein':
        # The `distance` parameter hides the module-level Levenshtein
        # function of the same name, so import it under another name here.
        from Levenshtein import distance as levenshtein_distance
        for i, name1 in enumerate(self.cluster_terms):
            for name2 in self.cluster_terms[i + 1:]:
                if levenshtein_distance(name1, name2) <= char_diff_threshold:
                    merge_mapping[name1].add(name2)
                    logging.info(f"Merging '{name2}' into '{name1}' based on Levenshtein distance")

    elif distance == 'cosine':
        # Embeddings of the names are computed once and then reused.
        if self.cluster_terms_embeddings is None:
            self.cluster_terms_embeddings = embed_model.encode(self.cluster_terms)
        cos_sim_matrix = pytorch_cos_sim(self.cluster_terms_embeddings, self.cluster_terms_embeddings)
        n_terms = len(self.cluster_terms)
        for i in range(n_terms):
            for j in range(i + 1, n_terms):
                if cos_sim_matrix[i][j] > similarity_threshold:
                    merge_mapping[self.cluster_terms[i]].add(self.cluster_terms[j])
                    #st.write(f"Merging cluster '{self.cluster_terms[j]}' into cluster '{self.cluster_terms[i]}'")
                    logging.info(f"Merging cluster '{self.cluster_terms[j]}' into cluster '{self.cluster_terms[i]}'")

    self._update_cluster_terms_and_labels(merge_mapping)
360
+
361
def _update_cluster_terms_and_labels(self, merge_mapping):
    """
    Rename merged cluster terms and rebuild the labels from the new names.

    Every term listed as merged is replaced by the term it was merged into.
    `self.cluster_labels` becomes a list aligned with `self.cluster_terms`:
    terms that are a merge target get an integer id, all other terms get -1.
    Ids follow the key order of `merge_mapping`, so they are the same on
    every run (they used to depend on set iteration order).

    NOTE(review): terms that took part in no merge end up with label -1 —
    confirm that this is intended.

    Args:
        merge_mapping (dict): Maps a kept term to the set of terms merged
            into it.
    """
    # Flatten the merge mapping to a simple old-name -> new-name mapping.
    name_change_mapping = {old: new for new, olds in merge_mapping.items() for old in olds}

    # Give each merge target a stable id, in merge_mapping key order.
    merge_targets = set(name_change_mapping.values())
    label_of_term = {}
    for term in merge_mapping:
        if term in merge_targets:
            label_of_term[term] = len(label_of_term)

    self.cluster_terms = [name_change_mapping.get(term, term) for term in self.cluster_terms]
    self.cluster_labels = [label_of_term.get(term, -1) for term in self.cluster_terms]

    logging.info(f"Total clusters merged: {len(merge_mapping)}")
373
+
374
+
375
def cluster_levenshtein(self, cluster_terms, cluster_labels, char_diff_threshold=3):
    """
    Merge clusters whose names are within a small edit distance.

    Each pair of names is compared; when the Levenshtein distance is at most
    `char_diff_threshold`, the later cluster is merged into the earlier one
    (or into whatever the earlier one was itself merged into). Every point
    then receives the name of its merged cluster.

    Args:
        cluster_terms: Cluster names, indexed by cluster label.
        cluster_labels: One cluster label per data point.
        char_diff_threshold (int): Maximum edit distance for a merge.

    Returns:
        list: One cluster name per entry of `cluster_labels`.
    """
    from Levenshtein import distance  # Make sure to import the correct distance function

    terms = list(cluster_terms)
    # Maps a cluster label to the label it is merged into. It is keyed by
    # label value because that is how it is looked up below; the previous
    # version keyed it by point position, so merges hit the wrong points.
    merge_map = {}
    for idx, term1 in enumerate(terms):
        for jdx in range(idx + 1, len(terms)):
            term2 = terms[jdx]
            if distance(term1, term2) <= char_diff_threshold:
                # Follow an earlier merge of idx so chains end at one name.
                merge_map[jdx] = merge_map.get(idx, idx)
                logging.info(f"Merging '{term2}' into '{term1}'")
                st.write(f"Merging '{term2}' into '{term1}'")

    # Update the cluster labels
    updated_cluster_labels = [merge_map.get(label, label) for label in cluster_labels]
    # Update string labels to reflect merged labels.
    # NOTE(review): a label of -1 indexes the last term here — confirm how
    # noise points should be named.
    updated_string_labels = [terms[label] for label in updated_cluster_labels]
    return updated_string_labels
393
+
394
def cluster_cosine(self, cluster_terms, cluster_labels, similarity_threshold):
    """
    Merge clusters whose names have similar embeddings.

    The names are embedded with the module-level model and compared pairwise
    by cosine similarity. When a pair reaches `similarity_threshold`, the
    later cluster is merged into the earlier one (or into whatever the
    earlier one was itself merged into). Every point then receives the name
    of its merged cluster.

    Args:
        cluster_terms: Cluster names, indexed by cluster label.
        cluster_labels: One cluster label per data point.
        similarity_threshold (float): Minimum cosine similarity for a merge.

    Returns:
        list: One cluster name per entry of `cluster_labels`.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    cluster_terms_embeddings = embed_model.encode(cluster_terms)
    # Compute cosine similarity matrix in a vectorized form
    cos_sim_matrix = cosine_similarity(cluster_terms_embeddings, cluster_terms_embeddings)

    # Maps a cluster label to the label it is merged into. It is keyed by
    # label value because that is how it is looked up below; the previous
    # version keyed it by point position, so merges hit the wrong points.
    merge_map = {}
    n_terms = len(cluster_terms)
    # Upper triangle only: skips self-comparison and duplicate pairs.
    for idx in range(n_terms):
        for jdx in range(idx + 1, n_terms):
            if cos_sim_matrix[idx, jdx] >= similarity_threshold:
                # Follow an earlier merge of idx so chains end at one name.
                merge_map[jdx] = merge_map.get(idx, idx)
                st.write(f"Merging '{cluster_terms[jdx]}' into '{cluster_terms[idx]}'")
                logging.info(f"Merging '{cluster_terms[jdx]}' into '{cluster_terms[idx]}'")

    # Update the cluster labels
    updated_cluster_labels = [merge_map.get(label, label) for label in cluster_labels]
    # Update string labels to reflect merged labels.
    # NOTE(review): a label of -1 indexes the last term here — confirm how
    # noise points should be named.
    updated_string_labels = [cluster_terms[label] for label in updated_cluster_labels]
    return updated_string_labels
417
+
418
def merge_similar_clusters(self, cluster_terms, cluster_labels, distance_type='cosine', char_diff_threshold=3, similarity_threshold=0.92):
    """
    Merge similar clusters with the chosen name-comparison method.

    Args:
        cluster_terms: Cluster names, indexed by cluster label.
        cluster_labels: One cluster label per data point.
        distance_type (str): 'cosine' or 'levenshtein'.
        char_diff_threshold (int): Passed to the Levenshtein variant.
        similarity_threshold (float): Passed to the cosine variant.

    Returns:
        The per-point cluster names from the chosen variant, or None when
        `distance_type` is neither of the two supported values.
    """
    if distance_type == 'cosine':
        return self.cluster_cosine(cluster_terms, cluster_labels, similarity_threshold)
    if distance_type == 'levenshtein':
        return self.cluster_levenshtein(cluster_terms, cluster_labels, char_diff_threshold)
    return None
423
+
424
def plot_embeddings2(self, title=None):
    """
    Scatter-plot the reduced embeddings, one trace per cluster term.

    Points are grouped by their entry in `self.cluster_terms`. The group
    named 'Noise' is drawn in grey with lower opacity; every other group uses
    the default colours. Hovering a point shows its text from the analyzed
    column. The figure is rendered with `st.plotly_chart`.

    Args:
        title (str): Optional figure title.
    """
    assert self.reduced_embeddings is not None, "Dimensionality reduction has not been performed yet."
    assert self.cluster_terms is not None, "Cluster TF-IDF analysis has not been performed yet."

    logging.info("Plotting embeddings with TF-IDF colors")

    fig = go.Figure()

    terms = np.array(self.cluster_terms)
    texts = self.data[f'{self.column}']

    for cluster_term in np.unique(self.cluster_terms):
        # Positions of the points in this group. This is computed for every
        # group, including 'Noise', which previously reused the indices of
        # the preceding group (or failed when 'Noise' came first).
        indices = np.where(terms == cluster_term)[0]

        if cluster_term != 'Noise':
            marker = dict(size=5, opacity=0.8)
            trace_name = cluster_term
        else:
            # Noise points are drawn greyed out.
            marker = dict(size=5, opacity=0.5, color='grey')
            trace_name = 'Noise'

        fig.add_trace(
            go.Scatter(
                x=self.reduced_embeddings[indices, 0],
                y=self.reduced_embeddings[indices, 1],
                mode='markers',
                marker=marker,
                name=trace_name,
                # Positional lookup, so it matches the embedding rows even
                # when the DataFrame index is not 0..n-1.
                text=texts.iloc[indices],
                hoverinfo='text',
            )
        )

    fig.update_layout(title=title, showlegend=True, legend_title_text='Top TF-IDF Terms')
    st.plotly_chart(fig, use_container_width=True)
494
+
495
def plot_embeddings3(self, title=None):
    """
    Plot the 2-D reduced embeddings with an explicit colour per cluster term.

    Non-noise terms cycle through Plotly Express's qualitative ``Plotly``
    palette; the 'Noise' term is drawn in grey at lower opacity. Traces are
    added in ``np.unique`` order (sorted), which is also the legend order.

    Args:
        title (str): The title of the plot; defaults to "Embeddings Visualized".

    Raises:
        AssertionError: If ``self.reduced_embeddings`` or ``self.cluster_terms`` is None.
    """
    assert self.reduced_embeddings is not None, "Dimensionality reduction has not been performed yet."
    assert self.cluster_terms is not None, "Cluster TF-IDF analysis has not been performed yet."

    logging.info("Plotting embeddings with TF-IDF colors")

    fig = go.Figure()

    # Hoisted: the array conversion does not change between iterations.
    point_terms = np.array(self.cluster_terms)
    unique_cluster_terms = np.unique(point_terms)

    color_map = px.colors.qualitative.Plotly  # Default color map from Plotly Express for consistency
    hover_source = self.data[f'{self.column}']

    color_idx = 0
    for cluster_term in unique_cluster_terms:
        indices = np.where(point_terms == cluster_term)[0]
        is_noise = cluster_term == 'Noise'
        if is_noise:
            marker_color = 'grey'
        else:
            # Only real clusters consume a palette slot, so noise never shifts the colours.
            marker_color = color_map[color_idx % len(color_map)]
            color_idx += 1

        fig.add_trace(
            go.Scatter(
                x=self.reduced_embeddings[indices, 0],
                y=self.reduced_embeddings[indices, 1],
                mode='markers',
                marker=dict(
                    size=5,
                    opacity=(0.5 if is_noise else 0.8),
                    color=marker_color
                ),
                name=cluster_term,
                text=hover_source.iloc[indices],
                hoverinfo='text'
            )
        )

    # No re-sort of fig.data is needed: traces were appended in the sorted
    # order produced by np.unique, which is the order the old sort restored.
    fig.update_layout(title=title if title else "Embeddings Visualized", showlegend=True, legend_title_text='Top TF-IDF Terms')
    st.plotly_chart(fig, use_container_width=True)
540
+
541
+
542
def plot_embeddings(self, title=None):
    """
    Plots the reduced dimensionality embeddings with clusters indicated.

    One trace is drawn per unique value of ``self.cluster_labels``. When
    ``self.cluster_terms`` holds one term per data point, each trace is named
    after the most frequent term among that cluster's points.

    Args:
        title (str): The title of the plot.

    Raises:
        AssertionError: If ``self.reduced_embeddings`` or ``self.cluster_terms`` is None.
    """
    # Ensure dimensionality reduction and TF-IDF based cluster naming have been performed
    assert self.reduced_embeddings is not None, "Dimensionality reduction has not been performed yet."
    assert self.cluster_terms is not None, "Cluster TF-IDF analysis has not been performed yet."

    logging.info("Plotting embeddings with TF-IDF colors")

    fig = go.Figure()

    point_labels = np.asarray(self.cluster_labels)
    terms = np.asarray(self.cluster_terms)
    unique_cluster_labels = np.unique(point_labels)

    per_point_terms = (
        terms.ndim == 1
        and point_labels.ndim == 1
        and terms.shape[0] == point_labels.shape[0]
    )
    if per_point_terms:
        # Zipping two independently sorted np.unique results pairs the i-th
        # smallest label with the i-th alphabetical term, which is unrelated
        # to actual membership. Derive each name from the cluster's own points.
        cluster_names = []
        for cluster_id in unique_cluster_labels:
            member_terms, counts = np.unique(terms[point_labels == cluster_id], return_counts=True)
            cluster_names.append(str(member_terms[np.argmax(counts)]))
        label_name_pairs = list(zip(unique_cluster_labels, cluster_names))
    else:
        # NOTE(review): terms are not per-point here, so the original pairing is
        # kept; it assumes sorted unique terms line up with sorted labels - confirm.
        label_name_pairs = list(zip(unique_cluster_labels, np.unique(terms)))

    hover_source = self.data[f'{self.column}']
    for cluster_id, cluster_name in label_name_pairs:
        indices = np.where(point_labels == cluster_id)[0]

        # Plot points in the current cluster
        fig.add_trace(
            go.Scatter(
                x=self.reduced_embeddings[indices, 0],
                y=self.reduced_embeddings[indices, 1],
                mode='markers',
                marker=dict(
                    size=5,
                    opacity=0.8,
                ),
                name=cluster_name,
                text=hover_source.iloc[indices],
                hoverinfo='text',
            )
        )

    fig.update_layout(title=title, showlegend=True, legend_title_text='Top TF-IDF Terms')
    st.plotly_chart(fig, use_container_width=True)
    logging.info("Embeddings plotted with TF-IDF colors")
589
+
590
def plot_embeddings4(self, title=None, cluster_terms=None, cluster_labels=None, reduced_embeddings=None, column=None, data=None):
    """
    Plot explicitly supplied 2-D embeddings, one coloured trace per cluster id.

    Unlike the sibling plotting methods, everything is taken from the
    arguments rather than from instance attributes.

    Args:
        title (str): The title of the plot; defaults to "Embeddings Visualized".
        cluster_terms: Sequence indexed by cluster id to obtain each trace name.
        cluster_labels: Per-point cluster ids, compared with ``==`` against each id.
        reduced_embeddings: Array whose columns 0 and 1 are the x/y coordinates.
        column: Column of ``data`` used as hover text.
        data: DataFrame providing the hover text (positional ``.iloc`` lookup).

    Raises:
        AssertionError: If ``reduced_embeddings`` or ``cluster_terms`` is None.
    """
    # Both inputs are mandatory despite their None defaults.
    assert reduced_embeddings is not None, "Dimensionality reduction has not been performed yet."
    assert cluster_terms is not None, "Cluster TF-IDF analysis has not been performed yet."

    logging.info("Plotting embeddings with TF-IDF colors")

    fig = go.Figure()

    # Unique ids (noise is NOT excluded) and the term looked up for each id.
    unique_cluster_ids = np.unique(cluster_labels)
    unique_cluster_terms = [cluster_terms[cid] for cid in unique_cluster_ids]

    # Cycle through the qualitative palette so every id gets a stable colour.
    palette = px.colors.qualitative.Plotly
    cluster_colors = {
        cid: palette[position % len(palette)]
        for position, cid in enumerate(unique_cluster_ids)
    }

    for cluster_id, cluster_term in zip(unique_cluster_ids, unique_cluster_terms):
        member_rows = np.where(cluster_labels == cluster_id)[0]
        marker_style = dict(
            color=cluster_colors[cluster_id],
            size=5,
            opacity=0.8,
        )
        trace = go.Scatter(
            x=reduced_embeddings[member_rows, 0],
            y=reduced_embeddings[member_rows, 1],
            mode='markers',
            marker=marker_style,
            name=cluster_term,
            text=data[column].iloc[member_rows],  # hover text from the original column
            hoverinfo='text',
        )
        fig.add_trace(trace)

    legend_options = dict(
        traceorder='normal',  # legend follows the order traces were added
        itemsizing='constant'
    )
    fig.update_layout(
        title=title if title else "Embeddings Visualized",
        showlegend=True,
        legend_title_text='Top TF-IDF Terms',
        legend=legend_options
    )
    st.plotly_chart(fig, use_container_width=True)
    logging.info("Embeddings plotted with TF-IDF colors")
650
+
651
+
652
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
653
+
654
def analyze_and_predict(data, analyzers, col_names):
    """
    Build a table of analyzer cluster terms and try to predict each column from the others.

    For every (column name, analyzer) pair the analyzer's ``cluster_terms`` become a
    categorical column named ``Analyzer_<column>``. Each column in turn is then used
    as the target of an XGBoost classifier trained on the remaining columns, and the
    outcome is plotted. A failure on one column is logged and the loop continues.

    Args:
        data (pd.DataFrame): The dataset for analysis (not read by this function).
        analyzers (list): A list of UAPAnalyzer instances.
        col_names (list): Column names to be analyzed and predicted.

    Returns:
        pd.DataFrame: The categorical cluster-term table.
    """
    new_data = pd.DataFrame()
    for column, analyzer in zip(col_names, analyzers):
        new_data[f'Analyzer_{column}'] = vars(analyzer)['cluster_terms']
        logging.info(f"Cluster terms extracted for {column}")

    # Missing terms become their own 'null' category before integer encoding.
    new_data = new_data.fillna('null').astype('category')
    data_nums = new_data.apply(lambda series: series.cat.codes)

    for col in data_nums.columns:
        try:
            categories = new_data[col].cat.categories
            features = data_nums.drop(columns=[col])
            target = data_nums[col]
            x_train, x_test, y_train, y_test = train_test_split(
                features, target, test_size=0.2, random_state=42
            )
            bst, accuracy, preds = train_xgboost(x_train, y_train, x_test, y_test, len(categories))
            plot_results(new_data, bst, x_test, y_test, preds, categories, accuracy, col)
        except Exception as e:
            logging.error(f"Error processing {col}: {e}")
    return new_data
680
+
681
def train_xgboost(x_train, y_train, x_test, y_test, num_classes, device='cuda', num_round=100):
    """
    Trains an XGBoost multi-class model and evaluates its performance.

    Args:
        x_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training labels.
        x_test (pd.DataFrame): Test features.
        y_test (pd.Series): Test labels.
        num_classes (int): The number of unique classes in the target variable.
        device (str): Value passed as XGBoost's ``device`` parameter. Defaults to
            'cuda' (the previous hard-coded value); pass 'cpu' on machines without a GPU.
        num_round (int): Number of boosting rounds. Defaults to 100.

    Returns:
        bst (Booster): The trained XGBoost model.
        accuracy (float): The accuracy of the model on the test set.
        preds (np.ndarray): Predictions for ``x_test`` as returned by ``bst.predict``.
    """
    dtrain = xgb.DMatrix(x_train, label=y_train, enable_categorical=True)
    dtest = xgb.DMatrix(x_test, label=y_test)

    params = {
        'device': device,
        'objective': 'multi:softmax',
        'num_class': num_classes,
        'max_depth': 6,
        'eta': 0.3,
    }
    bst = xgb.train(dtrain=dtrain, params=params, num_boost_round=num_round)
    preds = bst.predict(dtest)
    accuracy = accuracy_score(y_test, preds)

    logging.info(f"XGBoost trained with accuracy: {accuracy:.2f}")
    return bst, accuracy, preds
707
+
708
def plot_results(new_data, bst, x_test, y_test, preds, categories, accuracy, col):
    """
    Plots the feature importance, confusion matrix, and Pearson-residual contingency table.

    Args:
        new_data (pd.DataFrame): Categorical table used to cross-tabulate ``col``
            against the most important feature.
        bst (Booster): The trained XGBoost model.
        x_test (pd.DataFrame): Test features (not read by this function).
        y_test (pd.Series): Test labels.
        preds (np.array): Predictions made by the model.
        categories (Index): Category names for the target variable.
        accuracy (float): The accuracy of the model on the test set.
        col (str): The target column name being analyzed and predicted.

    Returns:
        The ``matplotlib.pyplot`` module, with the figure as the current figure.
    """
    fig, axs = plt.subplots(1, 3, figsize=(25, 5), dpi=300)
    fig.suptitle(f'{col.split(sep=".")[-1]} prediction', fontsize=35)

    plot_importance(bst, ax=axs[0], importance_type='gain', show_values=False)

    # Pin the label set to every category code. Without it the matrix only
    # covers classes present in y_test/preds, so a class missing from the test
    # split makes the matrix smaller than `categories` and the tick labels no
    # longer line up with the rows and columns.
    class_codes = np.arange(len(categories))
    # The caller's preds come from bst.predict; cast them to int so they match the integer codes.
    pred_codes = np.asarray(preds).astype(int)
    conf_matrix = confusion_matrix(y_test, pred_codes, labels=class_codes)
    sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', xticklabels=categories, yticklabels=categories, ax=axs[1])
    axs[1].set_title(f'Confusion Matrix\nAccuracy: {accuracy * 100:.2f}%')
    # Rotate the y tick labels so long category names stay readable.
    axs[1].set_yticklabels(axs[1].get_yticklabels(), rotation=30, ha='right')

    sorted_features = sorted(bst.get_score(importance_type="gain").items(), key=lambda x: x[1], reverse=True)
    # The most important feature is the first element in the sorted list
    most_important_feature = sorted_features[0][0]
    # Create a contingency table
    contingency_table = pd.crosstab(new_data[col], new_data[most_important_feature])

    # Pearson residuals of the contingency table are what the third panel shows.
    table = stats.Table(contingency_table).resid_pearson
    # Perform the chi-squared test
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    # Print the results
    print(f"Chi-squared test for {col} and {most_important_feature}: p-value = {p}")

    sns.heatmap(table, annot=True, cmap='Greens', ax=axs[2])
    axs[2].set_yticklabels(axs[2].get_yticklabels(), rotation=30, ha='right')
    axs[2].set_title(f'Contingency Table between {col.split(sep=".")[-1]} and {most_important_feature.split(sep=".")[-1]}\np-value = {p}')

    plt.tight_layout()
    return plt
752
+
753
def cramers_v(confusion_matrix):
    """
    Calculate the bias-corrected Cramer's V statistic for categorical-categorical association.

    Args:
        confusion_matrix: Two-dimensional contingency table of counts
            (e.g. the result of ``pd.crosstab``).

    Returns:
        float: Cramer's V, or NaN when the statistic is undefined (fewer than two
        rows or columns, at most one observation, or a corrected dimension that
        collapses to zero).
    """
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape

    # Degenerate tables previously reached a 0/0 division, producing NaN only
    # as a by-product of a RuntimeWarning. Return NaN explicitly instead.
    if n <= 1 or min(r, k) < 2:
        return np.nan

    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    r_corr = r - ((r-1)**2)/(n-1)
    k_corr = k - ((k-1)**2)/(n-1)
    denominator = min((k_corr-1), (r_corr-1))
    # The correction can shrink a dimension to 1 (e.g. as many rows as
    # observations), which again leaves nothing to divide by.
    if denominator <= 0:
        return np.nan
    return np.sqrt(phi2corr / denominator)
763
+
764
def plot_cramers_v_heatmap(data, significance_level=0.05):
    """
    Plot heatmap of Cramer's V statistic for each pair of categorical variables in a DataFrame.

    Args:
        data (pd.DataFrame): Table whose columns are the categorical variables.
        significance_level (float): Only interpolated into the plot title.
            NOTE(review): no p-value filtering is applied here, although the
            title reads as if it were - confirm which is intended.

    Returns:
        The ``matplotlib.pyplot`` module, with the heatmap as the current figure.
    """
    # Initialize a DataFrame to store Cramer's V values
    cramers_v_df = pd.DataFrame(index=data.columns, columns=data.columns, data=np.nan)

    # Only the strict lower triangle is displayed (the mask below hides the
    # diagonal and everything above it), so only those cells are computed.
    # cramers_v runs the chi-squared test itself, so no separate call is needed.
    columns = list(data.columns)
    for position, row_col in enumerate(columns):
        for other_col in columns[:position]:
            contingency = pd.crosstab(data[row_col], data[other_col])
            cramers_v_df.at[row_col, other_col] = cramers_v(contingency)

    # Plot the heatmap
    plt.figure(figsize=(12, 10), dpi=200)
    mask = np.triu(np.ones_like(cramers_v_df, dtype=bool))  # Mask for the upper triangle
    sns.heatmap(cramers_v_df, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, mask=mask, square=True)
    plt.title(f"Heatmap of Cramér's V (p < {significance_level})")
    return plt
788
+
789
+
790
class UAPVisualizer:
    """Cross-predict analyzer cluster terms with XGBoost and plot association diagnostics."""

    def __init__(self, data=None):
        pass  # Initialization can be added if needed

    def analyze_and_predict(self, data, analyzers, col_names):
        """
        Predict each analyzer's cluster terms from the other analyzers' terms.

        Args:
            data: Not read by this method.
            analyzers (list): Objects exposing a ``cluster_terms`` instance attribute.
            col_names (list): Names paired with ``analyzers`` to label the columns.
        """
        new_data = pd.DataFrame()
        for column, analyzer in zip(col_names, analyzers):
            new_data[f'Analyzer_{column}'] = analyzer.__dict__['cluster_terms']
            print(f"Cluster terms extracted for {column}")

        # Missing terms become their own 'null' category before integer encoding.
        new_data = new_data.fillna('null').astype('category')
        data_nums = new_data.apply(lambda x: x.cat.codes)

        for col in data_nums.columns:
            try:
                categories = new_data[col].cat.categories
                x_train, x_test, y_train, y_test = train_test_split(data_nums.drop(columns=[col]), data_nums[col], test_size=0.2, random_state=42)
                bst, accuracy, preds = self.train_xgboost(x_train, y_train, x_test, y_test, len(categories))
                self.plot_results(new_data, bst, x_test, y_test, preds, categories, accuracy, col)
            except Exception as e:
                # Best effort: one failing column must not stop the others.
                print(f"Error processing {col}: {e}")

    def train_xgboost(self, x_train, y_train, x_test, y_test, num_classes):
        """
        Train a multi-class XGBoost model and score it on the test split.

        Returns:
            tuple: ``(bst, accuracy, preds)`` - the trained Booster, the test
            accuracy, and the predictions for ``x_test``.
        """
        dtrain = xgb.DMatrix(x_train, label=y_train, enable_categorical=True)
        dtest = xgb.DMatrix(x_test, label=y_test)

        params = {'objective': 'multi:softmax', 'num_class': num_classes, 'max_depth': 6, 'eta': 0.3}
        num_round = 100
        bst = xgb.train(dtrain=dtrain, params=params, num_boost_round=num_round)
        preds = bst.predict(dtest)
        accuracy = accuracy_score(y_test, preds)

        print(f"XGBoost trained with accuracy: {accuracy:.2f}")
        return bst, accuracy, preds

    def plot_results(self, new_data, bst, x_test, y_test, preds, categories, accuracy, col):
        """
        Plot feature importance, confusion matrix and contingency table, then save and show.

        The figure is written to ``<col>_<accuracy>_prediction_XGB.jpeg`` in the
        current working directory before being displayed.
        """
        fig, axs = plt.subplots(1, 3, figsize=(25, 5))
        fig.suptitle(f'{col.split(sep=".")[-1]} prediction', fontsize=35)

        plot_importance(bst, ax=axs[0], importance_type='gain', show_values=False)

        # Pin the label set to every category code. Otherwise a class missing
        # from the test split shrinks the matrix and the tick labels taken
        # from `categories` stop lining up with it.
        class_codes = np.arange(len(categories))
        pred_codes = np.asarray(preds).astype(int)
        conf_matrix = confusion_matrix(y_test, pred_codes, labels=class_codes)
        sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', xticklabels=categories, yticklabels=categories, ax=axs[1])
        axs[1].set_title(f'Confusion Matrix\nAccuracy: {accuracy * 100:.2f}%')

        sorted_features = sorted(bst.get_score(importance_type="gain").items(), key=lambda x: x[1], reverse=True)
        most_important_feature = sorted_features[0][0]
        contingency_table = pd.crosstab(new_data[col], new_data[most_important_feature])
        chi2, p, dof, expected = chi2_contingency(contingency_table)
        print(f"Chi-squared test for {col} and {most_important_feature}: p-value = {p}")

        sns.heatmap(contingency_table, annot=True, cmap='Greens', ax=axs[2])
        axs[2].set_title(f'Contingency Table between {col.split(sep=".")[-1]} and {most_important_feature.split(sep=".")[-1]}\np-value = {p}')

        plt.tight_layout()
        plt.savefig(f"{col}_{accuracy:.2f}_prediction_XGB.jpeg", dpi=300)
        plt.show()

    @staticmethod
    def cramers_v(confusion_matrix):
        """
        Calculate the bias-corrected Cramer's V for a contingency table.

        Returns:
            float: Cramer's V, or NaN when the statistic is undefined (fewer than
            two rows or columns, at most one observation, or a corrected
            dimension that collapses to zero).
        """
        chi2 = chi2_contingency(confusion_matrix)[0]
        n = confusion_matrix.sum().sum()
        r, k = confusion_matrix.shape

        # Degenerate tables used to hit a 0/0 division and a RuntimeWarning.
        if n <= 1 or min(r, k) < 2:
            return np.nan

        phi2 = chi2 / n
        phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
        r_corr = r - ((r-1)**2)/(n-1)
        k_corr = k - ((k-1)**2)/(n-1)
        denominator = min((k_corr-1), (r_corr-1))
        if denominator <= 0:
            return np.nan
        return np.sqrt(phi2corr / denominator)

    def plot_cramers_v_heatmap(self, data, significance_level=0.05):
        """
        Show a heatmap of Cramer's V for column pairs whose chi-squared p-value is below the threshold.

        Args:
            data (pd.DataFrame): Table whose columns are the categorical variables.
            significance_level (float): Pairs with ``p >= significance_level`` stay NaN.
        """
        cramers_v_df = pd.DataFrame(index=data.columns, columns=data.columns, data=np.nan)

        for col1 in data.columns:
            for col2 in data.columns:
                if col1 != col2:
                    confusion_matrix = pd.crosstab(data[col1], data[col2])
                    chi2, p, dof, expected = chi2_contingency(confusion_matrix)
                    if p < significance_level:
                        cramers_v_df.at[col1, col2] = UAPVisualizer.cramers_v(confusion_matrix)

        # The stray trailing comma that turned this statement into a throwaway tuple is gone.
        plt.figure(figsize=(10, 8))
        # Hide the diagonal and the upper triangle.
        mask = np.triu(np.ones_like(cramers_v_df, dtype=bool))
        sns.heatmap(cramers_v_df, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, mask=mask, square=True)
        plt.title(f"Heatmap of Cramér's V (p < {significance_level})")
        plt.show()

    def plot_treemap(self, df, column, top_n=32):
        """
        Show a treemap of the ``top_n`` most frequent values of ``column``; the rest become 'Other'.

        Side effect: adds (or overwrites) the column ``<column>_revised`` on ``df``.

        Args:
            df (pd.DataFrame): Source table; modified in place as described above.
            column (str): Column whose value frequencies are plotted.
            top_n (int): Number of most frequent values kept as separate tiles.
        """
        # Get the value counts and the top N labels
        value_counts = df[column].value_counts()
        top_labels = value_counts.iloc[:top_n].index

        # Use np.where to replace all values not in the top N with 'Other'
        revised_column = f'{column}_revised'
        df[revised_column] = np.where(df[column].isin(top_labels), df[column], 'Other')

        # Count once, including the 'Other' category; sizes and labels share the same order.
        revised_counts = df[revised_column].value_counts()
        sizes = revised_counts.values
        labels = revised_counts.index

        colors = list(mcolors.TABLEAU_COLORS.values())

        # Get % of each category
        percents = sizes / sizes.sum()

        # Prepare labels with percentages
        labels = [f'{label}\n {percent:.1%}' for label, percent in zip(labels, percents)]

        # Plot the treemap
        squarify.plot(sizes=sizes, label=labels, alpha=0.7, pad=True, color=colors, text_kwargs={'fontsize': 10})

        ax = plt.gca()

        # Pair every label with its rectangle to adjust colour and size.
        for text, rect in zip(ax.texts, ax.patches):
            background_color = rect.get_facecolor()
            r, g, b, _ = mcolors.to_rgba(background_color)
            brightness = np.average([r, g, b])
            # White text on dark tiles, black on light ones.
            text.set_color('white' if brightness < 0.5 else 'black')

            # Adjust font size based on rectangle's area and wrap long text
            coef = 0.8
            font_size = np.sqrt(rect.get_width() * rect.get_height()) * coef
            text.set_fontsize(font_size)
            wrapped_text = textwrap.fill(text.get_text(), width=20)
            text.set_text(wrapped_text)

        plt.axis('off')
        plt.gca().invert_yaxis()
        plt.gcf().set_size_inches(20, 12)
        plt.show()
927
+
928
+
929
+
930
+
931
class UAPParser:
    """Send free-text reports to an OpenAI chat model and collect the JSON it returns."""

    def __init__(self, api_key, model="gpt-3.5-turbo-0125", col=None, format_long=None):
        """
        Args:
            api_key (str): Written to the ``OPENAI_API_KEY`` environment variable
                before the client is created.
            model (str): Model name passed to every completion request.
            col: Stored on the instance as ``self.col``.
            format_long: Stored on the instance as ``self.format_long``.
        """
        os.environ['OPENAI_API_KEY'] = api_key
        self.client = OpenAI()
        self.model = model
        # Raw response text keyed by the description that produced it.
        self.responses = {}
        # These constructor arguments used to be discarded (col was overwritten with None).
        self.col = col
        self.format_long = format_long

    def fetch_response(self, description, format_long):
        """
        Request a JSON-formatted parse of one description, retrying on rate limits.

        Retries with exponential backoff only for an ``HTTPError`` whose text
        contains 'TooManyRequests'; other ``HTTPError``s are re-raised, and any
        other exception is printed and ends the attempts.
        NOTE(review): confirm the client actually raises ``HTTPError`` for rate limiting.

        Args:
            description (str): The report text to parse.
            format_long: The JSON structure interpolated into the prompt.

        Returns:
            The completion response, or None if no attempt succeeded.
        """
        INITIAL_WAIT_TIME = 5
        MAX_WAIT_TIME = 600
        MAX_RETRIES = 10

        wait_time = INITIAL_WAIT_TIME
        for _ in range(MAX_RETRIES):
            try:
                response = self.client.chat.completions.create(
                    model=self.model,
                    response_format={"type": "json_object"},
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant which is tasked to help parse data."},
                        {"role": "user", "content": f'Input report: {description}\n\n Parse data following this json structure; leave missing data empty: {format_long} Output:'}
                    ]
                )
                return response
            except HTTPError as e:
                if 'TooManyRequests' in str(e):
                    time.sleep(wait_time)
                    wait_time = min(wait_time * 2, MAX_WAIT_TIME)  # Exponential backoff
                else:
                    raise
            except Exception as e:
                print(f"Unexpected error: {e}")
                break

        return None  # Return None if all retries fail

    def process_descriptions(self, descriptions, format_long, max_workers=32):
        """
        Fetch responses for all descriptions concurrently and store the non-empty ones.

        Results are written to ``self.responses``; a failure for one
        description is printed and does not stop the others.
        """
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_desc = {executor.submit(self.fetch_response, desc, format_long): desc for desc in descriptions}

            for future in stqdm(concurrent.futures.as_completed(future_to_desc), total=len(descriptions)):
                desc = future_to_desc[future]
                try:
                    response = future.result()
                    response_text = response.choices[0].message.content if response else None
                    if response_text:
                        self.responses[desc] = response_text
                except Exception as exc:
                    print(f'Error occurred for description {desc}: {exc}')

    def parse_responses(self):
        """
        Decode every stored response as JSON.

        A response that fails to decode is retried once with single quotes
        replaced by double quotes; if that also fails it is only counted.

        Returns:
            dict: Description -> decoded JSON, for responses that could be decoded.
        """
        parsed_responses = {}
        not_parsed = 0
        # Catch only decode-related errors so that e.g. KeyboardInterrupt is no
        # longer swallowed by a bare except. Iterate over a snapshot so the dict
        # may be updated elsewhere meanwhile.
        for key, raw_text in list(self.responses.items()):
            try:
                parsed_responses[key] = json.loads(raw_text)
            except (ValueError, TypeError):
                try:
                    parsed_responses[key] = json.loads(raw_text.replace("'", '"'))
                except (ValueError, TypeError, AttributeError):
                    not_parsed += 1

        print(f"Number of unparsed responses: {not_parsed}")
        print(f"Number of parsed responses: {len(parsed_responses)}")
        return parsed_responses

    def responses_to_df(self, col, parsed_responses):
        """
        Flatten parsed responses into a DataFrame indexed by description.

        Args:
            col: If not None, only the nested object under this top-level key
                is flattened; otherwise each whole response is flattened.
            parsed_responses (dict): Description -> decoded JSON object.

        Returns:
            pd.DataFrame: One row per description with flattened columns.
        """
        if col is not None:
            parsed_df = pd.DataFrame(parsed_responses).T
            parsed_df2 = pd.json_normalize(parsed_df[col])
            parsed_df2.index = parsed_df.index
        else:
            # json_normalize needs the records themselves; the old code handed
            # it a DataFrame, which failed instead of flattening the responses.
            parsed_df2 = pd.json_normalize(list(parsed_responses.values()))
            parsed_df2.index = list(parsed_responses.keys())
        return parsed_df2
1010
+
1011
+
1012
+