import networkx as nx
from sklearn.cluster import HDBSCAN
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
import umap
from sklearn.cluster import KMeans
from adjustText import adjust_text

from constants import high_level_families, primary_families_branches


def filter_languages_by_families(matrix, languages, families):
    """
    Restricts a distance matrix and its language list to the given families.

    Parameters:
    - matrix: 2D NumPy array indexed in the same order as `languages`.
    - languages: list of languages to filter.
    - families: collection of families to include; a language is kept when
      `high_level_families[lang]` is in it.

    Returns:
    - filtered_matrix: sub-matrix with only the rows/columns of the kept languages.
    - filtered_languages: list of kept languages, in their original order.
    """
    kept = [
        (index, lang)
        for index, lang in enumerate(languages)
        if high_level_families[lang] in families
    ]
    kept_indices = [index for index, _ in kept]
    kept_languages = [lang for _, lang in kept]

    # np.ix_ selects the same positions on both axes, keeping the matrix square.
    filtered_matrix = matrix[np.ix_(kept_indices, kept_indices)]
    return filtered_matrix, kept_languages


def get_dynamic_color_map(n_colors):
    """
    Generates a list of colors with the specified number of entries.

    Uses the "tab20" colormap for up to 20 colors and "hsv" otherwise, sampling
    the colormap at `i / n_colors` for each `i` in `range(n_colors)`.

    Parameters:
    - n_colors: int, the number of colors required.

    Returns:
    - color_map: list of `n_colors` RGBA tuples.
    """
    cmap = plt.get_cmap("tab20") if n_colors <= 20 else plt.get_cmap("hsv")
    return [cmap(i / n_colors) for i in range(n_colors)]


def _index_labels(labels):
    """Maps each label to its position in the sorted list of distinct labels."""
    legend = sorted(set(labels))
    # Dict lookup instead of list.index() avoids a linear scan per label.
    position = {label: index for index, label in enumerate(legend)}
    clusters = [position[label] for label in labels]
    return clusters, legend


def cluster_languages_by_families(languages):
    """
    Assigns each language a cluster ID based on its high-level family.

    Parameters:
    - languages: list of languages (keys of `high_level_families`).

    Returns:
    - clusters: list of int cluster IDs, one per language.
    - legend: sorted list of distinct family names; `legend[cluster_id]` is the
      family name for that cluster.
    """
    return _index_labels([high_level_families[lang] for lang in languages])


def cluster_languages_by_subfamilies(languages):
    """
    Assigns each language a cluster ID based on its family and primary branch.

    Labels have the form "<family> (<branch>)".

    Parameters:
    - languages: list of languages (keys of `high_level_families` and
      `primary_families_branches`).

    Returns:
    - clusters: list of int cluster IDs, one per language.
    - legend: sorted list of distinct labels; `legend[cluster_id]` is the label
      for that cluster.
    """
    labels = [
        high_level_families[lang] + f" ({primary_families_branches[lang]})"
        for lang in languages
    ]
    return _index_labels(labels)


def _cluster_color_lookup(clusters):
    """Returns the sorted distinct cluster IDs and a cluster -> color mapping."""
    unique_clusters = sorted(set(clusters))
    palette = get_dynamic_color_map(len(unique_clusters))
    cluster_colors = dict(zip(unique_clusters, palette))
    return unique_clusters, cluster_colors


def _draw_adjusted_labels(ax, labels, coords):
    """Draws one text label per point, then lets adjustText reduce overlaps.

    `coords[i]` must unpack to the (x, y) position of `labels[i]`.
    """
    texts = []
    for i, label in enumerate(labels):
        x, y = coords[i]
        texts.append(ax.text(x, y, label, fontsize=10))

    adjust_text(texts, expand_text=(1.05, 1.2))


def _add_cluster_legend(ax, unique_clusters, cluster_colors, legend, alpha=None):
    """Adds a "Clusters" legend with one colored marker per cluster.

    When `legend` is None the cluster ID itself (as a string) is used as label;
    otherwise `legend[cluster]` supplies the label.
    """
    if legend is None:
        legend = {cluster: str(cluster) for cluster in unique_clusters}

    handles = [
        plt.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor=cluster_colors[cluster],
            markersize=10,
            alpha=alpha,
            label=legend[cluster],
        )
        for cluster in unique_clusters
    ]
    ax.legend(handles=handles, title="Clusters", loc="best")


def _plot_embedding(points, languages, clusters, legend, fig_size):
    """Scatter-plots 2D points colored by cluster, with labels and a legend."""
    unique_clusters, cluster_colors = _cluster_color_lookup(clusters)

    fig, ax = plt.subplots(figsize=fig_size)
    ax.scatter(
        points[:, 0],
        points[:, 1],
        c=[cluster_colors[cluster] for cluster in clusters],
        alpha=0.7,
    )

    _draw_adjusted_labels(ax, languages, points)
    _add_cluster_legend(ax, unique_clusters, cluster_colors, legend)

    ax.axis("off")
    return fig


def plot_mst(
    matrix,
    languages,
    clusters,
    legend=None,
    fig_size=(20, 20),
):
    """
    Plots a Minimum Spanning Tree (MST) built from a distance matrix.

    Parameters:
    - matrix: 2D NumPy array (N x N) of pairwise distances; only the upper
      triangle (i < j) is read.
    - languages: list of length N containing the label for each node.
    - clusters: list of length N containing the cluster assignment (or ID) for each node.
    - legend: optional mapping/sequence giving the legend label for each cluster
      ID (`legend[cluster]`); defaults to the cluster ID as a string.
    - fig_size: figure size passed to `plt.subplots`.

    Returns:
    - fig: the Matplotlib figure containing the plot.
    """
    n_nodes = len(languages)

    # Build the complete undirected graph. Nodes are added explicitly so that a
    # node with no edges (e.g. a single language) still exists in the graph and
    # therefore in the layout.
    graph = nx.Graph()
    graph.add_nodes_from(range(n_nodes))
    graph.add_weighted_edges_from(
        (i, j, matrix[i, j]) for i in range(n_nodes) for j in range(i + 1, n_nodes)
    )

    mst = nx.minimum_spanning_tree(graph)

    # Kamada-Kawai takes edge weights into account when placing nodes.
    pos = nx.kamada_kawai_layout(mst, weight="weight")

    unique_clusters, cluster_colors = _cluster_color_lookup(clusters)
    node_colors = [cluster_colors[cluster] for cluster in clusters]

    fig, ax = plt.subplots(figsize=fig_size)

    nx.draw_networkx_edges(mst, pos, edge_color="gray", ax=ax)
    # nodelist is given explicitly so node_colors[i] always belongs to node i.
    nx.draw_networkx_nodes(
        mst,
        pos,
        nodelist=list(range(n_nodes)),
        node_color=node_colors,
        node_size=100,
        ax=ax,
        alpha=0.7,
    )

    _draw_adjusted_labels(ax, languages, pos)
    _add_cluster_legend(ax, unique_clusters, cluster_colors, legend, alpha=0.7)

    ax.axis("off")
    return fig


def cluster_languages_kmeans(dist_matrix, languages, n_clusters=5):
    """
    Clusters languages with KMeans applied to the rows of a distance matrix.

    Each row of `dist_matrix` is passed to KMeans as the feature vector of the
    corresponding language. No languages are filtered out.

    Parameters:
    - dist_matrix: 2D NumPy array (N x N) representing the pairwise distances between languages.
    - languages: list of length N with the language for each row.
    - n_clusters: int, the number of clusters to form.

    Returns:
    - dist_matrix: the input matrix, unchanged.
    - languages: the input languages, unchanged.
    - clusters: array of length N with the cluster ID of each language.
    """
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=23)
    clusters = kmeans_model.fit_predict(dist_matrix)
    return dist_matrix, languages, clusters


def cluster_languages_hdbscan(dist_matrix, languages, min_cluster_size=2):
    """
    Clusters languages with HDBSCAN on a precomputed distance matrix and drops
    the languages labelled as noise (cluster -1).

    Parameters:
    - dist_matrix: 2D NumPy array (N x N) representing the pairwise distances between languages.
    - languages: list of length N with the language for each row.
    - min_cluster_size: int, the minimum size of clusters.

    Returns:
    - filtered_matrix: sub-matrix restricted to the languages that were assigned a cluster.
    - filtered_languages: NumPy array of the languages that were assigned a cluster.
    - filtered_clusters: NumPy array with the cluster ID of each kept language.
    """
    clustering_model = HDBSCAN(metric="precomputed", min_cluster_size=min_cluster_size)
    clusters = clustering_model.fit_predict(dist_matrix)

    # Keep only the points that HDBSCAN did not label -1.
    valid_indices = np.where(clusters != -1)[0]
    filtered_matrix = dist_matrix[np.ix_(valid_indices, valid_indices)]
    filtered_languages = np.array(languages)[valid_indices]
    filtered_clusters = np.array(clusters)[valid_indices]
    return filtered_matrix, filtered_languages, filtered_clusters


def plot_distances_tsne(
    matrix,
    languages,
    clusters,
    legend=None,
    fig_size=(16, 12),
):
    """
    Plots all languages from the distances matrix using t-SNE and colors them by clusters.

    Parameters:
    - matrix: 2D NumPy array (N x N) of pairwise distances (used as a precomputed metric).
    - languages: list of length N containing the label for each point.
    - clusters: list of length N containing the cluster assignment (or ID) for each point.
    - legend: optional mapping/sequence giving the legend label for each cluster
      ID (`legend[cluster]`); defaults to the cluster ID as a string.
    - fig_size: figure size passed to `plt.subplots`.

    Returns:
    - fig: the Matplotlib figure containing the plot.
    """
    # scikit-learn requires perplexity < n_samples, so the default of 30 fails
    # on small language sets. Cap it at n_samples - 1; larger inputs keep 30.
    n_samples = np.shape(matrix)[0]
    perplexity = min(30.0, max(n_samples - 1, 1))

    tsne = TSNE(
        n_components=2,
        random_state=23,
        metric="precomputed",
        init="random",
        perplexity=perplexity,
    )
    tsne_results = tsne.fit_transform(matrix)

    return _plot_embedding(tsne_results, languages, clusters, legend, fig_size)


def plot_distances_umap(
    matrix,
    languages,
    clusters,
    legend=None,
    fig_size=(16, 12),
):
    """
    Plots all languages from the distances matrix using UMAP and colors them by clusters.

    Parameters:
    - matrix: 2D NumPy array (N x N) of pairwise distances (used as a precomputed metric).
    - languages: list of length N containing the label for each point.
    - clusters: list of length N containing the cluster assignment (or ID) for each point.
    - legend: optional mapping/sequence giving the legend label for each cluster
      ID (`legend[cluster]`); defaults to the cluster ID as a string.
    - fig_size: figure size passed to `plt.subplots`.

    Returns:
    - fig: the Matplotlib figure containing the plot.
    """
    umap_model = umap.UMAP(metric="precomputed", random_state=23)
    umap_results = umap_model.fit_transform(matrix)

    return _plot_embedding(umap_results, languages, clusters, legend, fig_size)