File size: 2,157 Bytes
7b3478d
 
 
 
 
 
 
 
f30d304
05fa263
f30d304
7b3478d
05fa263
7b3478d
05fa263
 
7b3478d
 
 
 
 
 
 
 
 
05fa263
7b3478d
 
 
05fa263
 
 
7b3478d
 
05fa263
7b3478d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05fa263
7b3478d
 
 
 
 
05fa263
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import streamlit as st
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import umap
import pandas as pd
from word2vec import *
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn.manifold import TSNE


def make_3d_plot_tSNE(vectors_list, word, time_slice_model):
    """
    Build a 3D Plotly scatter of a word's nearest neighbours using t-SNE.

    The t-SNE projection is fitted on every vector of the selected model
    (after standard scaling); only the words present in ``vectors_list``
    are kept for the plot.

    Args:
        vectors_list: Sequence of nearest-neighbour entries. Per the original
            documentation each entry is
            ``(word, model_name, vector, cosine_sim)``; only index 0 (the
            word) and index 3 (the cosine similarity) are read here. If a
            word occurs more than once, its first entry is used.
        word: The query word; only used in the figure title.
        time_slice_model: Model name, loaded from
            ``models/{time_slice_model}.model``.

    Returns:
        A ``(fig, df)`` tuple: the Plotly figure and the DataFrame behind it,
        with columns ``word``, ``3d_vector`` and ``cosine_sim``, sorted by
        ``cosine_sim`` in descending order.
    """
    # Load the model and get its word -> vector mapping
    model = load_word2vec_model(f'models/{time_slice_model}.model')
    model_dict = model_dictionary(model)

    # Extract vectors and names from model_dict (same iteration order)
    all_vector_names = list(model_dict.keys())
    all_vectors = list(model_dict.values())

    # Scale vectors
    scaler = StandardScaler()
    vectors_scaled = scaler.fit_transform(all_vectors)

    # Fit t-SNE on the whole vocabulary; random_state is fixed so repeated
    # calls use the same seed
    tsne_model = TSNE(n_components=3, random_state=0)
    tsne_result = tsne_model.fit_transform(vectors_scaled)

    # Index the requested neighbours by word once, keeping the FIRST entry
    # for each word. This replaces a rescan of vectors_list for every
    # vocabulary word while keeping the same "first match wins" behaviour.
    first_entry_by_word = {}
    for entry in vectors_list:
        first_entry_by_word.setdefault(entry[0], entry)

    # Keep only the requested words, in vocabulary order, together with
    # their 3D coordinates and cosine similarity
    result_with_names = [
        (name, coords, first_entry_by_word[name][3])
        for name, coords in zip(all_vector_names, tsne_result)
        if name in first_entry_by_word
    ]

    # Create DataFrame from the transformed vectors
    df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])

    # Sort dataframe by cosine_sim
    df = df.sort_values(by='cosine_sim', ascending=False)

    # Split the 3D coordinates into one series per axis
    x = df['3d_vector'].apply(lambda v: v[0])
    y = df['3d_vector'].apply(lambda v: v[1])
    z = df['3d_vector'].apply(lambda v: v[2])

    # Plot
    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
    fig.update_traces(marker=dict(size=5))
    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')

    return fig, df