File size: 8,023 Bytes
418ff33
 
 
 
2435506
 
418ff33
 
2435506
418ff33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2435506
 
 
418ff33
 
 
 
 
 
2435506
 
418ff33
 
 
 
2435506
418ff33
 
 
 
 
 
2435506
 
 
 
 
418ff33
 
 
 
 
 
 
 
 
2435506
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418ff33
 
 
 
2435506
 
 
 
 
418ff33
 
 
2435506
418ff33
 
 
 
 
2435506
418ff33
 
 
 
 
 
 
 
 
 
 
2435506
418ff33
2435506
 
 
 
 
418ff33
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import gradio as gr
import numpy as np
from scipy.spatial.distance import cosine
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# --- Simulate a small pre-trained Word2Vec model ---
# Raw 4D demo embeddings (unnormalized). They are unit-normalized below,
# which matters for cosine-similarity comparisons.
_RAW_WORD_VECTORS = {
    'cat': [0.9, 0.7, 0.1, 0.2],
    'dog': [0.8, 0.8, 0.3, 0.1],
    'kitten': [0.85, 0.75, 0.15, 0.25],
    'puppy': [0.75, 0.85, 0.25, 0.15],
    'fish': [0.1, 0.2, 0.9, 0.8],
    'bird': [0.2, 0.1, 0.8, 0.9],
    'ocean': [0.05, 0.15, 0.95, 0.85],
    'sky': [0.25, 0.05, 0.85, 0.95],
    'run': [0.6, 0.3, 0.1, 0.1],
    'walk': [0.55, 0.35, 0.15, 0.05],
    'jump': [0.65, 0.25, 0.05, 0.15],
    'king': [0.9, 0.1, 0.1, 0.8],
    'queen': [0.8, 0.2, 0.2, 0.9],
    'man': [0.9, 0.15, 0.05, 0.7],
    'woman': [0.85, 0.1, 0.15, 0.85],
    'prince': [0.88, 0.12, 0.12, 0.82],
    'princess': [0.83, 0.18, 0.18, 0.88],
}

# Normalize each vector to unit length (important for cosine similarity).
dummy_word_vectors = {
    word: np.asarray(raw) / np.linalg.norm(raw)
    for word, raw in _RAW_WORD_VECTORS.items()
}

# --- Function to find nearest neighbors and generate plot ---
def find_nearest_neighbors_and_plot(search_word_input):
    """Rank the dummy vocabulary by cosine similarity to a search word.

    Parameters
    ----------
    search_word_input : str
        Word typed by the user. Matched case-insensitively against the
        dummy vocabulary; surrounding whitespace is ignored.

    Returns
    -------
    tuple
        ``(fig, results_df, message)`` where ``fig`` is a matplotlib
        Figure (``None`` for an unknown word), ``results_df`` is a pandas
        DataFrame of neighbors (or a single "Message" row for an unknown
        word), and ``message`` is a status string.
    """
    # Robustness: tolerate stray whitespace in addition to mixed case.
    search_word = search_word_input.strip().lower()

    if search_word not in dummy_word_vectors:
        return (
            None, # No plot
            pd.DataFrame([{"Message": f"'{search_word}' not found in our dummy vocabulary. Try one of these: {', '.join(list(dummy_word_vectors.keys()))}"}]),
            "Warning: Word not found!"
        )

    target_vector = dummy_word_vectors[search_word]

    # Score every other word. scipy's `cosine` is a *distance*, so
    # similarity = 1 - distance.
    similarities = [
        {"Word": word, "Cosine Similarity": 1 - cosine(target_vector, vector)}
        for word, vector in dummy_word_vectors.items()
        if word != search_word  # don't compare a word to itself
    ]

    results_df = pd.DataFrame(similarities).sort_values(
        by="Cosine Similarity", ascending=False
    ).reset_index(drop=True)

    # The search word plus its top-N neighbors are the highlighted points.
    top_n = 5
    words_to_plot = [search_word] + results_df["Word"].head(top_n).tolist()
    vectors_array = np.array([dummy_word_vectors[w] for w in words_to_plot])

    # Fit PCA on the *whole* vocabulary (not just the selection) so the 2D
    # layout stays consistent across different searches.
    pca = PCA(n_components=2)
    all_vectors_array = np.array(list(dummy_word_vectors.values()))
    pca.fit(all_vectors_array)

    # Transform only the selected vectors.
    transformed_vectors = pca.transform(vectors_array)

    # Create the plot.
    fig, ax = plt.subplots(figsize=(8, 8))

    # Context layer: every vocabulary word as a light grey point, so the
    # PCA space has visible structure around the highlighted words.
    all_transformed_vectors = pca.transform(all_vectors_array)
    for (cx, cy), word in zip(all_transformed_vectors, dummy_word_vectors):
        ax.scatter(cx, cy, color='lightgray', alpha=0.5, s=50)
        ax.text(cx + 0.01, cy + 0.01, word, fontsize=8, color='darkgray')

    # Highlight layer: red diamond = search word, blue circles = neighbors.
    for (x, y), word in zip(transformed_vectors, words_to_plot):
        is_target = word == search_word
        color = 'red' if is_target else 'blue'
        ax.scatter(x, y, color=color, label=word,
                   marker='D' if is_target else 'o',
                   s=150 if is_target else 100, edgecolor='black', zorder=5)
        ax.text(x + 0.01, y + 0.01, word, fontsize=10,
                weight='bold' if is_target else 'normal', color=color, zorder=6)
        # Dashed ray from the origin: visual stand-in for the word vector.
        ax.plot([0, x], [0, y], color=color, linestyle='--', linewidth=1, alpha=0.7)

    # Annotate the angle between the search word and its single closest
    # neighbor (index 1). This replaces a dead loop that iterated over all
    # neighbors but only ever acted on index 1.
    if len(transformed_vectors) > 1:
        search_word_x, search_word_y = transformed_vectors[0]
        # Use the original 4D vectors for the real cosine similarity; the
        # 2D projection is lossy and would distort the angle.
        sim_val = 1 - cosine(target_vector, dummy_word_vectors[words_to_plot[1]])
        # Clip guards against float drift outside arccos' [-1, 1] domain.
        angle_deg = np.degrees(np.arccos(np.clip(sim_val, -1.0, 1.0)))
        mid_x = (transformed_vectors[0][0] + transformed_vectors[1][0]) / 2
        mid_y = (transformed_vectors[0][1] + transformed_vectors[1][1]) / 2
        ax.annotate(f"{angle_deg:.1f}°", xy=(mid_x, mid_y),
                    xytext=(search_word_x + 0.05, search_word_y + 0.05),
                    arrowprops=dict(facecolor='black', shrink=0.05, width=0.5, headwidth=5),
                    fontsize=9, color='green', weight='bold')

    ax.set_title(f"2D Projection of '{search_word}' and its Nearest Neighbors")
    ax.set_xlabel(f"PCA Component 1 (explains {pca.explained_variance_ratio_[0]*100:.1f}%)")
    ax.set_ylabel(f"PCA Component 2 (explains {pca.explained_variance_ratio_[1]*100:.1f}%)")
    ax.grid(True, linestyle=':', alpha=0.6)
    ax.axhline(0, color='gray', linewidth=0.5)
    ax.axvline(0, color='gray', linewidth=0.5)
    ax.set_aspect('equal', adjustable='box')
    plt.tight_layout()

    # Detach the figure from pyplot's global registry so repeated Gradio
    # requests don't accumulate open figures (memory leak). The Figure
    # object itself remains renderable for gr.Plot / savefig.
    plt.close(fig)

    # Round and rename for a cleaner table in the UI.
    results_df["Cosine Similarity"] = results_df["Cosine Similarity"].round(4)
    results_df.columns = ["Neighbor Word", "Similarity Score"] # Rename for UI clarity

    message = f"Found nearest neighbors for '{search_word}'! " \
              f"Red diamond is the search word, blue circles are its closest neighbors. " \
              f"The angle annotation shows the angle between the search word and its closest neighbor."

    return fig, results_df, message

# --- Gradio Interface ---
# Single-function UI: one textbox in, three components out (plot, table,
# status text), matching the 3-tuple returned by
# find_nearest_neighbors_and_plot.
iface = gr.Interface(
    fn=find_nearest_neighbors_and_plot,
    # Free-text input; matching against the vocabulary is case-insensitive
    # because the handler lowercases it.
    inputs=gr.Textbox(
        label="Enter a word to explore its neighbors:",
        placeholder="e.g., cat, king, fish"
    ),
    outputs=[
        # PCA scatter plot; the handler returns None here when the word
        # is not in the vocabulary.
        gr.Plot(label="Word Vector Visualization (PCA 2D)"),
        # Neighbor table; headers mirror the column names set by the
        # handler after its rename step.
        gr.DataFrame(
            headers=["Neighbor Word", "Similarity Score"],
            row_count=5, # Display up to 5 rows by default
            wrap=True,
            interactive=False,
            label="Nearest Neighbors"
        ),
        # Status / help text (third element of the returned tuple).
        gr.Markdown(
            label="Status"
        )
    ],
    title="🚀 Word Vector Explorer: Visualize & Understand Cosine Similarity!",
    description=(
        "Type a word to see its nearest semantic neighbors in the vector space, along with a 2D visualization! "
        "The angle between vectors on the plot is a visual representation of **Cosine Similarity** "
        "(smaller angle = higher similarity). "
        "<br>_Note: This POC uses dummy 4D word vectors projected to 2D using PCA. "
        "In a full version, this would connect to a large pre-trained Word2Vec model!_"
    ),
    # NOTE(review): allow_flagging was deprecated/renamed (flagging_mode)
    # in newer Gradio releases — confirm against the pinned version.
    allow_flagging="never", # Optional: disables the "Flag" button
    # Clickable example inputs rendered beneath the interface.
    examples=[
        ["cat"],
        ["king"],
        ["fish"],
        ["run"]
    ]
)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()