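"""Word Vector Explorer: a Gradio proof-of-concept that finds a word's nearest
neighbors by cosine similarity over toy 4D word vectors and visualizes them in
2D via PCA."""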
import gradio as gr
import numpy as np
from scipy.spatial.distance import cosine
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# --- Simulate a small pre-trained Word2Vec model ---
# Dummy word vectors for demonstration (4D for richer visualization)
dummy_word_vectors = {
    'cat': np.array([0.9, 0.7, 0.1, 0.2]),
    'dog': np.array([0.8, 0.8, 0.3, 0.1]),
    'kitten': np.array([0.85, 0.75, 0.15, 0.25]),
    'puppy': np.array([0.75, 0.85, 0.25, 0.15]),
    'fish': np.array([0.1, 0.2, 0.9, 0.8]),
    'bird': np.array([0.2, 0.1, 0.8, 0.9]),
    'ocean': np.array([0.05, 0.15, 0.95, 0.85]),
    'sky': np.array([0.25, 0.05, 0.85, 0.95]),
    'run': np.array([0.6, 0.3, 0.1, 0.1]),
    'walk': np.array([0.55, 0.35, 0.15, 0.05]),
    'jump': np.array([0.65, 0.25, 0.05, 0.15]),
    'king': np.array([0.9, 0.1, 0.1, 0.8]),
    'queen': np.array([0.8, 0.2, 0.2, 0.9]),
    'man': np.array([0.9, 0.15, 0.05, 0.7]),
    'woman': np.array([0.85, 0.1, 0.15, 0.85]),
    'prince': np.array([0.88, 0.12, 0.12, 0.82]),
    'princess': np.array([0.83, 0.18, 0.18, 0.88])
}
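# These vectors are hand-crafted so that related words (e.g., cat/kitten,
# king/queen) point in similar directions, mimicking the structure a real
# Word2Vec model would learn from data.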
# Normalize vectors (important for cosine similarity)
for word, vec in dummy_word_vectors.items():
    dummy_word_vectors[word] = vec / np.linalg.norm(vec)
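# After normalization every vector has unit length, so cosine similarity
# reduces to a plain dot product:
#   cos(theta) = (u . v) / (||u|| * ||v||) = u . v   when ||u|| = ||v|| = 1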
# --- Function to find nearest neighbors and generate plot ---
def find_nearest_neighbors_and_plot(search_word_input):
    search_word = search_word_input.lower()
    if search_word not in dummy_word_vectors:
        return (
            None,  # No plot
            pd.DataFrame([{"Message": f"'{search_word}' not found in our dummy vocabulary. "
                                      f"Try one of these: {', '.join(dummy_word_vectors.keys())}"}]),
            "Warning: Word not found!"
        )
    target_vector = dummy_word_vectors[search_word]
    similarities = []

    # Collect words and vectors for PCA
    words_to_plot = [search_word]
    vectors_to_plot = [target_vector]

    for word, vector in dummy_word_vectors.items():
        if word != search_word:  # Don't compare a word to itself
            # scipy's cosine() returns a cosine *distance*, so similarity = 1 - distance
            similarity = 1 - cosine(target_vector, vector)
            similarities.append({"Word": word, "Cosine Similarity": similarity})

    results_df = pd.DataFrame(similarities).sort_values(
        by="Cosine Similarity", ascending=False
    ).reset_index(drop=True)
    # Add the top N neighbors to the plot
    top_n = 5
    for _, row in results_df.head(top_n).iterrows():
        words_to_plot.append(row["Word"])
        vectors_to_plot.append(dummy_word_vectors[row["Word"]])

    # Convert to a numpy array for PCA
    vectors_array = np.array(vectors_to_plot)

    # Perform PCA to reduce to 2 dimensions for plotting.
    # Fit PCA on all dummy vectors first to get a consistent mapping;
    # this keeps the relative positions meaningful across different searches.
    pca = PCA(n_components=2)
    all_vectors_array = np.array(list(dummy_word_vectors.values()))
    pca.fit(all_vectors_array)

    # Transform only the selected vectors
    transformed_vectors = pca.transform(vectors_array)
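    # Note: the 4D -> 2D projection is lossy, so distances and angles on the
    # plot only approximate the true relationships in the original space.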
    # Create the plot
    fig, ax = plt.subplots(figsize=(8, 8))

    # Plot every word in the dummy vocabulary as a light grey point
    # to provide context for the PCA space
    all_transformed_vectors = pca.transform(all_vectors_array)
    all_words = list(dummy_word_vectors.keys())
    for i, word in enumerate(all_words):
        ax.scatter(all_transformed_vectors[i, 0], all_transformed_vectors[i, 1],
                   color='lightgray', alpha=0.5, s=50)
        ax.text(all_transformed_vectors[i, 0] + 0.01, all_transformed_vectors[i, 1] + 0.01,
                word, fontsize=8, color='darkgray')

    # Plot the search word and its top neighbors
    for i, word in enumerate(words_to_plot):
        x, y = transformed_vectors[i]
        color = 'red' if word == search_word else 'blue'
        marker = 'D' if word == search_word else 'o'  # Diamond for the search word
        ax.scatter(x, y, color=color, label=word, marker=marker,
                   s=150 if word == search_word else 100, edgecolor='black', zorder=5)
        ax.text(x + 0.01, y + 0.01, word, fontsize=10,
                weight='bold' if word == search_word else 'normal', color=color, zorder=6)
        # Draw a dashed vector from the origin to each point (conceptual vectors)
        ax.plot([0, x], [0, y], color=color, linestyle='--', linewidth=1, alpha=0.7)
    # Annotate the angle between the search word and its closest neighbor.
    # The angle is derived from the original 4D vectors (theta = arccos(similarity)),
    # not from the 2D projection, so it reflects the true cosine similarity.
    search_word_x, search_word_y = transformed_vectors[0]
    if len(transformed_vectors) > 1:
        closest_x, closest_y = transformed_vectors[1]
        sim_val = 1 - cosine(target_vector, dummy_word_vectors[words_to_plot[1]])
        angle_rad = np.arccos(np.clip(sim_val, -1.0, 1.0))  # Clip to guard against float precision issues
        angle_deg = np.degrees(angle_rad)
        ax.annotate(f"{angle_deg:.1f}°",
                    xy=((search_word_x + closest_x) / 2, (search_word_y + closest_y) / 2),
                    xytext=(search_word_x + 0.05, search_word_y + 0.05),
                    arrowprops=dict(facecolor='black', shrink=0.05, width=0.5, headwidth=5),
                    fontsize=9, color='green', weight='bold')
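    # Worked example: a cosine similarity of 0.95 corresponds to
    # arccos(0.95) ≈ 0.318 rad ≈ 18.2°, while a similarity of 1.0 is a 0° angle.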
ax.set_title(f"2D Projection of '{search_word}' and its Nearest Neighbors")
ax.set_xlabel(f"PCA Component 1 (explains {pca.explained_variance_ratio_[0]*100:.1f}%)")
ax.set_ylabel(f"PCA Component 2 (explains {pca.explained_variance_ratio_[1]*100:.1f}%)")
ax.grid(True, linestyle=':', alpha=0.6)
ax.axhline(0, color='gray', linewidth=0.5)
ax.axvline(0, color='gray', linewidth=0.5)
ax.set_aspect('equal', adjustable='box')
plt.tight_layout()
# Format the DataFrame for better display in Gradio
results_df["Cosine Similarity"] = results_df["Cosine Similarity"].round(4)
results_df.columns = ["Neighbor Word", "Similarity Score"] # Rename for UI clarity
message = f"Found nearest neighbors for '{search_word}'! " \
f"Red diamond is the search word, blue circles are its closest neighbors. " \
f"The angle annotation shows the angle between the search word and its closest neighbor."
return fig, results_df, message
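# Quick sanity check (uncomment to try the function without the UI); with the
# dummy vectors above, 'kitten' and 'dog' should rank as the top neighbors of 'cat':
# fig, df, msg = find_nearest_neighbors_and_plot("cat")
# print(df)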
# --- Gradio Interface ---
iface = gr.Interface(
    fn=find_nearest_neighbors_and_plot,
    inputs=gr.Textbox(
        label="Enter a word to explore its neighbors:",
        placeholder="e.g., cat, king, fish"
    ),
    outputs=[
        gr.Plot(label="Word Vector Visualization (PCA 2D)"),
        gr.DataFrame(
            headers=["Neighbor Word", "Similarity Score"],
            row_count=5,  # Display up to 5 rows by default
            wrap=True,
            interactive=False,
            label="Nearest Neighbors"
        ),
        gr.Markdown(
            label="Status"
        )
    ],
    title="🚀 Word Vector Explorer: Visualize & Understand Cosine Similarity!",
    description=(
        "Type a word to see its nearest semantic neighbors in the vector space, along with a 2D visualization! "
        "The angle between vectors on the plot is a visual representation of **Cosine Similarity** "
        "(smaller angle = higher similarity). "
        "<br>_Note: This POC uses dummy 4D word vectors projected to 2D using PCA. "
        "In a full version, this would connect to a large pre-trained Word2Vec model!_"
    ),
    allow_flagging="never",  # Optional: disables the "Flag" button
    examples=[
        ["cat"],
        ["king"],
        ["fish"],
        ["run"]
    ]
)
if __name__ == "__main__":
    iface.launch()
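    # To expose a temporary public URL when running locally, Gradio also
    # supports iface.launch(share=True).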