# Page-scrape residue (hosting status banner: "Spaces: Sleeping"); not part of the program.
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.figure import Figure
from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA
# --- Simulate a small pre-trained Word2Vec model ---
# Dummy word vectors for demonstration (4D for richer visualization).
# The values are hand-picked so that related words (pets, sea/sky, motion
# verbs, royalty/people) point in similar directions.
dummy_word_vectors = {
    'cat': np.array([0.9, 0.7, 0.1, 0.2]),
    'dog': np.array([0.8, 0.8, 0.3, 0.1]),
    'kitten': np.array([0.85, 0.75, 0.15, 0.25]),
    'puppy': np.array([0.75, 0.85, 0.25, 0.15]),
    'fish': np.array([0.1, 0.2, 0.9, 0.8]),
    'bird': np.array([0.2, 0.1, 0.8, 0.9]),
    'ocean': np.array([0.05, 0.15, 0.95, 0.85]),
    'sky': np.array([0.25, 0.05, 0.85, 0.95]),
    'run': np.array([0.6, 0.3, 0.1, 0.1]),
    'walk': np.array([0.55, 0.35, 0.15, 0.05]),
    'jump': np.array([0.65, 0.25, 0.05, 0.15]),
    'king': np.array([0.9, 0.1, 0.1, 0.8]),
    'queen': np.array([0.8, 0.2, 0.2, 0.9]),
    'man': np.array([0.9, 0.15, 0.05, 0.7]),
    'woman': np.array([0.85, 0.1, 0.15, 0.85]),
    'prince': np.array([0.88, 0.12, 0.12, 0.82]),
    'princess': np.array([0.83, 0.18, 0.18, 0.88]),
}

# Scale every vector to unit length. Cosine distance divides by the norms
# anyway, so similarity scores are unaffected; what this changes is the
# geometry PCA sees, because PCA is later fit on these normalized vectors.
dummy_word_vectors = {
    word: vec / np.linalg.norm(vec)
    for word, vec in dummy_word_vectors.items()
}
# --- Function to find nearest neighbors and generate plot ---
def find_nearest_neighbors_and_plot(search_word_input):
    """Rank the vocabulary by cosine similarity to a word and plot the result.

    The word is looked up in the module-level ``dummy_word_vectors`` mapping.
    All vocabulary vectors are projected to 2D with PCA; the search word and
    its five most similar words are highlighted on top of the full vocabulary.

    Args:
        search_word_input: Word typed by the user. Surrounding whitespace and
            letter case are ignored; ``None`` is treated as an empty string.

    Returns:
        A ``(figure, table, message)`` tuple:

        * figure -- matplotlib ``Figure`` holding the 2D projection, or
          ``None`` when the word is not in the vocabulary.
        * table -- ``pandas.DataFrame`` with columns ``"Neighbor Word"`` and
          ``"Similarity Score"`` (rounded to 4 decimals), sorted from most to
          least similar; a one-row ``"Message"`` frame when the word is
          unknown.
        * message -- status text for the UI.
    """
    # Tolerate a cleared textbox (None) and stray whitespace around the word.
    search_word = (search_word_input or "").strip().lower()

    if search_word not in dummy_word_vectors:
        return (
            None,  # No plot
            pd.DataFrame([{"Message": f"'{search_word}' not found in our dummy vocabulary. Try one of these: {', '.join(dummy_word_vectors)}"}]),
            "Warning: Word not found!",
        )

    target_vector = dummy_word_vectors[search_word]

    # Similarity of every other word to the search word (a word is never
    # compared to itself).
    similarities = [
        {"Word": word, "Cosine Similarity": 1 - cosine(target_vector, vector)}
        for word, vector in dummy_word_vectors.items()
        if word != search_word
    ]
    # Explicit columns keep sort_values valid even if there are no neighbors.
    results_df = (
        pd.DataFrame(similarities, columns=["Word", "Cosine Similarity"])
        .sort_values(by="Cosine Similarity", ascending=False)
        .reset_index(drop=True)
    )

    # Highlight the search word (always first) plus its top N neighbors.
    top_n = 5
    words_to_plot = [search_word, *results_df["Word"].head(top_n)]

    # Fit PCA on the whole vocabulary so the 2D mapping is the same for every
    # search, then pick the highlighted rows out of that single projection.
    all_words = list(dummy_word_vectors)
    all_vectors_array = np.array([dummy_word_vectors[w] for w in all_words])
    pca = PCA(n_components=2)
    all_transformed_vectors = pca.fit(all_vectors_array).transform(all_vectors_array)
    row_of = {word: row for row, word in enumerate(all_words)}
    transformed_vectors = all_transformed_vectors[[row_of[w] for w in words_to_plot]]

    # Build the figure without pyplot: pyplot keeps every figure it creates in
    # a global registry, which leaks memory and shares "current figure" state
    # between requests in a long-running server.
    fig = Figure(figsize=(8, 8))
    ax = fig.subplots()

    # Whole vocabulary as light grey context points.
    ax.scatter(all_transformed_vectors[:, 0], all_transformed_vectors[:, 1],
               color='lightgray', alpha=0.5, s=50)
    for word, (x, y) in zip(all_words, all_transformed_vectors):
        ax.text(x + 0.01, y + 0.01, word, fontsize=8, color='darkgray')

    # Highlighted words: red diamond for the search word, blue circles for
    # its neighbors, each with a dashed line from the origin.
    for word, (x, y) in zip(words_to_plot, transformed_vectors):
        is_search_word = word == search_word
        color = 'red' if is_search_word else 'blue'
        ax.scatter(x, y, color=color, label=word,
                   marker='D' if is_search_word else 'o',
                   s=150 if is_search_word else 100,
                   edgecolor='black', zorder=5)
        ax.text(x + 0.01, y + 0.01, word, fontsize=10,
                weight='bold' if is_search_word else 'normal',
                color=color, zorder=6)
        ax.plot([0, x], [0, y], color=color, linestyle='--', linewidth=1, alpha=0.7)

    # Annotate the angle to the closest neighbor only.
    if len(words_to_plot) > 1:
        search_xy = transformed_vectors[0]
        nearest_xy = transformed_vectors[1]
        # The angle comes from the original 4D similarity, not from the 2D
        # projection. Row 0 of results_df is the closest neighbor and is
        # still unrounded at this point.
        sim_val = results_df["Cosine Similarity"].iloc[0]
        # Clip to handle potential float precision issues before arccos.
        angle_deg = np.degrees(np.arccos(np.clip(sim_val, -1.0, 1.0)))
        midpoint = (search_xy + nearest_xy) / 2
        ax.annotate(f"{angle_deg:.1f}°", xy=(midpoint[0], midpoint[1]),
                    xytext=(search_xy[0] + 0.05, search_xy[1] + 0.05),
                    arrowprops=dict(facecolor='black', shrink=0.05, width=0.5, headwidth=5),
                    fontsize=9, color='green', weight='bold')

    ax.set_title(f"2D Projection of '{search_word}' and its Nearest Neighbors")
    ax.set_xlabel(f"PCA Component 1 (explains {pca.explained_variance_ratio_[0]*100:.1f}%)")
    ax.set_ylabel(f"PCA Component 2 (explains {pca.explained_variance_ratio_[1]*100:.1f}%)")
    ax.grid(True, linestyle=':', alpha=0.6)
    ax.axhline(0, color='gray', linewidth=0.5)
    ax.axvline(0, color='gray', linewidth=0.5)
    ax.set_aspect('equal', adjustable='box')
    fig.tight_layout()

    # Format the DataFrame for better display in Gradio.
    results_df["Cosine Similarity"] = results_df["Cosine Similarity"].round(4)
    results_df.columns = ["Neighbor Word", "Similarity Score"]  # Rename for UI clarity

    message = f"Found nearest neighbors for '{search_word}'! " \
              f"Red diamond is the search word, blue circles are its closest neighbors. " \
              f"The angle annotation shows the angle between the search word and its closest neighbor."
    return fig, results_df, message
# --- Gradio Interface ---
# One text input feeds find_nearest_neighbors_and_plot; its three return values
# map, in order, onto the plot, the neighbor table and the status text below.
iface = gr.Interface(
    fn=find_nearest_neighbors_and_plot,
    inputs=gr.Textbox(
        label="Enter a word to explore its neighbors:",
        placeholder="e.g., cat, king, fish"
    ),
    outputs=[
        gr.Plot(label="Word Vector Visualization (PCA 2D)"),
        gr.DataFrame(
            headers=["Neighbor Word", "Similarity Score"],
            row_count=5,  # Display up to 5 rows by default
            wrap=True,
            interactive=False,
            label="Nearest Neighbors"
        ),
        gr.Markdown(
            label="Status"
        )
    ],
    title="🚀 Word Vector Explorer: Visualize & Understand Cosine Similarity!",
    description=(
        "Type a word to see its nearest semantic neighbors in the vector space, along with a 2D visualization! "
        "The angle between vectors on the plot is a visual representation of **Cosine Similarity** "
        "(smaller angle = higher similarity). "
        "<br>_Note: This POC uses dummy 4D word vectors projected to 2D using PCA. "
        "In a full version, this would connect to a large pre-trained Word2Vec model!_"
    ),
    allow_flagging="never",  # Optional: disables the "Flag" button
    examples=[
        ["cat"],
        ["king"],
        ["fish"],
        ["run"]
    ]
)

# Start the web UI only when the file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()