# LanguageGames2 / app.py
# fractalz's picture
# Update app.py
# 2435506 verified
import gradio as gr
import numpy as np
from scipy.spatial.distance import cosine
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# --- Simulate a small pre-trained Word2Vec model ---
# Dummy word vectors for demonstration (4D for richer visualization).
# The hand-picked values form a few loose clusters (pets, sea/sky, movement
# verbs, royalty/people) so that nearest-neighbour searches return
# intuitive results.
dummy_word_vectors = {
    'cat': np.array([0.9, 0.7, 0.1, 0.2]),
    'dog': np.array([0.8, 0.8, 0.3, 0.1]),
    'kitten': np.array([0.85, 0.75, 0.15, 0.25]),
    'puppy': np.array([0.75, 0.85, 0.25, 0.15]),
    'fish': np.array([0.1, 0.2, 0.9, 0.8]),
    'bird': np.array([0.2, 0.1, 0.8, 0.9]),
    'ocean': np.array([0.05, 0.15, 0.95, 0.85]),
    'sky': np.array([0.25, 0.05, 0.85, 0.95]),
    'run': np.array([0.6, 0.3, 0.1, 0.1]),
    'walk': np.array([0.55, 0.35, 0.15, 0.05]),
    'jump': np.array([0.65, 0.25, 0.05, 0.15]),
    'king': np.array([0.9, 0.1, 0.1, 0.8]),
    'queen': np.array([0.8, 0.2, 0.2, 0.9]),
    'man': np.array([0.9, 0.15, 0.05, 0.7]),
    'woman': np.array([0.85, 0.1, 0.15, 0.85]),
    'prince': np.array([0.88, 0.12, 0.12, 0.82]),
    'princess': np.array([0.83, 0.18, 0.18, 0.88]),
}

# Normalize every vector to unit length. Cosine similarity itself does not
# depend on vector length, but unit vectors make the PCA projection reflect
# direction only. A new dict is built rather than rebinding entries of the
# dict that is being iterated.
dummy_word_vectors = {
    word: vec / np.linalg.norm(vec)
    for word, vec in dummy_word_vectors.items()
}
# --- Function to find nearest neighbors and generate plot ---
def _rank_neighbors(search_word, vocabulary):
    """Rank every other vocabulary word by cosine similarity to *search_word*.

    Returns a DataFrame with the columns "Word" and "Cosine Similarity",
    sorted from most to least similar. The frame is empty (but still has
    both columns) when the vocabulary contains no other word.
    """
    target_vector = vocabulary[search_word]
    rows = [
        # scipy's `cosine` is a distance, so similarity is its complement.
        {"Word": word, "Cosine Similarity": 1 - cosine(target_vector, vector)}
        for word, vector in vocabulary.items()
        if word != search_word  # Don't compare a word to itself
    ]
    ranked = pd.DataFrame(rows, columns=["Word", "Cosine Similarity"])
    return ranked.sort_values(
        by="Cosine Similarity", ascending=False
    ).reset_index(drop=True)


def _draw_projection(search_word, neighbors, vocabulary):
    """Draw the 2D PCA view with the search word and its neighbours highlighted.

    *neighbors* is the top slice of the ranking produced by
    `_rank_neighbors` (unrounded similarities, best match first).
    Returns the matplotlib figure, already detached from pyplot.
    """
    all_words = list(vocabulary)
    all_vectors_array = np.array([vocabulary[word] for word in all_words])

    # Fit PCA on the whole vocabulary so the 2D mapping is the same for every
    # search; the vocabulary is projected once and looked up by word below.
    pca = PCA(n_components=2).fit(all_vectors_array)
    all_transformed = pca.transform(all_vectors_array)
    coords = dict(zip(all_words, all_transformed))

    neighbor_words = neighbors["Word"].tolist()

    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        # Whole vocabulary as light grey points to give the PCA space context
        ax.scatter(all_transformed[:, 0], all_transformed[:, 1],
                   color='lightgray', alpha=0.5, s=50)
        for word, (x, y) in coords.items():
            ax.text(x + 0.01, y + 0.01, word, fontsize=8, color='darkgray')

        # Search word (red diamond) and its neighbours (blue circles)
        for word in [search_word, *neighbor_words]:
            is_target = word == search_word
            x, y = coords[word]
            color = 'red' if is_target else 'blue'
            ax.scatter(x, y, color=color, label=word,
                       marker='D' if is_target else 'o',
                       s=150 if is_target else 100,
                       edgecolor='black', zorder=5)
            ax.text(x + 0.01, y + 0.01, word, fontsize=10,
                    weight='bold' if is_target else 'normal',
                    color=color, zorder=6)
            # Dashed line from the origin to the point (conceptual vector)
            ax.plot([0, x], [0, y], color=color, linestyle='--',
                    linewidth=1, alpha=0.7)

        # Annotate the angle to the closest neighbour only. The angle comes
        # from the similarity in the original space, not from the 2D picture.
        if neighbor_words:
            search_x, search_y = coords[search_word]
            nearest_x, nearest_y = coords[neighbor_words[0]]
            sim_val = neighbors["Cosine Similarity"].iloc[0]
            # Clip to handle potential float precision issues before arccos
            angle_deg = np.degrees(np.arccos(np.clip(sim_val, -1.0, 1.0)))
            ax.annotate(
                f"{angle_deg:.1f}°",
                xy=((search_x + nearest_x) / 2, (search_y + nearest_y) / 2),
                xytext=(search_x + 0.05, search_y + 0.05),
                arrowprops=dict(facecolor='black', shrink=0.05, width=0.5, headwidth=5),
                fontsize=9, color='green', weight='bold',
            )

        ax.set_title(f"2D Projection of '{search_word}' and its Nearest Neighbors")
        ax.set_xlabel(f"PCA Component 1 (explains {pca.explained_variance_ratio_[0]*100:.1f}%)")
        ax.set_ylabel(f"PCA Component 2 (explains {pca.explained_variance_ratio_[1]*100:.1f}%)")
        ax.grid(True, linestyle=':', alpha=0.6)
        ax.axhline(0, color='gray', linewidth=0.5)
        ax.axvline(0, color='gray', linewidth=0.5)
        ax.set_aspect('equal', adjustable='box')
        # Lay out this figure explicitly; plt.tight_layout() would act on
        # pyplot's global "current figure" instead.
        fig.tight_layout()
    finally:
        # Deregister the figure from pyplot so repeated requests do not pile
        # up open figures; the Figure object itself stays usable.
        plt.close(fig)
    return fig


def find_nearest_neighbors_and_plot(search_word_input, *, top_n=5, word_vectors=None):
    """Look up a word, rank its neighbours and plot them in a 2D PCA projection.

    Args:
        search_word_input: Word typed by the user. Surrounding whitespace and
            letter case are ignored; None is treated as an empty string.
        top_n: Number of closest neighbours highlighted in the plot
            (negative values are treated as 0). The returned table always
            lists every other vocabulary word.
        word_vectors: Optional mapping of word -> vector to search instead of
            the module-level ``dummy_word_vectors``. The plot path needs at
            least two vectors with at least two dimensions, because the
            vocabulary is projected with a 2-component PCA.

    Returns:
        A ``(figure, table, message)`` tuple. For an unknown word the figure
        is None, the table holds a single "Message" row listing the known
        words, and the message is a warning. Otherwise the figure is the
        matplotlib plot, the table has the columns "Neighbor Word" and
        "Similarity Score" (rounded to 4 decimals, best match first), and the
        message explains the plot.
    """
    vocabulary = dummy_word_vectors if word_vectors is None else word_vectors
    search_word = (search_word_input or "").strip().lower()

    if search_word not in vocabulary:
        known_words = ', '.join(vocabulary)
        return (
            None,  # No plot
            pd.DataFrame([{"Message": f"'{search_word}' not found in our dummy vocabulary. Try one of these: {known_words}"}]),
            "Warning: Word not found!",
        )

    ranked = _rank_neighbors(search_word, vocabulary)
    fig = _draw_projection(search_word, ranked.head(max(top_n, 0)), vocabulary)

    # Format the DataFrame for better display in Gradio (rounding happens
    # only here, after the plot has used the exact similarities).
    results_df = ranked.round({"Cosine Similarity": 4})
    results_df.columns = ["Neighbor Word", "Similarity Score"]  # Rename for UI clarity

    message = f"Found nearest neighbors for '{search_word}'! " \
              f"Red diamond is the search word, blue circles are its closest neighbors. " \
              f"The angle annotation shows the angle between the search word and its closest neighbor."
    return fig, results_df, message
# --- Gradio Interface ---
def _build_interface():
    """Assemble the Gradio UI: one textbox in; plot, table and status out."""
    word_box = gr.Textbox(
        label="Enter a word to explore its neighbors:",
        placeholder="e.g., cat, king, fish",
    )

    # Output components, in the order the callback returns its values:
    # figure, neighbour table, status message.
    projection_plot = gr.Plot(label="Word Vector Visualization (PCA 2D)")
    neighbors_table = gr.DataFrame(
        headers=["Neighbor Word", "Similarity Score"],
        row_count=5,  # Display up to 5 rows by default
        wrap=True,
        interactive=False,
        label="Nearest Neighbors",
    )
    status_text = gr.Markdown(label="Status")

    intro = (
        "Type a word to see its nearest semantic neighbors in the vector space, along with a 2D visualization! "
        "The angle between vectors on the plot is a visual representation of **Cosine Similarity** "
        "(smaller angle = higher similarity). "
        "<br>_Note: This POC uses dummy 4D word vectors projected to 2D using PCA. "
        "In a full version, this would connect to a large pre-trained Word2Vec model!_"
    )
    sample_words = [["cat"], ["king"], ["fish"], ["run"]]

    return gr.Interface(
        fn=find_nearest_neighbors_and_plot,
        inputs=word_box,
        outputs=[projection_plot, neighbors_table, status_text],
        title="🚀 Word Vector Explorer: Visualize & Understand Cosine Similarity!",
        description=intro,
        allow_flagging="never",  # Optional: disables the "Flag" button
        examples=sample_words,
    )


iface = _build_interface()

if __name__ == "__main__":
    # Start the web server only when the file is run as a script.
    iface.launch()