LanguageGames2 / app.py
fractalz's picture
Update app.py
2435506 verified
raw
history blame
8.02 kB
import gradio as gr
import numpy as np
from scipy.spatial.distance import cosine
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# --- Simulate a small pre-trained Word2Vec model ---
# Hand-written 4D stand-ins for real embeddings. Each entry maps a vocabulary
# word to its raw (un-normalized) vector; related words were given nearby
# coordinates so that the neighbor search has something to find.
dummy_word_vectors = {
    'cat': np.array([0.9, 0.7, 0.1, 0.2]),
    'dog': np.array([0.8, 0.8, 0.3, 0.1]),
    'kitten': np.array([0.85, 0.75, 0.15, 0.25]),
    'puppy': np.array([0.75, 0.85, 0.25, 0.15]),
    'fish': np.array([0.1, 0.2, 0.9, 0.8]),
    'bird': np.array([0.2, 0.1, 0.8, 0.9]),
    'ocean': np.array([0.05, 0.15, 0.95, 0.85]),
    'sky': np.array([0.25, 0.05, 0.85, 0.95]),
    'run': np.array([0.6, 0.3, 0.1, 0.1]),
    'walk': np.array([0.55, 0.35, 0.15, 0.05]),
    'jump': np.array([0.65, 0.25, 0.05, 0.15]),
    'king': np.array([0.9, 0.1, 0.1, 0.8]),
    'queen': np.array([0.8, 0.2, 0.2, 0.9]),
    'man': np.array([0.9, 0.15, 0.05, 0.7]),
    'woman': np.array([0.85, 0.1, 0.15, 0.85]),
    'prince': np.array([0.88, 0.12, 0.12, 0.82]),
    'princess': np.array([0.83, 0.18, 0.18, 0.88]),
}

# Rescale every vector to unit length so that all entries live on the unit
# sphere, as is conventional before comparing embeddings by cosine similarity.
dummy_word_vectors = {
    token: embedding / np.linalg.norm(embedding)
    for token, embedding in dummy_word_vectors.items()
}
# --- Function to find nearest neighbors and generate plot ---
def _draw_neighbor_figure(search_word, words_to_plot, closest_similarity):
    """Draw the 2D PCA view of the vocabulary with the selected words highlighted.

    Args:
        search_word: The vocabulary word being explored (drawn as a red diamond).
        words_to_plot: ``search_word`` followed by its nearest neighbors, closest
            first (neighbors are drawn as blue circles).
        closest_similarity: Cosine similarity between ``search_word`` and its
            closest neighbor, used for the angle annotation; ``None`` skips it.

    Returns:
        The matplotlib figure. It has already been released from pyplot's
        global registry, but the object itself is still renderable.
    """
    all_words = list(dummy_word_vectors)
    all_vectors_array = np.array([dummy_word_vectors[w] for w in all_words])

    # Fit PCA on the whole vocabulary so the 2D layout is the same for every
    # search; only the highlighting changes between calls.
    pca = PCA(n_components=2)
    pca.fit(all_vectors_array)
    all_transformed_vectors = pca.transform(all_vectors_array)
    projected = dict(zip(all_words, all_transformed_vectors))

    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        # Whole vocabulary as light grey context points.
        ax.scatter(all_transformed_vectors[:, 0], all_transformed_vectors[:, 1],
                   color='lightgray', alpha=0.5, s=50)
        for (x, y), word in zip(all_transformed_vectors, all_words):
            ax.text(x + 0.01, y + 0.01, word, fontsize=8, color='darkgray')

        # Search word and its neighbors on top of the context points.
        for word in words_to_plot:
            x, y = projected[word]
            is_target = word == search_word
            color = 'red' if is_target else 'blue'
            ax.scatter(x, y, color=color, label=word,
                       marker='D' if is_target else 'o',  # Diamond for search word
                       s=150 if is_target else 100, edgecolor='black', zorder=5)
            ax.text(x + 0.01, y + 0.01, word, fontsize=10,
                    weight='bold' if is_target else 'normal', color=color, zorder=6)
            # Dashed line from the plot origin to the point. PCA centers the
            # data, so (0, 0) here is the vocabulary mean in the projected space.
            ax.plot([0, x], [0, y], color=color, linestyle='--', linewidth=1, alpha=0.7)

        # Annotate the angle between the search word and its closest neighbor.
        # The angle comes from the original 4D cosine similarity, not from the
        # 2D projection, so it matches the score shown in the table.
        if closest_similarity is not None and len(words_to_plot) > 1:
            search_xy = projected[search_word]
            closest_xy = projected[words_to_plot[1]]
            # Clip to guard arccos against float error pushing |sim| above 1.
            angle_deg = np.degrees(np.arccos(np.clip(closest_similarity, -1.0, 1.0)))
            ax.annotate(f"{angle_deg:.1f}°",
                        xy=((search_xy[0] + closest_xy[0]) / 2, (search_xy[1] + closest_xy[1]) / 2),
                        xytext=(search_xy[0] + 0.05, search_xy[1] + 0.05),
                        arrowprops=dict(facecolor='black', shrink=0.05, width=0.5, headwidth=5),
                        fontsize=9, color='green', weight='bold')

        ax.set_title(f"2D Projection of '{search_word}' and its Nearest Neighbors")
        ax.set_xlabel(f"PCA Component 1 (explains {pca.explained_variance_ratio_[0]*100:.1f}%)")
        ax.set_ylabel(f"PCA Component 2 (explains {pca.explained_variance_ratio_[1]*100:.1f}%)")
        ax.grid(True, linestyle=':', alpha=0.6)
        ax.axhline(0, color='gray', linewidth=0.5)
        ax.axvline(0, color='gray', linewidth=0.5)
        ax.set_aspect('equal', adjustable='box')
        # Use the figure's own method: plt.tight_layout() targets pyplot's
        # global "current figure", which may not be this one.
        fig.tight_layout()
    finally:
        # plt.subplots registers the figure with pyplot; without closing it,
        # every request would leave a figure alive for the process lifetime.
        plt.close(fig)
    return fig


def find_nearest_neighbors_and_plot(search_word_input):
    """Rank the vocabulary by cosine similarity to a word and plot the top matches.

    Args:
        search_word_input: Text typed by the user. Case and surrounding
            whitespace are ignored; ``None`` is treated as an empty string.

    Returns:
        A ``(figure, dataframe, message)`` tuple.

        * Word found: the matplotlib figure, a DataFrame with columns
          ``"Neighbor Word"`` / ``"Similarity Score"`` covering every other
          vocabulary word (most similar first, rounded to 4 decimals), and a
          status message.
        * Word not found: ``None`` for the figure, a one-row DataFrame with a
          ``"Message"`` column listing the known words, and a warning string.
    """
    search_word = (search_word_input or "").strip().lower()
    if search_word not in dummy_word_vectors:
        return (
            None,  # No plot
            pd.DataFrame([{"Message": f"'{search_word}' not found in our dummy vocabulary. Try one of these: {', '.join(dummy_word_vectors)}"}]),
            "Warning: Word not found!"
        )

    target_vector = dummy_word_vectors[search_word]
    # Score every other word; a word is never compared with itself.
    similarities = [
        {"Word": word, "Cosine Similarity": 1 - cosine(target_vector, vector)}
        for word, vector in dummy_word_vectors.items()
        if word != search_word
    ]
    results_df = pd.DataFrame(similarities).sort_values(
        by="Cosine Similarity", ascending=False
    ).reset_index(drop=True)

    # Highlight only the top N neighbors in the plot (the table lists them all).
    top_n = 5
    words_to_plot = [search_word] + results_df["Word"].head(top_n).tolist()
    # Take the unrounded score for the angle annotation before formatting below.
    closest_similarity = results_df["Cosine Similarity"].iloc[0] if len(results_df) else None

    fig = _draw_neighbor_figure(search_word, words_to_plot, closest_similarity)

    # Format the DataFrame for better display in Gradio
    results_df["Cosine Similarity"] = results_df["Cosine Similarity"].round(4)
    results_df.columns = ["Neighbor Word", "Similarity Score"]  # Rename for UI clarity

    message = f"Found nearest neighbors for '{search_word}'! " \
              f"Red diamond is the search word, blue circles are its closest neighbors. " \
              f"The angle annotation shows the angle between the search word and its closest neighbor."
    return fig, results_df, message
# --- Gradio Interface ---
# The UI is a single-function Interface: one text box in, and three outputs
# matching the (figure, dataframe, message) tuple returned by
# find_nearest_neighbors_and_plot.

_word_input = gr.Textbox(
    label="Enter a word to explore its neighbors:",
    placeholder="e.g., cat, king, fish",
)

_plot_output = gr.Plot(label="Word Vector Visualization (PCA 2D)")

_table_output = gr.DataFrame(
    headers=["Neighbor Word", "Similarity Score"],
    row_count=5,  # Display up to 5 rows by default
    wrap=True,
    interactive=False,
    label="Nearest Neighbors",
)

_status_output = gr.Markdown(label="Status")

# Markdown/HTML blurb shown under the title.
_app_description = "".join([
    "Type a word to see its nearest semantic neighbors in the vector space, along with a 2D visualization! ",
    "The angle between vectors on the plot is a visual representation of **Cosine Similarity** ",
    "(smaller angle = higher similarity). ",
    "<br>_Note: This POC uses dummy 4D word vectors projected to 2D using PCA. ",
    "In a full version, this would connect to a large pre-trained Word2Vec model!_",
])

# One-click sample inputs; each example is a single-element row for the text box.
_example_rows = [[sample] for sample in ("cat", "king", "fish", "run")]

iface = gr.Interface(
    fn=find_nearest_neighbors_and_plot,
    inputs=_word_input,
    outputs=[_plot_output, _table_output, _status_output],
    title="🚀 Word Vector Explorer: Visualize & Understand Cosine Similarity!",
    description=_app_description,
    allow_flagging="never",  # Optional: disables the "Flag" button
    examples=_example_rows,
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()