Spaces:

fractalz
/

LanguageGames2

Sleeping

App Files Files Community

LanguageGames2 / app.py

$fractalz's picture$

fractalz

Update app.py

2435506 verified 2 months ago

raw

history blame

8.02 kB

	import gradio as gr
	import numpy as np
	from scipy.spatial.distance import cosine
	import pandas as pd
	import matplotlib.pyplot as plt
	from sklearn.decomposition import PCA

	# --- Toy stand-in for a pre-trained Word2Vec model ---
	# Hand-written 4-D embeddings; insertion order below is the vocabulary order.
	dummy_word_vectors = {
	    token: np.array(components)
	    for token, components in (
	        ('cat', [0.9, 0.7, 0.1, 0.2]),
	        ('dog', [0.8, 0.8, 0.3, 0.1]),
	        ('kitten', [0.85, 0.75, 0.15, 0.25]),
	        ('puppy', [0.75, 0.85, 0.25, 0.15]),
	        ('fish', [0.1, 0.2, 0.9, 0.8]),
	        ('bird', [0.2, 0.1, 0.8, 0.9]),
	        ('ocean', [0.05, 0.15, 0.95, 0.85]),
	        ('sky', [0.25, 0.05, 0.85, 0.95]),
	        ('run', [0.6, 0.3, 0.1, 0.1]),
	        ('walk', [0.55, 0.35, 0.15, 0.05]),
	        ('jump', [0.65, 0.25, 0.05, 0.15]),
	        ('king', [0.9, 0.1, 0.1, 0.8]),
	        ('queen', [0.8, 0.2, 0.2, 0.9]),
	        ('man', [0.9, 0.15, 0.05, 0.7]),
	        ('woman', [0.85, 0.1, 0.15, 0.85]),
	        ('prince', [0.88, 0.12, 0.12, 0.82]),
	        ('princess', [0.83, 0.18, 0.18, 0.88]),
	    )
	}

	# Rescale every embedding to unit L2 length so that cosine similarity
	# between any two entries depends only on their direction.
	dummy_word_vectors = {
	    token: embedding / np.linalg.norm(embedding)
	    for token, embedding in dummy_word_vectors.items()
	}

	# --- Function to find nearest neighbors and generate plot ---
	def find_nearest_neighbors_and_plot(search_word_input):
	    """Rank the vocabulary by cosine similarity to a word and plot a 2D PCA view.

	    Args:
	        search_word_input: Word entered by the user. Letter case and
	            surrounding whitespace are ignored; ``None`` is treated as an
	            empty string.

	    Returns:
	        A ``(figure, dataframe, message)`` tuple.

	        * Known word: ``figure`` is a matplotlib ``Figure``; ``dataframe``
	          lists every other vocabulary word with its similarity rounded to
	          4 decimals, highest first, under the columns "Neighbor Word" and
	          "Similarity Score"; ``message`` explains how to read the plot.
	        * Unknown word: ``figure`` is ``None``; ``dataframe`` holds a single
	          "Message" cell listing the available vocabulary; ``message`` is a
	          warning string.
	    """
	    # Build the figure without pyplot: pyplot keeps a global registry of
	    # every figure it creates, so a per-request ``plt.subplots`` that is
	    # never closed leaks figures in a long-running server, and
	    # ``plt.tight_layout()`` acts on a process-wide "current figure".
	    from matplotlib.figure import Figure

	    search_word = (search_word_input or "").strip().lower()

	    if search_word not in dummy_word_vectors:
	        known_words = ', '.join(dummy_word_vectors)
	        return (
	            None,  # No plot
	            pd.DataFrame([{"Message": f"'{search_word}' not found in our dummy vocabulary. Try one of these: {known_words}"}]),
	            "Warning: Word not found!"
	        )

	    target_vector = dummy_word_vectors[search_word]

	    # Similarity of the search word to every *other* vocabulary word.
	    similarities = [
	        {"Word": word, "Cosine Similarity": 1 - cosine(target_vector, vector)}
	        for word, vector in dummy_word_vectors.items()
	        if word != search_word
	    ]
	    # Explicit columns keep the sort valid even if there are no other words.
	    results_df = pd.DataFrame(
	        similarities, columns=["Word", "Cosine Similarity"]
	    ).sort_values(
	        by="Cosine Similarity", ascending=False
	    ).reset_index(drop=True)

	    # The search word followed by its top-N neighbors get highlighted.
	    top_n = 5
	    neighbor_words = results_df["Word"].head(top_n).tolist()
	    words_to_plot = [search_word, *neighbor_words]

	    # Fit PCA on the whole vocabulary so the 2D layout is the same for every
	    # search, then look up the highlighted words in that single projection.
	    all_words = list(dummy_word_vectors)
	    all_vectors_array = np.array([dummy_word_vectors[word] for word in all_words])
	    pca = PCA(n_components=2)
	    pca.fit(all_vectors_array)
	    all_transformed_vectors = pca.transform(all_vectors_array)
	    projected = dict(zip(all_words, all_transformed_vectors))
	    transformed_vectors = np.array([projected[word] for word in words_to_plot])

	    fig = Figure(figsize=(8, 8))
	    ax = fig.subplots()

	    # Whole vocabulary as light grey context points.
	    ax.scatter(all_transformed_vectors[:, 0], all_transformed_vectors[:, 1],
	               color='lightgray', alpha=0.5, s=50)
	    for word, (x, y) in zip(all_words, all_transformed_vectors):
	        ax.text(x + 0.01, y + 0.01, word, fontsize=8, color='darkgray')

	    # Highlighted words: red diamond for the search word, blue circles for
	    # its neighbors, each with a dashed line from the origin.
	    for word, (x, y) in zip(words_to_plot, transformed_vectors):
	        is_target = word == search_word
	        color = 'red' if is_target else 'blue'
	        ax.scatter(x, y, color=color, label=word, marker='D' if is_target else 'o',
	                   s=150 if is_target else 100, edgecolor='black', zorder=5)
	        ax.text(x + 0.01, y + 0.01, word, fontsize=10,
	                weight='bold' if is_target else 'normal', color=color, zorder=6)
	        ax.plot([0, x], [0, y], color=color, linestyle='--', linewidth=1, alpha=0.7)

	    # Annotate the angle between the search word and its closest neighbor.
	    # The angle comes from the original 4D similarity (first row of the
	    # sorted, not-yet-rounded table), not from the 2D projection.
	    if neighbor_words:
	        search_word_x, search_word_y = transformed_vectors[0]
	        nearest_x, nearest_y = transformed_vectors[1]
	        sim_val = results_df["Cosine Similarity"].iloc[0]
	        # Clip guards arccos against values a hair outside [-1, 1].
	        angle_deg = np.degrees(np.arccos(np.clip(sim_val, -1.0, 1.0)))
	        ax.annotate(f"{angle_deg:.1f}°",
	                    xy=((search_word_x + nearest_x) / 2, (search_word_y + nearest_y) / 2),
	                    xytext=(search_word_x + 0.05, search_word_y + 0.05),
	                    arrowprops=dict(facecolor='black', shrink=0.05, width=0.5, headwidth=5),
	                    fontsize=9, color='green', weight='bold')

	    ax.set_title(f"2D Projection of '{search_word}' and its Nearest Neighbors")
	    ax.set_xlabel(f"PCA Component 1 (explains {pca.explained_variance_ratio_[0]*100:.1f}%)")
	    ax.set_ylabel(f"PCA Component 2 (explains {pca.explained_variance_ratio_[1]*100:.1f}%)")
	    ax.grid(True, linestyle=':', alpha=0.6)
	    ax.axhline(0, color='gray', linewidth=0.5)
	    ax.axvline(0, color='gray', linewidth=0.5)
	    ax.set_aspect('equal', adjustable='box')
	    fig.tight_layout()

	    # Format the DataFrame for display: round scores, rename columns for the UI.
	    results_df["Cosine Similarity"] = results_df["Cosine Similarity"].round(4)
	    results_df.columns = ["Neighbor Word", "Similarity Score"]

	    message = f"Found nearest neighbors for '{search_word}'! " \
	              f"Red diamond is the search word, blue circles are its closest neighbors. " \
	              f"The angle annotation shows the angle between the search word and its closest neighbor."

	    return fig, results_df, message

	# --- Gradio Interface ---
	# Input widget: a single free-text box for the word to look up.
	word_input = gr.Textbox(
	    label="Enter a word to explore its neighbors:",
	    placeholder="e.g., cat, king, fish",
	)

	# Output widgets, in the order the handler returns its values:
	# plot, neighbor table, status text.
	plot_output = gr.Plot(label="Word Vector Visualization (PCA 2D)")
	neighbors_output = gr.DataFrame(
	    headers=["Neighbor Word", "Similarity Score"],
	    row_count=5,  # Start the table with 5 rows
	    wrap=True,
	    interactive=False,
	    label="Nearest Neighbors",
	)
	status_output = gr.Markdown(label="Status")

	app_description = (
	    "Type a word to see its nearest semantic neighbors in the vector space, along with a 2D visualization! "
	    "The angle between vectors on the plot is a visual representation of Cosine Similarity "
	    "(smaller angle = higher similarity). "
	    "<br>_Note: This POC uses dummy 4D word vectors projected to 2D using PCA. "
	    "In a full version, this would connect to a large pre-trained Word2Vec model!_"
	)

	# One-click example inputs shown under the textbox.
	example_words = ["cat", "king", "fish", "run"]

	iface = gr.Interface(
	    fn=find_nearest_neighbors_and_plot,
	    inputs=word_input,
	    outputs=[plot_output, neighbors_output, status_output],
	    title="🚀 Word Vector Explorer: Visualize & Understand Cosine Similarity!",
	    description=app_description,
	    allow_flagging="never",  # Hides the "Flag" button
	    examples=[[example] for example in example_words],
	)

	# Start the web app only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()