"""Gradio demo: explore nearest neighbors of a word in a toy embedding space.

A small hand-written vocabulary of 4D "word vectors" stands in for a real
Word2Vec model.  For a query word the app ranks every other vocabulary word by
cosine similarity and draws a 2D PCA projection highlighting the query and its
closest neighbors.
"""
import gradio as gr
import numpy as np
from scipy.spatial.distance import cosine
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Number of nearest neighbors highlighted on the plot.
TOP_N_NEIGHBORS = 5

# --- Simulate a small pre-trained Word2Vec model ---
# Dummy word vectors for demonstration (4D for richer visualization)
dummy_word_vectors = {
    'cat': np.array([0.9, 0.7, 0.1, 0.2]),
    'dog': np.array([0.8, 0.8, 0.3, 0.1]),
    'kitten': np.array([0.85, 0.75, 0.15, 0.25]),
    'puppy': np.array([0.75, 0.85, 0.25, 0.15]),
    'fish': np.array([0.1, 0.2, 0.9, 0.8]),
    'bird': np.array([0.2, 0.1, 0.8, 0.9]),
    'ocean': np.array([0.05, 0.15, 0.95, 0.85]),
    'sky': np.array([0.25, 0.05, 0.85, 0.95]),
    'run': np.array([0.6, 0.3, 0.1, 0.1]),
    'walk': np.array([0.55, 0.35, 0.15, 0.05]),
    'jump': np.array([0.65, 0.25, 0.05, 0.15]),
    'king': np.array([0.9, 0.1, 0.1, 0.8]),
    'queen': np.array([0.8, 0.2, 0.2, 0.9]),
    'man': np.array([0.9, 0.15, 0.05, 0.7]),
    'woman': np.array([0.85, 0.1, 0.15, 0.85]),
    'prince': np.array([0.88, 0.12, 0.12, 0.82]),
    'princess': np.array([0.83, 0.18, 0.18, 0.88]),
}

# Normalize vectors to unit length (important for cosine similarity).
dummy_word_vectors = {
    word: vec / np.linalg.norm(vec) for word, vec in dummy_word_vectors.items()
}

# The vocabulary is fixed, so the PCA projection is fitted once at import time
# instead of on every request.  Fitting on the whole vocabulary keeps the 2D
# positions consistent across different searches.
# NOTE: if dummy_word_vectors is modified after import, these must be rebuilt.
_VOCAB_WORDS = list(dummy_word_vectors)
_VOCAB_MATRIX = np.array([dummy_word_vectors[word] for word in _VOCAB_WORDS])
_VOCAB_PCA = PCA(n_components=2).fit(_VOCAB_MATRIX)
_VOCAB_2D = _VOCAB_PCA.transform(_VOCAB_MATRIX)
_VOCAB_2D_BY_WORD = dict(zip(_VOCAB_WORDS, _VOCAB_2D))


def _rank_neighbors(search_word):
    """Return a DataFrame of all other vocabulary words, most similar first.

    Columns are "Word" and "Cosine Similarity" (unrounded).
    """
    target_vector = dummy_word_vectors[search_word]
    similarities = [
        {"Word": word, "Cosine Similarity": 1 - cosine(target_vector, vector)}
        for word, vector in dummy_word_vectors.items()
        if word != search_word  # Don't compare a word to itself
    ]
    return pd.DataFrame(similarities).sort_values(
        by="Cosine Similarity", ascending=False
    ).reset_index(drop=True)


def _plot_neighbors(search_word, neighbor_words):
    """Draw the 2D PCA view of the vocabulary with the query highlighted.

    The whole vocabulary is drawn in light grey for context; the search word
    is a red diamond and each word in ``neighbor_words`` a blue circle.  The
    angle between the search word and the first neighbor is annotated.

    Returns the matplotlib figure, already detached from pyplot.
    """
    fig, ax = plt.subplots(figsize=(8, 8))

    # Context: every vocabulary word as a faint point with a small label.
    ax.scatter(_VOCAB_2D[:, 0], _VOCAB_2D[:, 1],
               color='lightgray', alpha=0.5, s=50)
    for word, (ctx_x, ctx_y) in zip(_VOCAB_WORDS, _VOCAB_2D):
        ax.text(ctx_x + 0.01, ctx_y + 0.01, word, fontsize=8, color='darkgray')

    # Highlighted words: the query first, then its neighbors.
    for word in [search_word, *neighbor_words]:
        x, y = _VOCAB_2D_BY_WORD[word]
        is_query = word == search_word
        color = 'red' if is_query else 'blue'
        ax.scatter(x, y, color=color, label=word,
                   marker='D' if is_query else 'o',  # Diamond for search word
                   s=150 if is_query else 100,
                   edgecolor='black', zorder=5)
        ax.text(x + 0.01, y + 0.01, word, fontsize=10,
                weight='bold' if is_query else 'normal',
                color=color, zorder=6)
        # Dashed line from the origin to the point (conceptual vector).
        ax.plot([0, x], [0, y], color=color,
                linestyle='--', linewidth=1, alpha=0.7)

    # Annotate the angle to the closest neighbor only.
    if neighbor_words:
        closest_word = neighbor_words[0]
        query_xy = _VOCAB_2D_BY_WORD[search_word]
        closest_xy = _VOCAB_2D_BY_WORD[closest_word]
        # The angle comes from the original 4D vectors, not the 2D projection,
        # so it matches the similarity scores shown in the table.
        sim_val = 1 - cosine(dummy_word_vectors[search_word],
                             dummy_word_vectors[closest_word])
        # Clip to handle potential float precision issues before arccos.
        angle_deg = np.degrees(np.arccos(np.clip(sim_val, -1.0, 1.0)))
        midpoint = (query_xy + closest_xy) / 2
        ax.annotate(
            f"{angle_deg:.1f}°",
            xy=(midpoint[0], midpoint[1]),
            xytext=(query_xy[0] + 0.05, query_xy[1] + 0.05),
            arrowprops=dict(facecolor='black', shrink=0.05,
                            width=0.5, headwidth=5),
            fontsize=9, color='green', weight='bold',
        )

    explained = _VOCAB_PCA.explained_variance_ratio_
    ax.set_title(f"2D Projection of '{search_word}' and its Nearest Neighbors")
    ax.set_xlabel(f"PCA Component 1 (explains {explained[0]*100:.1f}%)")
    ax.set_ylabel(f"PCA Component 2 (explains {explained[1]*100:.1f}%)")
    ax.grid(True, linestyle=':', alpha=0.6)
    ax.axhline(0, color='gray', linewidth=0.5)
    ax.axvline(0, color='gray', linewidth=0.5)
    ax.set_aspect('equal', adjustable='box')
    fig.tight_layout()

    # Remove the figure from pyplot's global registry; otherwise every request
    # leaves an open figure behind.  The Figure object itself stays usable.
    plt.close(fig)
    return fig


# --- Function to find nearest neighbors and generate plot ---
def find_nearest_neighbors_and_plot(search_word_input):
    """Look up a word and return its neighbor plot, ranking table and status.

    Args:
        search_word_input: The word typed by the user.  Surrounding whitespace
            and letter case are ignored; ``None`` is treated as empty.

    Returns:
        A ``(figure, dataframe, message)`` tuple.  When the word is in the
        vocabulary, ``figure`` is the matplotlib figure and ``dataframe`` lists
        every other word with its cosine similarity (rounded to 4 places),
        most similar first.  When it is not, ``figure`` is ``None`` and
        ``dataframe`` holds a single "Message" cell listing the vocabulary.
    """
    search_word = (search_word_input or "").strip().lower()

    if search_word not in dummy_word_vectors:
        vocabulary = ', '.join(dummy_word_vectors)
        return (
            None,  # No plot
            pd.DataFrame([{"Message": f"'{search_word}' not found in our dummy vocabulary. Try one of these: {vocabulary}"}]),
            "Warning: Word not found!"
        )

    results_df = _rank_neighbors(search_word)
    neighbor_words = results_df["Word"].head(TOP_N_NEIGHBORS).tolist()
    fig = _plot_neighbors(search_word, neighbor_words)

    # Format the DataFrame for better display in Gradio
    results_df["Cosine Similarity"] = results_df["Cosine Similarity"].round(4)
    results_df.columns = ["Neighbor Word", "Similarity Score"]  # Rename for UI clarity

    message = (
        f"Found nearest neighbors for '{search_word}'! "
        f"Red diamond is the search word, blue circles are its closest neighbors. "
        f"The angle annotation shows the angle between the search word and its closest neighbor."
    )
    return fig, results_df, message


# --- Gradio Interface ---
iface = gr.Interface(
    fn=find_nearest_neighbors_and_plot,
    inputs=gr.Textbox(
        label="Enter a word to explore its neighbors:",
        placeholder="e.g., cat, king, fish"
    ),
    outputs=[
        gr.Plot(label="Word Vector Visualization (PCA 2D)"),
        gr.DataFrame(
            headers=["Neighbor Word", "Similarity Score"],
            row_count=5,  # Display up to 5 rows by default
            wrap=True,
            interactive=False,
            label="Nearest Neighbors"
        ),
        gr.Markdown(
            label="Status"
        )
    ],
    title="🚀 Word Vector Explorer: Visualize & Understand Cosine Similarity!",
    description=(
        "Type a word to see its nearest semantic neighbors in the vector space, along with a 2D visualization! "
        "The angle between vectors on the plot is a visual representation of **Cosine Similarity** "
        "(smaller angle = higher similarity). "
        "\n_Note: This POC uses dummy 4D word vectors projected to 2D using PCA. "
        "In a full version, this would connect to a large pre-trained Word2Vec model!_"
    ),
    allow_flagging="never",  # Optional: disables the "Flag" button
    examples=[
        ["cat"],
        ["king"],
        ["fish"],
        ["run"]
    ]
)

if __name__ == "__main__":
    iface.launch()