File size: 8,023 Bytes
418ff33
 
 
 
2435506
 
418ff33
 
2435506
418ff33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2435506
 
 
418ff33
 
 
 
 
 
2435506
 
418ff33
 
 
 
2435506
418ff33
 
 
 
 
 
2435506
 
 
 
 
418ff33
 
 
 
 
 
 
 
 
2435506
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418ff33
 
 
 
2435506
 
 
 
 
418ff33
 
 
2435506
418ff33
 
 
 
 
2435506
418ff33
 
 
 
 
 
 
 
 
 
 
2435506
418ff33
2435506
 
 
 
 
418ff33
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import gradio as gr
import numpy as np
from scipy.spatial.distance import cosine
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# --- Simulate a small pre-trained Word2Vec model ---
# Raw 4D demo embeddings (unnormalized). They are unit-normalized below,
# which matters for cosine-similarity comparisons.
_RAW_WORD_VECTORS = {
    'cat': [0.9, 0.7, 0.1, 0.2],
    'dog': [0.8, 0.8, 0.3, 0.1],
    'kitten': [0.85, 0.75, 0.15, 0.25],
    'puppy': [0.75, 0.85, 0.25, 0.15],
    'fish': [0.1, 0.2, 0.9, 0.8],
    'bird': [0.2, 0.1, 0.8, 0.9],
    'ocean': [0.05, 0.15, 0.95, 0.85],
    'sky': [0.25, 0.05, 0.85, 0.95],
    'run': [0.6, 0.3, 0.1, 0.1],
    'walk': [0.55, 0.35, 0.15, 0.05],
    'jump': [0.65, 0.25, 0.05, 0.15],
    'king': [0.9, 0.1, 0.1, 0.8],
    'queen': [0.8, 0.2, 0.2, 0.9],
    'man': [0.9, 0.15, 0.05, 0.7],
    'woman': [0.85, 0.1, 0.15, 0.85],
    'prince': [0.88, 0.12, 0.12, 0.82],
    'princess': [0.83, 0.18, 0.18, 0.88],
}

# Normalize each vector to unit length (important for cosine similarity).
dummy_word_vectors = {
    word: np.asarray(raw) / np.linalg.norm(raw)
    for word, raw in _RAW_WORD_VECTORS.items()
}

# --- Function to find nearest neighbors and generate plot ---
def find_nearest_neighbors_and_plot(search_word_input):
    """Rank the dummy vocabulary by cosine similarity to a search word.

    Parameters
    ----------
    search_word_input : str
        Word typed by the user. Matched case-insensitively against the
        dummy vocabulary; surrounding whitespace is ignored.

    Returns
    -------
    tuple
        ``(fig, results_df, message)`` where ``fig`` is a matplotlib
        Figure (``None`` for an unknown word), ``results_df`` is a pandas
        DataFrame of neighbors (or a single "Message" row for an unknown
        word), and ``message`` is a status string.
    """
    # Robustness: tolerate stray whitespace in addition to mixed case.
    search_word = search_word_input.strip().lower()

    if search_word not in dummy_word_vectors:
        return (
            None, # No plot
            pd.DataFrame([{"Message": f"'{search_word}' not found in our dummy vocabulary. Try one of these: {', '.join(list(dummy_word_vectors.keys()))}"}]),
            "Warning: Word not found!"
        )

    target_vector = dummy_word_vectors[search_word]

    # Score every other word. scipy's `cosine` is a *distance*, so
    # similarity = 1 - distance.
    similarities = [
        {"Word": word, "Cosine Similarity": 1 - cosine(target_vector, vector)}
        for word, vector in dummy_word_vectors.items()
        if word != search_word  # don't compare a word to itself
    ]

    results_df = pd.DataFrame(similarities).sort_values(
        by="Cosine Similarity", ascending=False
    ).reset_index(drop=True)

    # The search word plus its top-N neighbors are the highlighted points.
    top_n = 5
    words_to_plot = [search_word] + results_df["Word"].head(top_n).tolist()
    vectors_array = np.array([dummy_word_vectors[w] for w in words_to_plot])

    # Fit PCA on the *whole* vocabulary (not just the selection) so the 2D
    # layout stays consistent across different searches.
    pca = PCA(n_components=2)
    all_vectors_array = np.array(list(dummy_word_vectors.values()))
    pca.fit(all_vectors_array)

    # Transform only the selected vectors.
    transformed_vectors = pca.transform(vectors_array)

    # Create the plot.
    fig, ax = plt.subplots(figsize=(8, 8))

    # Context layer: every vocabulary word as a light grey point, so the
    # PCA space has visible structure around the highlighted words.
    all_transformed_vectors = pca.transform(all_vectors_array)
    for (cx, cy), word in zip(all_transformed_vectors, dummy_word_vectors):
        ax.scatter(cx, cy, color='lightgray', alpha=0.5, s=50)
        ax.text(cx + 0.01, cy + 0.01, word, fontsize=8, color='darkgray')

    # Highlight layer: red diamond = search word, blue circles = neighbors.
    for (x, y), word in zip(transformed_vectors, words_to_plot):
        is_target = word == search_word
        color = 'red' if is_target else 'blue'
        ax.scatter(x, y, color=color, label=word,
                   marker='D' if is_target else 'o',
                   s=150 if is_target else 100, edgecolor='black', zorder=5)
        ax.text(x + 0.01, y + 0.01, word, fontsize=10,
                weight='bold' if is_target else 'normal', color=color, zorder=6)
        # Dashed ray from the origin: visual stand-in for the word vector.
        ax.plot([0, x], [0, y], color=color, linestyle='--', linewidth=1, alpha=0.7)

    # Annotate the angle between the search word and its single closest
    # neighbor (index 1). This replaces a dead loop that iterated over all
    # neighbors but only ever acted on index 1.
    if len(transformed_vectors) > 1:
        search_word_x, search_word_y = transformed_vectors[0]
        # Use the original 4D vectors for the real cosine similarity; the
        # 2D projection is lossy and would distort the angle.
        sim_val = 1 - cosine(target_vector, dummy_word_vectors[words_to_plot[1]])
        # Clip guards against float drift outside arccos' [-1, 1] domain.
        angle_deg = np.degrees(np.arccos(np.clip(sim_val, -1.0, 1.0)))
        mid_x = (transformed_vectors[0][0] + transformed_vectors[1][0]) / 2
        mid_y = (transformed_vectors[0][1] + transformed_vectors[1][1]) / 2
        ax.annotate(f"{angle_deg:.1f}°", xy=(mid_x, mid_y),
                    xytext=(search_word_x + 0.05, search_word_y + 0.05),
                    arrowprops=dict(facecolor='black', shrink=0.05, width=0.5, headwidth=5),
                    fontsize=9, color='green', weight='bold')

    ax.set_title(f"2D Projection of '{search_word}' and its Nearest Neighbors")
    ax.set_xlabel(f"PCA Component 1 (explains {pca.explained_variance_ratio_[0]*100:.1f}%)")
    ax.set_ylabel(f"PCA Component 2 (explains {pca.explained_variance_ratio_[1]*100:.1f}%)")
    ax.grid(True, linestyle=':', alpha=0.6)
    ax.axhline(0, color='gray', linewidth=0.5)
    ax.axvline(0, color='gray', linewidth=0.5)
    ax.set_aspect('equal', adjustable='box')
    plt.tight_layout()

    # Detach the figure from pyplot's global registry so repeated Gradio
    # requests don't accumulate open figures (memory leak). The Figure
    # object itself remains renderable for gr.Plot / savefig.
    plt.close(fig)

    # Round and rename for a cleaner table in the UI.
    results_df["Cosine Similarity"] = results_df["Cosine Similarity"].round(4)
    results_df.columns = ["Neighbor Word", "Similarity Score"] # Rename for UI clarity

    message = f"Found nearest neighbors for '{search_word}'! " \
              f"Red diamond is the search word, blue circles are its closest neighbors. " \
              f"The angle annotation shows the angle between the search word and its closest neighbor."

    return fig, results_df, message

# --- Gradio Interface ---
# Single-function UI: one textbox in, three components out (plot, table,
# status text), matching the 3-tuple returned by
# find_nearest_neighbors_and_plot.
iface = gr.Interface(
    fn=find_nearest_neighbors_and_plot,
    # Free-text input; matching against the vocabulary is case-insensitive
    # because the handler lowercases it.
    inputs=gr.Textbox(
        label="Enter a word to explore its neighbors:",
        placeholder="e.g., cat, king, fish"
    ),
    outputs=[
        # PCA scatter plot; the handler returns None here when the word
        # is not in the vocabulary.
        gr.Plot(label="Word Vector Visualization (PCA 2D)"),
        # Neighbor table; headers mirror the column names set by the
        # handler after its rename step.
        gr.DataFrame(
            headers=["Neighbor Word", "Similarity Score"],
            row_count=5, # Display up to 5 rows by default
            wrap=True,
            interactive=False,
            label="Nearest Neighbors"
        ),
        # Status / help text (third element of the returned tuple).
        gr.Markdown(
            label="Status"
        )
    ],
    title="🚀 Word Vector Explorer: Visualize & Understand Cosine Similarity!",
    description=(
        "Type a word to see its nearest semantic neighbors in the vector space, along with a 2D visualization! "
        "The angle between vectors on the plot is a visual representation of **Cosine Similarity** "
        "(smaller angle = higher similarity). "
        "<br>_Note: This POC uses dummy 4D word vectors projected to 2D using PCA. "
        "In a full version, this would connect to a large pre-trained Word2Vec model!_"
    ),
    # NOTE(review): allow_flagging was deprecated/renamed (flagging_mode)
    # in newer Gradio releases — confirm against the pinned version.
    allow_flagging="never", # Optional: disables the "Flag" button
    # Clickable example inputs rendered beneath the interface.
    examples=[
        ["cat"],
        ["king"],
        ["fish"],
        ["run"]
    ]
)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()