fractalz committed · verified
Commit 2435506 · Parent(s): 36ac795

Update app.py

Files changed (1):
  app.py  +102 -11
app.py CHANGED
@@ -2,9 +2,11 @@ import gradio as gr
 import numpy as np
 from scipy.spatial.distance import cosine
 import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
 
 # --- Simulate a small pre-trained Word2Vec model ---
-# Dummy word vectors for demonstration
+# Dummy word vectors for demonstration (4D for richer visualization)
 dummy_word_vectors = {
     'cat': np.array([0.9, 0.7, 0.1, 0.2]),
     'dog': np.array([0.8, 0.8, 0.3, 0.1]),
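
Note: the raw dummy vectors above are not unit-length as written; the loop in the next hunk rescales them. A quick standalone sketch of what that normalization does (illustrative only, not part of the commit):

    import numpy as np

    cat = np.array([0.9, 0.7, 0.1, 0.2])
    print(np.linalg.norm(cat))           # ~1.162: 'cat' is not yet a unit vector
    cat_unit = cat / np.linalg.norm(cat)
    print(np.linalg.norm(cat_unit))      # 1.0 after dividing by the norm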
@@ -20,25 +22,33 @@ dummy_word_vectors = {
     'king': np.array([0.9, 0.1, 0.1, 0.8]),
     'queen': np.array([0.8, 0.2, 0.2, 0.9]),
     'man': np.array([0.9, 0.15, 0.05, 0.7]),
-    'woman': np.array([0.85, 0.1, 0.15, 0.85])
+    'woman': np.array([0.85, 0.1, 0.15, 0.85]),
+    'prince': np.array([0.88, 0.12, 0.12, 0.82]),
+    'princess': np.array([0.83, 0.18, 0.18, 0.88])
 }
 
 # Normalize vectors (important for cosine similarity)
 for word, vec in dummy_word_vectors.items():
     dummy_word_vectors[word] = vec / np.linalg.norm(vec)
 
-# --- Function to find nearest neighbors ---
-def find_nearest_neighbors(search_word_input):
+# --- Function to find nearest neighbors and generate plot ---
+def find_nearest_neighbors_and_plot(search_word_input):
     search_word = search_word_input.lower()
 
     if search_word not in dummy_word_vectors:
         return (
+            None,  # No plot
             pd.DataFrame([{"Message": f"'{search_word}' not found in our dummy vocabulary. Try one of these: {', '.join(list(dummy_word_vectors.keys()))}"}]),
             "Warning: Word not found!"
         )
 
     target_vector = dummy_word_vectors[search_word]
     similarities = []
+
+    # Collect words and vectors for PCA
+    words_to_plot = [search_word]
+    vectors_to_plot = [target_vector]
+
     for word, vector in dummy_word_vectors.items():
         if word != search_word:  # Don't compare a word to itself
             similarity = 1 - cosine(target_vector, vector)
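
Cosine similarity is scale-invariant, so the normalization above doesn't change the `1 - cosine(...)` values; what it buys is that similarity then reduces to a plain dot product of unit vectors. A minimal sketch with two vectors from this file (illustrative, not part of the commit):

    import numpy as np
    from scipy.spatial.distance import cosine

    king = np.array([0.9, 0.1, 0.1, 0.8])
    queen = np.array([0.8, 0.2, 0.2, 0.9])
    king_u = king / np.linalg.norm(king)
    queen_u = queen / np.linalg.norm(queen)

    # scipy's cosine() is a *distance*; 1 - distance gives the similarity
    print(round(1 - cosine(king, queen), 4))    # ~0.9869
    print(round(float(king_u @ queen_u), 4))    # same value as a plain dot product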
@@ -48,21 +58,100 @@ def find_nearest_neighbors(search_word_input):
         by="Cosine Similarity", ascending=False
     ).reset_index(drop=True)
 
+    # Add top N neighbors to plot (e.g., top 5)
+    top_n = 5
+    for _, row in results_df.head(top_n).iterrows():
+        words_to_plot.append(row["Word"])
+        vectors_to_plot.append(dummy_word_vectors[row["Word"]])
+
+    # Convert to numpy array for PCA
+    vectors_array = np.array(vectors_to_plot)
+
+    # Perform PCA to reduce to 2 dimensions for plotting
+    pca = PCA(n_components=2)
+    # Fit PCA on all dummy vectors first to get a consistent mapping.
+    # This helps keep the relative positions meaningful across different searches.
+    all_vectors_array = np.array(list(dummy_word_vectors.values()))
+    pca.fit(all_vectors_array)
+
+    # Transform only the selected vectors
+    transformed_vectors = pca.transform(vectors_array)
+
+    # Create the plot
+    fig, ax = plt.subplots(figsize=(8, 8))
+
+    # Plot all words in the dummy vocabulary as light grey points
+    # to provide some context for the PCA space
+    all_transformed_vectors = pca.transform(all_vectors_array)
+    all_words = list(dummy_word_vectors.keys())
+    for i, word in enumerate(all_words):
+        ax.scatter(all_transformed_vectors[i, 0], all_transformed_vectors[i, 1],
+                   color='lightgray', alpha=0.5, s=50)
+        ax.text(all_transformed_vectors[i, 0] + 0.01, all_transformed_vectors[i, 1] + 0.01, word,
+                fontsize=8, color='darkgray')
+
+    # Plot selected words
+    for i, word in enumerate(words_to_plot):
+        x, y = transformed_vectors[i]
+        color = 'red' if word == search_word else 'blue'
+        marker = 'D' if word == search_word else 'o'  # Diamond for search word
+
+        ax.scatter(x, y, color=color, label=word, marker=marker, s=150 if word == search_word else 100, edgecolor='black', zorder=5)
+        ax.text(x + 0.01, y + 0.01, word, fontsize=10, weight='bold' if word == search_word else 'normal', color=color, zorder=6)
+
+        # Draw vector from origin to point (simulating conceptual vectors)
+        ax.plot([0, x], [0, y], color=color, linestyle='--', linewidth=1, alpha=0.7)
+
+    # Draw arrows from search word to its neighbors (optional, but good for intuition)
+    search_word_x, search_word_y = transformed_vectors[0]
+    for i in range(1, len(transformed_vectors)):
+        neighbor_x, neighbor_y = transformed_vectors[i]
+        # Calculate angle and display for top 1
+        if i == 1:  # Only for the closest neighbor
+            vec1 = transformed_vectors[0] - np.array([0, 0])  # Vector from origin to search word
+            vec2 = transformed_vectors[i] - np.array([0, 0])  # Vector from origin to neighbor
+
+            # Use original 4D vectors for actual cosine similarity calculation
+            original_vec1 = target_vector
+            original_vec2 = dummy_word_vectors[words_to_plot[i]]
+
+            sim_val = 1 - cosine(original_vec1, original_vec2)
+            angle_rad = np.arccos(np.clip(sim_val, -1.0, 1.0))  # Clip to handle potential float precision issues
+            angle_deg = np.degrees(angle_rad)
+            ax.annotate(f"{angle_deg:.1f}°", xy=((vec1[0]+vec2[0])/2, (vec1[1]+vec2[1])/2),
+                        xytext=(search_word_x + 0.05, search_word_y + 0.05),
+                        arrowprops=dict(facecolor='black', shrink=0.05, width=0.5, headwidth=5),
+                        fontsize=9, color='green', weight='bold')
+
+    ax.set_title(f"2D Projection of '{search_word}' and its Nearest Neighbors")
+    ax.set_xlabel(f"PCA Component 1 (explains {pca.explained_variance_ratio_[0]*100:.1f}%)")
+    ax.set_ylabel(f"PCA Component 2 (explains {pca.explained_variance_ratio_[1]*100:.1f}%)")
+    ax.grid(True, linestyle=':', alpha=0.6)
+    ax.axhline(0, color='gray', linewidth=0.5)
+    ax.axvline(0, color='gray', linewidth=0.5)
+    ax.set_aspect('equal', adjustable='box')
+    plt.tight_layout()
+
     # Format the DataFrame for better display in Gradio
     results_df["Cosine Similarity"] = results_df["Cosine Similarity"].round(4)
     results_df.columns = ["Neighbor Word", "Similarity Score"]  # Rename for UI clarity
 
-    message = f"Found nearest neighbors for '{search_word}'!"
-    return results_df, message
+    message = f"Found nearest neighbors for '{search_word}'! " \
+              f"Red diamond is the search word, blue circles are its closest neighbors. " \
+              f"The angle annotation shows the angle between the search word and its closest neighbor."
+
+    return fig, results_df, message
 
 # --- Gradio Interface ---
 iface = gr.Interface(
-    fn=find_nearest_neighbors,
+    fn=find_nearest_neighbors_and_plot,
     inputs=gr.Textbox(
         label="Enter a word to explore its neighbors:",
         placeholder="e.g., cat, king, fish"
     ),
     outputs=[
+        gr.Plot(label="Word Vector Visualization (PCA 2D)"),
        gr.DataFrame(
            headers=["Neighbor Word", "Similarity Score"],
            row_count=5,  # Display up to 5 rows by default
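
Two things worth noting about the plot code above: the angle annotation is computed from the original 4D vectors (via arccos of the cosine similarity), while the dashed arrows live in the 2D PCA projection, so the drawn angle only approximates the annotated one; and the np.clip guards arccos against out-of-domain values like 1.0000000002 from floating-point error. The conversion in isolation (illustrative numbers):

    import numpy as np

    sim_val = 0.9869                                     # e.g., king vs. queen from the sketch above
    angle_rad = np.arccos(np.clip(sim_val, -1.0, 1.0))   # clip keeps arccos in its valid domain
    print(f"{np.degrees(angle_rad):.1f}°")               # ~9.3°: small angle = high similarity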
@@ -74,11 +163,13 @@ iface = gr.Interface(
             label="Status"
         )
     ],
-    title="🚀 Word Vector Explorer (Gradio POC)",
+    title="🚀 Word Vector Explorer: Visualize & Understand Cosine Similarity!",
     description=(
-        "Discover the semantic neighbors of words using word embeddings! "
-        "Type a word, and see its closest companions in the vector space."
-        "<br>_Note: This POC uses dummy word vectors. In a full version, this would connect to a large pre-trained Word2Vec model!_"
+        "Type a word to see its nearest semantic neighbors in the vector space, along with a 2D visualization! "
+        "The angle between vectors on the plot is a visual representation of **Cosine Similarity** "
+        "(smaller angle = higher similarity). "
+        "<br>_Note: This POC uses dummy 4D word vectors projected to 2D using PCA. "
+        "In a full version, this would connect to a large pre-trained Word2Vec model!_"
     ),
     allow_flagging="never",  # Optional: disables the "Flag" button
     examples=[