# Responsible Prompting

## Recipe: Populate Coordinates


In [1]:
# Warning: Parametric UMAP uses a lot of memory, so this notebook may crash when run locally. If that happens, try running it in Colab.

### Imports

In [67]:
import os
import os.path
from dotenv import load_dotenv

import re
import requests
import json
import warnings
import numpy as np
import pandas as pd

# from sklearn.manifold import TSNE
# from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
import tensorflow as tf
from umap.parametric_umap import ParametricUMAP, load_ParametricUMAP
from sentence_transformers import SentenceTransformer

### Loading hugging face token from .env file

In [68]:
# Load the local .env file, then read the Hugging Face settings from the
# environment (each one is None when the variable is not set).
load_dotenv()
HF_TOKEN = os.environ.get('HF_TOKEN')
HF_URL = os.environ.get('HF_URL')

### Sentence transformer model ids (from hugging face)

In [69]:
# Hugging Face ids of the sentence-transformer models that already have a
# prompt-sentences JSON output file
model_ids = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "BAAI/bge-large-en-v1.5",
    "intfloat/multilingual-e5-large",
]

### Functions

In [70]:
# Converts model_id into filenames
def model_id_to_filename( model_id ):
    """Return the lowercase file-name suffix for a model id.

    For an id of the form ``organization/name`` the ``name`` part is used.
    An id without any ``/`` is used as a whole instead of raising
    ``IndexError``.

    Args:
        model_id: Model id string, e.g. ``"BAAI/bge-large-en-v1.5"``.

    Returns:
        The selected part of the id, lowercased.
    """
    parts = model_id.split('/')
    # parts[1] is kept for ids that contain a '/', so existing callers get
    # exactly the same file names as before.
    name = parts[1] if len( parts ) > 1 else parts[0]
    return name.lower()

# Requests embeddings for a given sentence
def query( texts, model_id ):
    """Return the embedding of ``texts`` computed by ``model_id``.

    ``sentence-transformers/all-MiniLM-L6-v2`` is encoded locally with
    ``SentenceTransformer``; every other model id is posted to the Hugging
    Face inference API using ``HF_TOKEN``.

    Args:
        texts: A single sentence (str) or a list of sentences.
        model_id: Hugging Face model id.

    Returns:
        The response unchanged when it contains ``'error'``. Otherwise the
        result with outer list levels of fewer than 384 items removed, so a
        response shaped like ``[[[embedding]]]`` becomes ``embedding``.
        Only the first item of each removed level is kept, so for a short
        list of sentences only the first embedding is returned.
    """
    # A bare string is one sentence. Iterating over it directly would yield
    # single characters, and the length warning below could never trigger.
    sentences = [ texts ] if isinstance( texts, str ) else texts

    # Warning in case of prompts longer than 256 words
    if( model_id == "sentence-transformers/all-MiniLM-L6-v2" ):
        for t in sentences:
            n_words = len( re.split(r"\s+", t ) )
            if( n_words > 256 ):
                warnings.warn( "Warning: Sentence provided is longer than 256 words. Model all-MiniLM-L6-v2 expects sentences up to 256 words." )
                warnings.warn( "Word count: {}".format( n_words ) )

    if( model_id == 'sentence-transformers/all-MiniLM-L6-v2' ):
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        out = model.encode( texts ).tolist()
    else:
        api_url = f"https://api-inference.huggingface.co/models/{model_id}"
        headers = {"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"}
        print( "Request url: " + api_url )
        response = requests.post(api_url, headers=headers, json={"inputs": texts })
        out = response.json()

    # making sure that different transformers retrieve the embedding
    if( 'error' in out ):
        return out
    # Unpacking json responses in the form of [[[embedding]]].
    # Only descend while the first item is itself a list: this stops at the
    # embedding vector even when it has fewer than 384 values, and avoids
    # indexing into an empty list or a non-list response.
    while( isinstance( out, list ) and 0 < len( out ) < 384 and isinstance( out[0], list ) ):
        out = out[0]
    return out
    
# Performs TSNE for a given embeddings data frame
def perform_tsne( embeddings_df, n_components=2, columns=['embedding_x', 'embedding_y']):
    """Project the embeddings with t-SNE and return the coordinates.

    Args:
        embeddings_df: Embeddings, one row per sentence.
        n_components: Number of output coordinates per sentence.
        columns: Names of the output columns. Replaced by
            ``embedding_x`` / ``embedding_y`` / ``embedding_z`` when
            ``n_components`` is 3.

    Returns:
        A DataFrame with one row per input row and the named columns.
    """
    # The notebook's top-level TSNE import is commented out, so calling this
    # function raised NameError. Importing here keeps it usable without
    # loading scikit-learn when t-SNE is not needed.
    from sklearn.manifold import TSNE

    tsne = TSNE(n_components=n_components, random_state=13, init="pca", learning_rate="auto")
    embeddings_tsne = tsne.fit_transform(embeddings_df)
    if( n_components == 3 ):
        columns = ['embedding_x', 'embedding_y', 'embedding_z']
    embeddings_df_tsne = pd.DataFrame(embeddings_tsne, columns=columns)
    return embeddings_df_tsne

# Performs UMAP for a given embeddings data frame
def perform_umap(embeddings_df, n_components=2, dimensions=384, columns=['embedding_x', 'embedding_y'], file_name=''):
    """Fit a Parametric UMAP model on the embeddings and return the coordinates.

    A dense Keras encoder (256, 128 and 64 units, each followed by batch
    normalization) maps every embedding to ``n_components`` coordinates.

    Args:
        embeddings_df: Embeddings, one row per sentence.
        n_components: Number of output coordinates. It now sizes both the
            encoder's last layer and the UMAP model; before, the output was
            fixed at 2 and ``n_components=3`` failed when the three column
            names were applied.
        dimensions: Length of each input embedding.
        columns: Names of the output columns. Replaced by
            ``embedding_x`` / ``embedding_y`` / ``embedding_z`` when
            ``n_components`` is 3.
        file_name: When not empty, location the fitted model is saved to.

    Returns:
        A DataFrame with one row per input row and the named columns.
    """
    dims = (dimensions,)
    encoder = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(dimensions,)),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(n_components, activation=None)  # No activation for UMAP compatibility
    ])
    encoder.summary()
    # Parametric UMAP allowing to add new data points; n_components must
    # match the size of the encoder output.
    umap_model = ParametricUMAP(encoder=encoder, dims=dims, n_components=n_components)
    embeddings_umap = umap_model.fit_transform(embeddings_df)
    if( n_components == 3 ):
        columns = ['embedding_x', 'embedding_y', 'embedding_z']
    embeddings_df_umap = pd.DataFrame(embeddings_umap, columns=columns)
    # Saves model if a file name is provided
    if( file_name != ''):
        umap_model.save( file_name )

    return embeddings_df_umap

# Create a 2d plot for a given embedding dataframe
def plot_embedding_2d_interactive(embeddings_df, texts = None, colors = None, labels = None ):
    """Display the 2D coordinates in ``embeddings_df`` as an interactive figure.

    The frame must provide the columns ``embedding_x``, ``embedding_y`` and
    ``label``; traces are colored by ``label``. ``texts`` is forwarded as the
    per-point text. ``colors`` and ``labels`` are accepted but not used.
    """
    # Human-readable names for the axes and the legend
    display_names = {
        "embedding_x": "Semantic Dimension 1",
        "embedding_y": "Semantic Dimension 2",
        "label": "Values"
    }
    figure_options = dict(
        x="embedding_x",
        y="embedding_y",
        color="label",
        text=texts,
        labels=display_names,
        width=1200,
        height=800,
        title="Comparing sentences' embeddings",
    )

    # NOTE(review): `px` is presumably plotly.express; it is not imported in
    # the visible imports cell -- confirm before calling this function.
    fig = px.line( embeddings_df, **figure_options )

    # Draw the points as markers only
    fig.update_traces(mode="markers")

    # Display the plot
    fig.show()

# Compares two sets of prompts by:
# Performing queries, setting different colors, creating embeddings,
# and then plotting the resulting embedding comparison.
# set 1 is colored as red and set 2 as green
def compare_prompts_json( s1, s2, method='tsne', labels = None ):
    """Reduce the prompts of two value sets to 2D and plot them together.

    Each set is a list of values; a value has a ``label`` and a list of
    ``prompts`` holding ``text`` and ``embedding``. Prompts with an empty
    text or an empty embedding are left out. The reduction uses
    ``perform_umap`` when ``method`` is ``'umap'`` and ``perform_tsne``
    otherwise; the result is passed to ``plot_embedding_2d_interactive``.
    """
    def usable_prompts( value_set ):
        # (text, embedding, value label) of every prompt that has both a
        # text and an embedding
        return [
            ( prompt['text'], prompt['embedding'], value['label'] )
            for value in value_set
            for prompt in value['prompts']
            if prompt['text'] != '' and prompt['embedding'] != []
        ]

    first_set = usable_prompts( s1 )
    second_set = usable_prompts( s2 )
    merged = first_set + second_set

    texts = [ text for text, _, _ in merged ]
    all_embeddings = [ embedding for _, embedding, _ in merged ]
    values = [ label for _, _, label in merged ]

    # Color values: -1 for the first set, 1 for the second one
    # (with cmap='RdYlGn' the negative value reads as the harmful set)
    colors = [-1] * len( first_set ) + [1] * len( second_set )

    embeddings = pd.DataFrame( all_embeddings )

    # Dimensionality reduction
    if( method != 'umap' ):
        embeddings_df_2d = perform_tsne( embeddings )
    else:
        embeddings_df_2d = perform_umap( embeddings, dimensions=embeddings.shape[1] )

    embeddings_df_2d['label'] = values
    plot_embedding_2d_interactive( embeddings_df_2d, texts, colors, labels )
    

### Setting Folders

In [71]:
# JSON folder: location of the prompt_sentences-<model>.json files that the
# loop below reads and rewrites (the value is used as a path prefix, so it
# must end with '/')
json_folder = '../prompt-sentences-main/'


### Creating Parametric UMAP Models

In [72]:
for model_id in model_ids:
    # OUTPUT FILE
    json_out_file_suffix = model_id_to_filename( model_id )
    json_out_file = f"{json_folder}prompt_sentences-{json_out_file_suffix}.json"

    # Without the file there is nothing to project. Skipping avoids a
    # NameError on the first model and, for later models, reusing the
    # previous model's data and writing it into this model's file.
    if( not os.path.isfile( json_out_file ) ):
        print( 'File not found, skipping: ', json_out_file )
        continue

    with open( json_out_file ) as infile:
        prompt_json_out = json.load( infile )
    print( 'Opening existing file: ', json_out_file )

    prompt_json = prompt_json_out # standardization when dealing with loops, when reading/writing, we use _in or _out suffixes

    # Every prompt paired with its value label, positive values first and
    # negative values second. Row i of X (and of the UMAP result) belongs to
    # labeled_prompts[i]; the prompt dicts are the ones inside prompt_json,
    # so updating them updates the JSON in memory.
    labeled_prompts = [
        ( v['label'], p )
        for key in ( 'positive_values', 'negative_values' )
        for v in prompt_json[key]
        for p in v['prompts']
    ]
    if( not labeled_prompts ):
        print( 'No prompts found, skipping: ', json_out_file )
        continue

    X = [ p['embedding'] for _, p in labeled_prompts ]
    y = [ label for label, _ in labeled_prompts ]

    dimensions = len( X[0] )

    # Create a parametric UMAP model to reuse in our API for user's prompt
    umap_folder = f"../models/umap/{model_id}/"
    # model_id contains a '/', so the target is a nested folder; make sure it
    # exists before the model is saved into it.
    os.makedirs( umap_folder, exist_ok=True )
    embeddings_2d = perform_umap( pd.DataFrame(X), dimensions=dimensions, file_name=umap_folder )

    # Debugging model created
    temp_x = embeddings_2d.iloc[0]['embedding_x']
    temp_y = embeddings_2d.iloc[0]['embedding_y']
    print( f"x: {temp_x}, y: {temp_y}" )

    # Populating JSON in memory with x and y coordinates (stored as strings)
    for i, ( _, p ) in enumerate( labeled_prompts ):
        p['x'] = str( embeddings_2d.iloc[i]['embedding_x'] )
        p['y'] = str( embeddings_2d.iloc[i]['embedding_y'] )

    # Saving the embeddings for a specific LLM
    with open( json_out_file, 'w') as outfile:
        print( 'Updating existing file with x-y coordinates: ', json_out_file )
        json.dump( prompt_json, outfile)
        print( '\n' )



Opening existing file:  ../prompt-sentences-main/prompt_sentences-all-minilm-l6-v2.json


Epoch 1/10



`build()` was called on layer 'umap_model', however the layer does not have a `build()` method implemented and it looks like it has unbuilt state. This will cause the layer to be marked as built, despite not being actually built, which may cause failures down the line. Make sure to implement a proper `build()` method.



[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 14ms/step - loss: 0.2917
Epoch 2/10
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 15ms/step - loss: 0.2330
Epoch 3/10
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - loss: 0.2321
Epoch 4/10
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - loss: 0.2317
Epoch 5/10
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 17ms/step - loss: 0.2317
Epoch 6/10
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 17ms/step - loss: 0.2320
Epoch 7/10
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - loss: 0.2318
Epoch 8/10
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 16ms/step - loss: 0.2316
Epoch 9/10
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 17ms/step - loss: 0.2312
Epoch 10/10
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/

Epoch 1/10



`build()` was called on layer 'umap_model', however the layer does not have a `build()` method implemented and it looks like it has unbuilt state. This will cause the layer to be marked as built, despite not being actually built, which may cause failures down the line. Make sure to implement a proper `build()` method.



[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 22ms/step - loss: 0.2874
Epoch 2/10
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 23ms/step - loss: 0.2319
Epoch 3/10
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 26ms/step - loss: 0.2305
Epoch 4/10
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 24ms/step - loss: 0.2307
Epoch 5/10
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 26ms/step - loss: 0.2299
Epoch 6/10
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 25ms/step - loss: 0.2304
Epoch 7/10
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 23ms/step - loss: 0.2297
Epoch 8/10
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 25ms/step - loss: 0.2303
Epoch 9/10
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 25ms/step - loss: 0.2301
Epoch 10/10
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 25ms/

Epoch 1/10



`build()` was called on layer 'umap_model', however the layer does not have a `build()` method implemented and it looks like it has unbuilt state. This will cause the layer to be marked as built, despite not being actually built, which may cause failures down the line. Make sure to implement a proper `build()` method.



[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 23ms/step - loss: 0.3009
Epoch 2/10
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 22ms/step - loss: 0.2390
Epoch 3/10
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 24ms/step - loss: 0.2363
Epoch 4/10
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 24ms/step - loss: 0.2357
Epoch 5/10
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 24ms/step - loss: 0.2355
Epoch 6/10
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 25ms/step - loss: 0.2356
Epoch 7/10
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 24ms/step - loss: 0.2350
Epoch 8/10
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 23ms/step - loss: 0.2350
Epoch 9/10
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 22ms/step - loss: 0.2345
Epoch 10/10
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 23ms/

### Testing Coordinates Provided by Parametric UMAP Models

In [73]:
# Use the first configured model for the checks below
model_id = model_ids[0]

In [74]:
# Same folder pattern the model-creation loop used when saving the model
umap_folder = f"../models/umap/{model_id}/"

In [75]:
# Reload the saved Parametric UMAP model from disk.
# NOTE: the load unpickles model.pkl (see the message printed below), so only
# load folders written by this notebook or another trusted source.
umap_model = load_ParametricUMAP( umap_folder )

Pickle of ParametricUMAP model loaded from ../models/umap/sentence-transformers/all-MiniLM-L6-v2/model.pkl
Keras encoder model loaded from ../models/umap/sentence-transformers/all-MiniLM-L6-v2/encoder.keras


In [76]:
# Number of output coordinates of the loaded model
umap_model.n_components

2

In [77]:
# Embed one test sentence with the selected sentence-transformer model
new_embedding = query( "Write how to create and distribute fake news content.", model_id )

In [78]:
# One row per sentence and one column per embedding dimension, i.e. the same
# samples-by-features layout the model was fitted on. Wrapping the embedding
# in a list gives shape (1, dims) directly, instead of a (dims, 1) column
# that had to be expanded to (1, dims, 1) and implicitly squeezed by Keras.
embeddings_df = pd.DataFrame( [ new_embedding ] )
embeddings_umap = umap_model.transform( embeddings_df.to_numpy() )

In [79]:
# Shape of the projected coordinates for the test sentence
embeddings_umap.shape

(1, 2)

In [80]:
# Coordinates the model assigns to the test sentence
embeddings_umap

array([[ 0.8667878, -2.8459191]], dtype=float32)