# Source: stefanjwojcik — "Upload 24 files" (commit 48bb68b, verified)
# pip install pinecone[grpc]
#from pinecone import Pinecone
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
## ID generation
from sqids import Sqids
sqids = Sqids()
#######
#import protobuf_module_pb2
#pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0")
##### EMBEDDINGS AND ENCODINGS
"""
## Embed in the inference API
df = pd.read_csv('data/Indicator_Test.csv')
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0")
model = "multilingual-e5-large"
embeddings = bulk_embed(pc, model, df[1:96])
"""
def bulk_embed(pc, model, data, textcol='text'):
    """Embed every entry of ``data[textcol]`` through the Pinecone inference API.

    Args:
        pc: Pinecone client exposing ``pc.inference.embed``.
        model: name of the embedding model (e.g. "multilingual-e5-large").
        data: mapping/DataFrame whose ``textcol`` column holds the passages.
        textcol: column name containing the text to embed.

    Returns:
        The raw embeddings response from the inference API.
    """
    passages = list(data[textcol])
    return pc.inference.embed(
        model,
        inputs=passages,
        parameters={"input_type": "passage"},
    )
def join_chunked_results(embeddings):
    """Flatten per-chunk embedding responses into one list of dense vectors.

    Each chunk is expected to expose a ``.data`` sequence whose items are
    mappings with a "values" key (the Pinecone embed response shape).
    """
    return [item["values"] for chunk in embeddings for item in chunk.data]
"""
## Chunk and embed in the inference API
df = pd.read_csv('data/climate_test.csv')
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0")
model = "multilingual-e5-large"
embeddings = chunk_and_embed(pc, model, df)
## Upgrade this function to return a dataframe with the Embeddings as a new column
"""
def chunk_and_embed(pc, model, data, chunk_size=96, textcol='text'):
    """Embed ``data[textcol]`` in chunks of at most ``chunk_size`` rows.

    Mutates ``data`` in place: adds an 'Embeddings' column holding the dense
    vectors and an 'id' column of sqids-encoded string ids (one per row).
    Returns the same (mutated) frame for convenience.
    """
    responses = []
    total = len(data)
    for start in range(0, total, chunk_size):
        stop = min(start + chunk_size, total)
        responses.append(bulk_embed(pc, model, data[start:stop], textcol))
    data['Embeddings'] = join_chunked_results(responses)
    # ids are derived from the row position only, so re-running regenerates them
    data['id'] = [sqids.encode([n, n + 1, n + 2]) for n in range(total)]
    return data
"""
## Query the embeddings
query = "What is the impact of climate change on the economy?"
embeddings = query_embed(pc, model, query)
"""
def query_embed(pc, model, query):
    """Embed a single query string and return its dense vector.

    Uses the "query" input type so the model produces a query-side embedding
    (as opposed to the "passage" type used for documents).
    """
    response = pc.inference.embed(
        model,
        inputs=query,
        parameters={"input_type": "query"},
    )
    return response[0]['values']
"""
### Sparse vector encoding
- write a function to embed
from pinecone_text.sparse import BM25Encoder
corpus = ["The quick brown fox jumps over the lazy dog",
"The lazy dog is brown",
"The fox is brown"]
# Initialize BM25 and fit the corpus.
bm25 = BM25Encoder()
#bm25.fit(corpus)
#bm25 = BM25Encoder.default()
doc_sparse_vector = bm25.encode_documents("The brown fox is quick")
vector, bm25 = encode_documents(corpus)
"""
def encode_documents(corpus):
    """Fit a BM25 encoder on ``corpus`` and sparse-encode every document.

    Returns:
        (sparse_vectors, encoder): the per-document sparse vectors and the
        fitted BM25Encoder, which callers reuse to encode queries.
    """
    encoder = BM25Encoder()
    encoder.fit(corpus)
    sparse_vectors = encoder.encode_documents(corpus)
    return sparse_vectors, encoder
def encode_query(bm25, query):
    """Sparse-encode ``query`` with an already-fitted BM25 encoder."""
    return bm25.encode_queries(query)
"""
## Generate format of sparse-dense vectors
# Example usage
df = pd.read_csv('data/Indicator_Test.csv')
df = df.head(3)
bm25, newdf = create_sparse_embeds(pc, df)
newdf['metadata'] = newdf.metadata.to_list()
"""
def create_sparse_embeds(pc, df, textcol='text', idcol='id', model="multilingual-e5-large"):
    """Build hybrid (dense + sparse) vector columns for every row of ``df``.

    Mutates ``df`` in place: dense embeddings land in 'values', BM25 output in
    'sparse_values'/'indices', and every remaining column is folded into a
    per-row 'metadata' dict.

    Returns:
        (bm25, df): the fitted BM25 encoder and the reshaped frame with
        columns [idcol, 'values', 'metadata', 'indices', 'sparse_values'].
    """
    sparse_docs, bm25 = encode_documents(df[textcol].to_list())
    chunk_and_embed(pc, model, df)  # in place: adds 'Embeddings' and 'id'
    df.rename(columns={'Embeddings': 'values'}, inplace=True)
    df['sparse_values'] = [doc['values'] for doc in sparse_docs]
    df['indices'] = [doc['indices'] for doc in sparse_docs]
    # everything that is not part of the vector record becomes metadata
    meta_frame = df.drop(columns=[idcol, 'values', 'indices', 'sparse_values'])
    df['metadata'] = meta_frame.to_dict(orient='records')
    df = df[[idcol, 'values', 'metadata', 'indices', 'sparse_values']]
    return bm25, df
"""
## Generate format of sparse-dense vectors
# Example usage
data = {
'id': ['vec1', 'vec2'],
'values': [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]],
'metadata': [{'genre': 'drama', 'text': 'this'}, {'genre': 'action'}],
'sparse_indices': [[10, 45, 16], [12, 34, 56]],
'sparse_values': [[0.5, 0.5, 0.2], [0.3, 0.4, 0.1]]
}
df = pd.DataFrame(data)
sparse_dense_dicts = create_sparse_dense_dict(df)
vecs = create_sparse_dense_vectors_from_df(df)
index.upsert(vecs, namespace="example-namespace")
# Example usage
df = pd.read_csv('data/Indicator_Test.csv')
df = df.head(3)
bm25, newdf = create_sparse_embeds(pc, df)
metadata = df[['text', 'label']].to_dict(orient='records')
newdf['metadata'] = metadata
vecs = create_sparse_dense_dict(newdf)
index.upsert(vecs, namespace="example-namespace")
"""
def create_sparse_dense_dict(df, id_col='id', values_col='values', metadata_col='metadata', sparse_indices_col='indices', sparse_values_col='sparse_values'):
    """Convert DataFrame rows into Pinecone sparse-dense upsert dicts.

    Each output dict has the shape
    ``{'id', 'values', 'metadata', 'sparse_values': {'indices', 'values'}}``
    expected by ``index.upsert``. Column names are configurable via the
    ``*_col`` parameters.
    """
    return [
        {
            'id': row[id_col],
            'values': row[values_col],
            'metadata': row[metadata_col],
            'sparse_values': {
                'indices': row[sparse_indices_col],
                'values': row[sparse_values_col],
            },
        }
        for _, row in df.iterrows()
    ]
############ UPSERTING DATA
def create_index(pc, name, dimension, metric, cloud, region):
    """Create a serverless Pinecone index.

    Args:
        pc: Pinecone client.
        name: index name.
        dimension: dense vector dimension.
        metric: similarity metric (e.g. "cosine", "dotproduct").
        cloud, region: serverless deployment target.
    """
    spec = ServerlessSpec(cloud=cloud, region=region)
    pc.create_index(name=name, dimension=dimension, metric=metric, spec=spec)
#pc.delete_index("example-index")
#index = pc.Index("test-index")
"""
## Create vectors from a DataFrame to be uploaded to Pinecone
import pandas as pd
# Create a sample DataFrame
data = {
'Embeddings': [
[0.1, 0.2, 0.3, 0.4],
[0.2, 0.3, 0.4, 0.5]
],
'id': ['vec1', 'vec2'],
'genre': ['drama', 'action']
}
df = pd.DataFrame(data)
vecs = create_vectors_from_df(df)
# Upload the vectors to Pinecone
index.upsert(
vectors=vecs,
namespace="example-namespace"
)
"""
def create_vectors_from_df(df):
    """Build ``(id, dense_values, metadata)`` upsert tuples from ``df``.

    Every column other than 'id' and 'Embeddings' is folded into the
    per-row metadata dict.
    """
    return [
        (row['id'], row['Embeddings'], row.drop(['Embeddings', 'id']).to_dict())
        for _, row in df.iterrows()
    ]
def chunk_upload_vectors(index, vectors, namespace="example-namespace", chunk_size=1000):
    """Upsert ``vectors`` into ``index`` in batches of at most ``chunk_size``.

    Pinecone rejects oversized upsert payloads, so the list is streamed in
    fixed-size batches.
    """
    total = len(vectors)
    for start in range(0, total, chunk_size):
        batch = vectors[start:min(start + chunk_size, total)]
        index.upsert(vectors=batch, namespace=namespace)
"""
## Working Example 2
df = pd.read_csv('data/Indicator_Test.csv')
dfe = DataLoader.chunk_and_embed(pc, model, df)
# Keep only text, embeddings, id
dfmin = dfe[['text', 'Embeddings', 'id', 'label']]
DataLoader.chunk_df_and_upsert(index, dfmin, namespace="indicator-test-namespace", chunk_size=96)
"""
def chunk_df_and_upsert(index, df, namespace="new-namespace", chunk_size=1000):
    """Convert ``df`` rows to upsert tuples and push them to ``index`` in batches."""
    upsert_tuples = create_vectors_from_df(df)
    chunk_upload_vectors(index, upsert_tuples, namespace, chunk_size)
#### QUERYING DATA
"""
namespace = "namespace"
vector = [0.1, 0.2, 0.3, 0.4]
top_k = 3
include_values = True
"""
def query_data(index, namespace, vector, top_k=3, include_values=True):
    """Query ``index`` for the ``top_k`` nearest vectors in ``namespace``.

    Args:
        index: Pinecone index handle.
        namespace: namespace to search.
        vector: dense query vector — a numpy array or a plain list of floats.
            (The original called ``vector.tolist()`` unconditionally, which
            raised AttributeError on plain lists; the sibling
            ``query_data_with_sparse`` already accepts lists directly.)
        top_k: number of matches to return.
        include_values: whether to return stored vector values.

    Returns:
        The query response from the index.
    """
    # accept both numpy arrays and plain Python lists
    dense = vector.tolist() if hasattr(vector, 'tolist') else vector
    return index.query(
        namespace=namespace,
        vector=dense,
        top_k=top_k,
        include_values=include_values,
    )
"""
Example:
"""
def query_data_with_sparse(index, namespace, vector, sparse_vector, top_k=5, include_values=True, include_metadata=True):
    """Hybrid similarity query: dense vector plus BM25-style sparse vector.

    Args:
        index: Pinecone index handle.
        namespace: namespace to search.
        vector: dense query vector (list of floats).
        sparse_vector: dict with 'indices' and 'values'.
        top_k: number of matches to return.
        include_values / include_metadata: response verbosity flags.
    """
    query_kwargs = {
        'namespace': namespace,
        'vector': vector,
        'sparse_vector': sparse_vector,
        'top_k': top_k,
        'include_metadata': include_metadata,
        'include_values': include_values,
    }
    return index.query(**query_kwargs)
# create sparse vector with zero weighting
def empty_sparse_vector():
    """Return a minimal no-op sparse vector: one index with zero weight.

    Useful when the query API requires a sparse component but only dense
    scoring is wanted.
    """
    return {'indices': [1], 'values': [0.0]}
"""
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0")
index = pc.Index("test-index")
namespace = "test-namespace"
vector = np.random.rand(1024)
top_k = 3
include_values = True
filter={
"label": {"$lt": 2}
}
query_data_with_filter(index, namespace, vector, top_k, include_values, filter)
"""
def query_data_with_filter(index, namespace, vector, top_k=3, include_values=True, filter=None):
    """Query ``index`` with an optional metadata filter.

    Args:
        index: Pinecone index handle.
        namespace: namespace to search.
        vector: dense query vector — a numpy array or a plain list of floats.
            (The original called ``vector.tolist()`` unconditionally, which
            raised AttributeError on plain lists.)
        top_k: number of matches to return.
        include_values: whether to return stored vector values.
        filter: Pinecone metadata filter dict, e.g. {"label": {"$lt": 2}};
            None disables filtering.

    Returns:
        The query response from the index.
    """
    # accept both numpy arrays and plain Python lists
    dense = vector.tolist() if hasattr(vector, 'tolist') else vector
    return index.query(
        namespace=namespace,
        vector=dense,
        top_k=top_k,
        include_values=include_values,
        filter=filter,
    )
"""
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0")
ids = ["UkfgLgeYW9wo", "GkkzUYYOcooB"]
indexname = "ostreacultura-v1"
namespace = "cards-data"
index = pc.Index(indexname)
DL.fetch_data(index, ids, namespace)
"""
def fetch_data(index, ids, namespace):
    """Fetch the stored vectors for ``ids`` from ``namespace``."""
    return index.fetch(ids=ids, namespace=namespace)
def get_all_ids_from_namespace(index, namespace):
    """Return the id listing for ``namespace`` exactly as the client yields it.

    NOTE(review): ``index.list`` is returned as-is — presumably a pager or
    generator of id batches; callers are expected to iterate it.
    """
    return index.list(namespace=namespace)
"""
## Hybrid search weighting - Alpha is equal to the weight of the dense vector
dense = [0.1, 0.2, 0.3, 0.4]
sparse_vector={
'indices': [10, 45, 16],
'values': [0.5, 0.5, 0.2]
}
dense, sparse = hybrid_score_norm(dense, sparse_vector, alpha=1.0)
"""
def hybrid_score_norm(dense, sparse, alpha: float):
    """Weight dense and sparse vectors by a convex combination.

    The dense side is scaled by ``alpha`` and the sparse side by
    ``1 - alpha``, so ``alpha=1.0`` is pure dense search and ``alpha=0.0``
    pure sparse.

    Args:
        dense: iterable of floats (the dense vector).
        sparse: dict with 'indices' and 'values'.
        alpha: dense-side weight, between 0 and 1 inclusive.

    Returns:
        (scaled_dense, scaled_sparse) pair.

    Raises:
        ValueError: if alpha lies outside [0, 1].
    """
    if alpha < 0 or alpha > 1:
        raise ValueError("Alpha must be between 0 and 1")
    sparse_weight = 1 - alpha
    scaled_sparse = {
        'indices': sparse['indices'],
        'values': [sparse_weight * v for v in sparse['values']],
    }
    scaled_dense = [alpha * v for v in dense]
    return scaled_dense, scaled_sparse
#############