### Example code
import os
import sys

import numpy as np
import pandas as pd
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone_text.sparse import BM25Encoder

sys.path.append('src/python')
import DataLoader
####### VERY MINIMAL NONSENSE DATA
data = {
    'id': ['vec1', 'vec2'],
    'values': [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]],
    'metadata': [{'text': 'drama'}, {'text': 'action'}],
    'sparse_indices': [[10, 45, 16], [12, 34, 56]],
    'sparse_values': [[0.5, 0.5, 0.2], [0.3, 0.4, 0.1]]
}
df = pd.DataFrame(data)
# read the API key from the environment rather than hardcoding it
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
pc.create_index(
    name="oc-hybrid-index",
    dimension=3,
    metric="dotproduct",  # hybrid (dense + sparse) queries require dotproduct
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
index = pc.Index("oc-hybrid-index")  # note: the accessor is pc.Index, not pc.index
vecs = create_sparse_dense_dict(df)  # helper from DataLoader (see sketch below)
index.upsert(vecs, namespace="example-namespace")
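# `create_sparse_dense_dict` is imported from DataLoader and its definition is not
# shown in this file. As a minimal sketch (the name and column assumptions below are
# illustrative, not DataLoader's actual code), it likely reshapes DataFrame rows into
# the dict format Pinecone's upsert expects:
def _create_sparse_dense_dict_sketch(df):
    vecs = []
    for _, row in df.iterrows():
        vecs.append({
            'id': row['id'],
            'values': row['values'],
            'sparse_values': {
                'indices': row['sparse_indices'] if 'sparse_indices' in row else row['indices'],
                'values': row['sparse_values']
            },
            'metadata': row['metadata']
        })
    return vecs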
######################## Indicator Test Data
# reuse the client created above; recreate the index at the real embedding dimension
pc.delete_index("oc-hybrid-index")
pc.create_index(
    name="oc-hybrid-index",
    dimension=1024,  # multilingual-e5-large produces 1024-dimensional vectors
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
index = pc.Index("oc-hybrid-index")
## Upsert Indicator Test Data
df = pd.read_csv('data/Indicator_Test.csv')
df = df.head(3)  # keep only the top three rows for a quick test
bm25, newdf = create_sparse_embeds(df)  # helper from DataLoader (see sketch below)
# use the text and label columns as metadata
metadata = df[['text', 'label']].to_dict(orient='records')
newdf['metadata'] = metadata
vecs = create_sparse_dense_dict(newdf)
index.upsert(vecs, namespace="example-namespace")
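# `create_sparse_embeds` also lives in DataLoader. A plausible sketch, assuming it
# fits a BM25 encoder on the 'text' column and returns the fitted encoder plus a
# DataFrame of ids and sparse vectors (the real helper presumably also attaches the
# dense 'values' column; that step is omitted here):
def _create_sparse_embeds_sketch(df):
    bm25 = BM25Encoder()
    bm25.fit(df['text'].tolist())
    sparse = bm25.encode_documents(df['text'].tolist())
    newdf = pd.DataFrame({
        'id': df.index.astype(str),
        'indices': [s['indices'] for s in sparse],
        'sparse_values': [s['values'] for s in sparse]
    })
    return bm25, newdf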
## Query the hybrid index
querytext = "immigrants are invading the border"
queryembed = query_embed(pc, "multilingual-e5-large", querytext)  # helper from DataLoader (see sketch below)
query_sparse_vector = bm25.encode_queries(querytext)  # use encode_queries, not encode_documents, at query time
query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=queryembed,
    sparse_vector=query_sparse_vector,
    include_metadata=True
)
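# `query_embed` is assumed to wrap Pinecone's hosted inference endpoint. A sketch
# using the documented pc.inference.embed call (the wrapper shape is an assumption):
def _query_embed_sketch(pc, model, text):
    result = pc.inference.embed(
        model=model,
        inputs=[text],
        parameters={"input_type": "query", "truncate": "END"}
    )
    return result[0]['values']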
## Now create embeddings
# (no need to re-import or re-instantiate the client; `pc` above is reused)
model = "multilingual-e5-large"
DataLoader.chunk_and_embed(pc, model, df)  # adds an 'Embeddings' column (see sketch below)
#df['Embeddings'] = [np.random.random(4) for x in range(len(df))]
# rename Embeddings to values, the field name Pinecone expects for dense vectors
df.rename(columns={'Embeddings': 'values'}, inplace=True)
#df['id'] = [sqids.encode([i, i+1, i+2]) for i in range(len(df))]
# create a metadata column from every column except id, values, indices, and sparse_values
df['metadata'] = df.drop(columns=['id', 'values', 'indices', 'sparse_values']).to_dict(orient='records')
# keep only the columns needed for upserting
df = df[['id', 'values', 'metadata', 'indices', 'sparse_values']]
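# `DataLoader.chunk_and_embed` is likewise not shown. A rough sketch of the assumed
# behavior: embed each row's text with the hosted model and store the vectors in an
# 'Embeddings' column (chunking of long texts is omitted in this sketch):
def _chunk_and_embed_sketch(pc, model, df):
    embeddings = pc.inference.embed(
        model=model,
        inputs=df['text'].tolist(),
        parameters={"input_type": "passage", "truncate": "END"}
    )
    df['Embeddings'] = [e['values'] for e in embeddings]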
vecs = create_sparse_dense_dict(df)
pc.create_index(
    name="oc-hybrid-indexv2",
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
index = pc.Index("oc-hybrid-indexv2")
index.upsert(vecs, namespace="example-namespace")
## QUERY
# note: the dense query vector must match the index dimension (1024 here)
query_response = index.query(
    top_k=10,
    vector=np.random.random(1024).tolist(),  # placeholder dense vector
    sparse_vector={
        'indices': [10, 45, 16],
        'values': [0.5, 0.5, 0.2]
    }
)
################
query = "test border patrol"
query_sparse_vector = encode_query(bm25, query)  # helper from DataLoader (see sketch below)
query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=np.random.random(1024).tolist(),
    sparse_vector=query_sparse_vector
)
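# `encode_query` is presumably a thin wrapper over BM25Encoder.encode_queries
# (the helper name is assumed; the method itself is pinecone_text's documented
# query-side call):
def _encode_query_sketch(bm25, query):
    return bm25.encode_queries(query)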
################
query = "ireland"
query_sparse_vector = encode_query(bm25, query)
query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=np.random.random(1024).tolist(),
    sparse_vector=query_sparse_vector
)
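# Optional: weight the dense and sparse contributions before querying. This follows
# Pinecone's documented convex-combination pattern for hybrid search; `alpha` is an
# illustrative parameter (1.0 = pure dense, 0.0 = pure sparse):
def hybrid_score_norm(dense, sparse, alpha=0.75):
    hs = {
        'indices': sparse['indices'],
        'values': [v * (1 - alpha) for v in sparse['values']]
    }
    return [v * alpha for v in dense], hs

# example: down-weight the sparse signal before issuing the query
# hdense, hsparse = hybrid_score_norm(queryembed, query_sparse_vector, alpha=0.8)
# index.query(top_k=1, vector=hdense, sparse_vector=hsparse, namespace="example-namespace")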
################## Function to create sparse and dense vectors
from tqdm.auto import tqdm

# Remove columns you don't want to encode
df = pd.read_csv('data/Indicator_Test.csv')
metadata = df.astype(str)  # convert all columns to string so fields can be joined
batch_size = 200
cols_to_remove = ['channelID', 'MessageID', 'AccountID', 'label', 'contexts', 'topics', 'weak topics', 'indicators']
# assumes `bm25` from create_sparse_embeds above is still in scope
for i in tqdm(range(0, len(df), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(df))
    # extract metadata batch
    meta_batch = metadata.iloc[i:i_end]
    meta_dict = meta_batch.to_dict(orient="records")
    # concatenate all metadata fields except the excluded columns into a single string per row
    meta_batch = [" ".join(x) for x in meta_batch.loc[:, ~meta_batch.columns.isin(cols_to_remove)].values.tolist()]
    # create sparse BM25 vectors
    sparse_embeds = bm25.encode_documents(meta_batch)
    # create dense vectors; assumes `model` is an encoder object exposing .encode()
    # (e.g. a SentenceTransformer), not the model-name string used earlier
    dense_embeds = model.encode(meta_batch).tolist()
    # create unique IDs
    ids = [str(x) for x in range(i, i_end)]
    upserts = []
    # build upsert dictionaries for the hybrid index
    for _id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, meta_dict):
        upserts.append({
            'id': _id,
            'sparse_values': sparse,
            'values': dense,
            'metadata': meta
        })
    # upload the batch to the hybrid index
    index.upsert(upserts)
# Create an upsert function for hybrid vectors
def upsert_hybrid_vectors(index, df, model, bm25, batch_size=200, cols_to_remove=['id', 'year']):
    metadata = df.astype(str)  # stringify columns so they can be joined and stored as metadata
    for i in tqdm(range(0, len(df), batch_size)):
        i_end = min(i + batch_size, len(df))
        meta_batch = metadata.iloc[i:i_end]
        meta_dict = meta_batch.to_dict(orient="records")
        # join every non-excluded column into one text string per row
        text_batch = [" ".join(x) for x in meta_batch.loc[:, ~meta_batch.columns.isin(cols_to_remove)].values.tolist()]
        sparse_embeds = bm25.encode_documents(text_batch)
        dense_embeds = model.encode(text_batch).tolist()
        ids = [str(x) for x in range(i, i_end)]
        upserts = []
        for _id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, meta_dict):
            upserts.append({
                'id': _id,
                'sparse_values': sparse,
                'values': dense,
                'metadata': meta
            })
        index.upsert(upserts)

# show index description after uploading the documents
index.describe_index_stats()
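# Example usage of the helper above (assumes `model` exposes .encode(), e.g. a
# SentenceTransformer, and `bm25` has already been fit on the corpus):
# upsert_hybrid_vectors(index, df, model, bm25, batch_size=100)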