### Example code
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
import sys
sys.path.append('src/python')
import DataLoader
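# `create_sparse_dense_dict` is called repeatedly below but never defined in
# this file. A minimal sketch of what it presumably does, assuming the frame
# carries 'id', 'values', and 'metadata' columns plus sparse encodings in
# either ('sparse_indices', 'sparse_values') or ('indices', 'sparse_values'):
def create_sparse_dense_dict(df):
    idx_col = 'sparse_indices' if 'sparse_indices' in df.columns else 'indices'
    vecs = []
    for _, row in df.iterrows():
        vecs.append({
            'id': row['id'],
            'values': row['values'],
            'sparse_values': {
                'indices': row[idx_col],
                'values': row['sparse_values']
            },
            'metadata': row['metadata']
        })
    return vecs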
####### Minimal toy data for a first smoke test
data = {
'id': ['vec1', 'vec2'],
'values': [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]],
'metadata': [{'text': 'drama'}, {'text': 'action'}],
'sparse_indices': [[10, 45, 16], [12, 34, 56]],
'sparse_values': [[0.5, 0.5, 0.2], [0.3, 0.4, 0.1]]
}
# the client was used here before ever being constructed; create it first and
# read the API key from the environment instead of hard-coding it
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
pc.create_index(
name="oc-hybrid-index",
dimension=3,
metric="dotproduct",
spec=ServerlessSpec(
cloud="aws",
region="us-east-1"
)
)
index = pc.Index("oc-hybrid-index")
# `data` above is a plain dict; wrap it in a DataFrame before building vectors
df = pd.DataFrame(data)
vecs = create_sparse_dense_dict(df)
index.upsert(vecs, namespace="example-namespace")
######################## Indicator Test Data
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0")
pc.delete_index("oc-hybrid-index")
pc.create_index(
name="oc-hybrid-index",
dimension=1024,
metric="dotproduct",
spec=ServerlessSpec(
cloud="aws",
region="us-east-1"
)
)
index = pc.Index("oc-hybrid-index")
## Upsert Indicator Test Data
df = pd.read_csv('data/Indicator_Test.csv')
## keep only the top three rows (text and label) for a quick test
df = df.head(3)
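# `create_sparse_embeds` is never defined in this file. A minimal sketch of
# what it presumably does, assuming it fits a BM25 encoder on the 'text'
# column and returns the encoder plus a copy of the frame with 'id',
# 'indices', and 'sparse_values' columns added:
def create_sparse_embeds(df):
    bm25 = BM25Encoder()
    bm25.fit(df['text'].tolist())
    sparse = bm25.encode_documents(df['text'].tolist())
    newdf = df.copy()
    newdf['id'] = [str(i) for i in range(len(newdf))]
    newdf['indices'] = [s['indices'] for s in sparse]
    newdf['sparse_values'] = [s['values'] for s in sparse]
    return bm25, newdf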
bm25, newdf = create_sparse_embeds(df)
metadata = df[['text', 'label']].to_dict(orient='records')
newdf['metadata'] = metadata
# the index is 1024-dimensional, so dense 'values' are needed alongside the
# sparse encodings; assume the hosted e5 model supplies them via the inference API
embeds = pc.inference.embed(model="multilingual-e5-large",
                            inputs=df['text'].tolist(),
                            parameters={'input_type': 'passage'})
newdf['values'] = [e['values'] for e in embeds]
vecs = create_sparse_dense_dict(newdf)
index.upsert(vecs, namespace="example-namespace")
## Query the hybrid index
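# `query_embed` is also undefined here; a sketch assuming it wraps Pinecone's
# hosted inference endpoint to embed a single query string with the named model
def query_embed(pc, model, text):
    res = pc.inference.embed(
        model=model,
        inputs=[text],
        parameters={'input_type': 'query'}
    )
    return res[0]['values']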
querytext = "immigrants are invading the border"
queryembed = query_embed(pc, "multilingual-e5-large", querytext)
# queries go through the encoder's query-side method, not encode_documents
query_sparse_vector = bm25.encode_queries(querytext)
query_response = index.query(
top_k=1,
namespace="example-namespace",
vector=queryembed,
sparse_vector=query_sparse_vector,
include_metadata=True
)
## Now create embeddings for the full test frame
# reuse the gRPC client; re-importing `Pinecone` from the top-level package
# here would shadow the PineconeGRPC alias imported at the top of the file
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
model = "multilingual-e5-large"
# chunk_and_embed comes from the local DataLoader module and is assumed to add
# an 'Embeddings' column to df in place
DataLoader.chunk_and_embed(pc, model, df)
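# `DataLoader.chunk_and_embed` lives in src/python and is not shown in this
# file. A hypothetical sketch of what it appears to do, given that the script
# expects an 'Embeddings' column on df afterwards (all names here assumed):
def _chunk_and_embed_sketch(pc, model, df, text_col='text'):
    # embed each row's text with the hosted model and store the dense vectors
    embeds = pc.inference.embed(
        model=model,
        inputs=df[text_col].astype(str).tolist(),
        parameters={'input_type': 'passage'}
    )
    df['Embeddings'] = [e['values'] for e in embeds]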
# rename the embeddings column to 'values', the field name Pinecone expects
df.rename(columns={'Embeddings': 'values'}, inplace=True)
# simple positional ids; the sqids scheme from an earlier draft is unnecessary here
df['id'] = [str(i) for i in range(len(df))]
# carry over the sparse encodings computed by create_sparse_embeds above
df['indices'] = newdf['indices'].values
df['sparse_values'] = newdf['sparse_values'].values
## capture every column except id, values, indices, and sparse_values as metadata
df['metadata'] = df.drop(columns=['id', 'values', 'indices', 'sparse_values']).to_dict(orient='records')
# keep only the fields the hybrid upsert format needs
df = df[['id', 'values', 'metadata', 'indices', 'sparse_values']]
vecs = create_sparse_dense_dict(df)
pc.create_index(
name="oc-hybrid-indexv2",
dimension=1024,
metric="dotproduct",
spec=ServerlessSpec(
cloud="aws",
region="us-east-1"
)
)
index = pc.Index("oc-hybrid-indexv2")
index.upsert(vecs, namespace="example-namespace")
## QUERY
# quick sanity-check query; the dense vector must match the index's 1024 dims,
# so use a random placeholder rather than a 3-dim toy vector
query_response = index.query(
    top_k=10,
    vector=np.random.random(1024).tolist(),
    sparse_vector={
        'indices': [10, 45, 16],
        'values': [0.5, 0.5, 0.2]
    }
)
################
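# `encode_query` is undefined as well; presumably a thin wrapper around the
# BM25 encoder's query-side encoding
def encode_query(bm25, query):
    return bm25.encode_queries(query)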
query = "test border patrol"
query_sparse_vector = encode_query(bm25, query)
query_response = index.query(
top_k=1,
namespace="example-namespace",
vector=np.random.random(1024).tolist(),
sparse_vector=query_sparse_vector
)
################
query = "ireland"
query_sparse_vector = encode_query(bm25, query)
query_response = index.query(
top_k=1,
namespace="example-namespace",
vector=np.random.random(1024).tolist(),
sparse_vector=query_sparse_vector
)
################## Batched loop to create and upsert sparse + dense vectors
from tqdm.auto import tqdm
df = pd.read_csv('data/Indicator_Test.csv')
batch_size = 200
# convert all columns to string so they can be concatenated for BM25
metadata = df.astype(str)
# columns that should not be folded into the BM25 text (restored from the
# commented-out draft; this list was used below but never defined)
cols_to_remove = ['channelID', 'MessageID', 'AccountID', 'label', 'contexts',
                  'topics', 'weak topics', 'indicators']
for i in tqdm(range(0, len(df), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(df))
    # extract metadata batch
    meta_batch = metadata.iloc[i:i_end]
    meta_dict = meta_batch.to_dict(orient="records")
    # concatenate the remaining metadata fields into a single string per row
    meta_batch = [" ".join(x) for x in meta_batch.loc[:, ~meta_batch.columns.isin(cols_to_remove)].values.tolist()]
    # create sparse BM25 vectors from the concatenated text
    sparse_embeds = bm25.encode_documents(meta_batch)
    # create dense vectors from the same text; the original drew an `images`
    # batch here (left over from an image-search example), but this script has
    # no images, so a sentence-transformers-style encoder with .encode is
    # assumed (note: `model` above is a model-name string, not such an object)
    dense_embeds = model.encode(meta_batch).tolist()
    # create unique IDs
    ids = [str(x) for x in range(i, i_end)]
    upserts = []
    # build the hybrid upsert dicts for this batch
    for _id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, meta_dict):
        upserts.append({
            'id': _id,
            'sparse_values': sparse,
            'values': dense,
            'metadata': meta
        })
    # upload the batch to the hybrid index
    index.upsert(upserts)
# Package the loop above as a reusable upsert function for hybrid vectors
def upsert_hybrid_vectors(index, df, model, bm25, batch_size=200, cols_to_remove=('id', 'year')):
    # df.remove_columns is a Hugging Face datasets method, not pandas; drop the
    # column (if present) the pandas way and stringify for concatenation
    metadata = df.drop(columns=['image'], errors='ignore').astype(str)
    for i in tqdm(range(0, len(df), batch_size)):
        i_end = min(i + batch_size, len(df))
        meta_batch = metadata.iloc[i:i_end]
        meta_dict = meta_batch.to_dict(orient="records")
        meta_batch = [" ".join(x) for x in meta_batch.loc[:, ~meta_batch.columns.isin(cols_to_remove)].values.tolist()]
        sparse_embeds = bm25.encode_documents(meta_batch)
        # encode the same concatenated text densely; the original passed the
        # whole frame (`text_batch = df`) to the encoder, which would fail
        dense_embeds = model.encode(meta_batch).tolist()
        ids = [str(x) for x in range(i, i_end)]
        upserts = []
        for _id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, meta_dict):
            upserts.append({
                'id': _id,
                'sparse_values': sparse,
                'values': dense,
                'metadata': meta
            })
        index.upsert(upserts)
# show index description after uploading the documents
index.describe_index_stats()