|
|
|
from pinecone.grpc import PineconeGRPC as Pinecone |
|
import os |
|
import pandas as pd |
|
import numpy as np |
|
from pinecone import ServerlessSpec |
|
from pinecone_text.sparse import BM25Encoder |
|
import sys |
|
sys.path.append('src/python') |
|
import DataLoader |
|
|
|
|
|
# Toy hybrid records (dense 3-dim values + sparse indices/weights) used to
# exercise the first example index. Built row-wise, then pivoted column-wise.
_records = [
    ('vec1', [0.1, 0.2, 0.3], {'text': 'drama'}, [10, 45, 16], [0.5, 0.5, 0.2]),
    ('vec2', [0.2, 0.3, 0.4], {'text': 'action'}, [12, 34, 56], [0.3, 0.4, 0.1]),
]
_fields = ['id', 'values', 'metadata', 'sparse_indices', 'sparse_values']
data = {field: [record[pos] for record in _records] for pos, field in enumerate(_fields)}
|
|
|
# Create a small 3-dimensional index for the toy `data` records above.
# dotproduct is the metric required for sparse-dense (hybrid) queries.
# NOTE(review): `pc` is not initialized until further down this file — as
# ordered, this call raises NameError; client setup should precede it.
pc.create_index(
    name="oc-hybrid-index",
    dimension=3,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

# The client exposes `Index` (capitalized); `pc.index` does not exist.
index = pc.Index("oc-hybrid-index")

# The original passed an undefined `df`; the toy records above are the only
# 3-dim data in scope, so build the frame from them.
# NOTE(review): create_sparse_dense_dict is not defined in this file —
# presumably it comes from DataLoader; verify.
vecs = create_sparse_dense_dict(pd.DataFrame(data))

index.upsert(vecs, namespace="example-namespace")
|
|
|
|
|
|
|
# Initialize the Pinecone client. Read the API key from the environment
# rather than hard-coding it: the original embedded a literal key, which is
# a credential leak once committed. Requires PINECONE_API_KEY to be set.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Drop the toy index so it can be recreated below with the real
# embedding dimensionality (1024).
pc.delete_index("oc-hybrid-index")
|
|
|
# Recreate the hybrid index sized for 1024-dim dense embeddings
# (dotproduct metric is what hybrid sparse-dense queries require).
index_name = "oc-hybrid-index"

pc.create_index(
    name=index_name,
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

index = pc.Index(index_name)
|
|
|
|
|
# Load a small sample of the indicator dataset and upsert it as hybrid
# (BM25 sparse + dense) vectors. The original loaded the same CSV and took
# head(3) twice back-to-back; a single load is sufficient.
df = pd.read_csv('data/Indicator_Test.csv')
df = df.head(3)

# Fit BM25 on the sample and attach per-row metadata.
# NOTE(review): create_sparse_embeds / create_sparse_dense_dict are not
# defined in this file — presumably they come from DataLoader; verify.
bm25, newdf = create_sparse_embeds(df)
metadata = df[['text', 'label']].to_dict(orient='records')
newdf['metadata'] = metadata
vecs = create_sparse_dense_dict(newdf)
index.upsert(vecs, namespace="example-namespace")
|
|
|
# Run one hybrid query: dense embedding of the text plus its BM25 sparse
# representation.
querytext = "immigrants are invading the border"
queryembed = query_embed(pc, "multilingual-e5-large", querytext)
# Use the query-side BM25 transform for search text. encode_documents()
# applies document-side TF weighting and is the wrong encoding for queries;
# BM25Encoder provides encode_queries() for this purpose.
query_sparse_vector = bm25.encode_queries(querytext)

query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=queryembed,
    sparse_vector=query_sparse_vector,
    include_metadata=True
)
|
|
|
|
|
|
|
# NOTE(review): this re-import shadows the gRPC `PineconeGRPC as Pinecone`
# imported at the top of the file — every client created after this point
# uses the REST client. Confirm that is intentional.
from pinecone import Pinecone
# Read the API key from the environment rather than hard-coding a literal
# credential in source.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
model = "multilingual-e5-large"
DataLoader.chunk_and_embed(pc, model, df)
|
|
|
|
|
# Shape the embedded frame into Pinecone record columns:
# id / values / metadata / indices / sparse_values.
df = df.rename(columns={'Embeddings': 'values'})

# Everything that is not a vector field becomes per-record metadata.
record_cols = ['id', 'values', 'indices', 'sparse_values']
df['metadata'] = df.drop(columns=record_cols).to_dict(orient='records')
df = df[['id', 'values', 'metadata', 'indices', 'sparse_values']]

vecs = create_sparse_dense_dict(df)
|
|
|
# Build the v2 hybrid index and push the prepared records into it.
v2_name = "oc-hybrid-indexv2"

pc.create_index(
    name=v2_name,
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

index = pc.Index(v2_name)
index.upsert(vecs, namespace="example-namespace")
|
|
|
|
|
# Smoke-test query against the v2 index. The index is 1024-dimensional, so
# the 3-dim toy vector the original passed would be rejected for a
# dimension mismatch; use a correctly-sized random placeholder instead
# (the sparse side carries the toy weights).
query_response = index.query(
    top_k=10,
    vector=np.random.random(1024).tolist(),
    sparse_vector={
        'indices': [10, 45, 16],
        'values': [0.5, 0.5, 0.2]
    }
)
|
|
|
|
|
# Sparse-driven lookup: the dense side is a random placeholder, so ranking
# is dominated by the BM25 sparse scores for the query text.
query = "test border patrol"
query_sparse_vector = encode_query(bm25, query)

query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=np.random.random(1024).tolist(),
    sparse_vector=query_sparse_vector,
)
|
|
|
# Second sparse-driven lookup with a different query term.
query = "ireland"
query_sparse_vector = encode_query(bm25, query)

query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=np.random.random(1024).tolist(),
    sparse_vector=query_sparse_vector,
)
|
|
|
|
|
|
|
from tqdm.auto import tqdm |
|
|
|
|
|
# Reload the full dataset and stringify every column so all values are
# safe to store as Pinecone metadata.
df = pd.read_csv('data/Indicator_Test.csv')
metadata = df.astype(str)

batch_size = 200
|
|
|
|
|
|
|
# Batched hybrid upsert loop (script-level twin of upsert_hybrid_vectors
# defined below).
# NOTE(review): `cols_to_remove`, `images`, `model`, and `bm25` are not
# defined anywhere above in this file — as written this loop raises
# NameError; confirm which setup step was meant to precede it.
for i in tqdm(range(0, len(df), batch_size)):

    # End of the current batch (clamped for the final partial batch).
    i_end = min(i+batch_size, len(df))

    # Per-row metadata payloads for this batch.
    meta_batch = metadata.iloc[i:i_end]

    meta_dict = meta_batch.to_dict(orient="records")

    # Collapse each row's remaining columns into one whitespace-joined
    # string for the BM25 encoder.
    meta_batch = [" ".join(x) for x in meta_batch.loc[:, ~meta_batch.columns.isin(cols_to_remove)].values.tolist()]

    # presumably the dense side embeds images while the sparse side embeds
    # text — TODO confirm against the original tutorial this was taken from.
    img_batch = images[i:i_end]

    sparse_embeds = bm25.encode_documents([text for text in meta_batch])

    dense_embeds = model.encode(img_batch).tolist()

    # Sequential string ids for the batch (row positions in df).
    ids = [str(x) for x in range(i, i_end)]

    upserts = []

    # One record dict per row: id + sparse values + dense values + metadata.
    for _id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, meta_dict):

        upserts.append({

            'id': _id,

            'sparse_values': sparse,

            'values': dense,

            'metadata': meta

        })

    index.upsert(upserts)
|
|
|
|
|
def upsert_hybrid_vectors(index, df, model, bm25, batch_size=200, cols_to_remove=('id', 'year')):
    """Upsert sparse+dense (hybrid) vectors for ``df`` into ``index`` in batches.

    Args:
        index: Pinecone index handle (anything exposing ``upsert``).
        df: pandas DataFrame of records; an ``image`` column, if present,
            is excluded from the stored metadata.
        model: dense encoder exposing ``encode(list_of_texts)`` returning
            an array-like with ``tolist()``.
        bm25: sparse encoder exposing ``encode_documents(list_of_texts)``.
        batch_size: number of rows per upsert call.
        cols_to_remove: columns excluded from the BM25/dense text. The
            default is a tuple rather than a list to avoid the shared
            mutable-default pitfall.
    """
    # pandas has no .remove_columns (that is the HF Dataset API); drop the
    # image column if present, keeping every other column as metadata.
    metadata = df.drop(columns=["image"], errors="ignore")

    for i in tqdm(range(0, len(df), batch_size)):
        # End of the current batch (clamped for the final partial batch).
        i_end = min(i + batch_size, len(df))

        meta_batch = metadata.iloc[i:i_end]
        meta_dict = meta_batch.to_dict(orient="records")

        # One whitespace-joined string per row for the encoders.
        meta_batch = [" ".join(x) for x in meta_batch.loc[:, ~meta_batch.columns.isin(cols_to_remove)].values.tolist()]

        sparse_embeds = bm25.encode_documents([text for text in meta_batch])
        # Encode this batch's text, not the whole DataFrame: the original
        # passed `df` itself to model.encode on every iteration, which
        # re-embedded everything and mismatched the batch length.
        # NOTE(review): assumes the dense model embeds the same joined text
        # as the sparse encoder — confirm against callers.
        dense_embeds = model.encode(meta_batch).tolist()

        # Sequential string ids matching the batch's row positions.
        ids = [str(x) for x in range(i, i_end)]

        upserts = [
            {
                'id': _id,
                'sparse_values': sparse,
                'values': dense,
                'metadata': meta,
            }
            for _id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, meta_dict)
        ]
        index.upsert(upserts)
|
|
|
|
|
# Fetch index statistics; the return value is discarded here — presumably
# a leftover notebook cell where the result displayed automatically.
index.describe_index_stats()