# misinfo_detection_app/notebooks/hybrid_example.py
### Example code
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
import sys
sys.path.append('src/python')
import DataLoader
# helper functions used below are assumed to live in src/python/DataLoader.py
from DataLoader import create_sparse_dense_dict, create_sparse_embeds, query_embed, encode_query
# read the API key from the environment instead of hardcoding it
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
####### VERY MINIMAL NONSENSE DATA
data = {
    'id': ['vec1', 'vec2'],
    'values': [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]],
    'metadata': [{'text': 'drama'}, {'text': 'action'}],
    'sparse_indices': [[10, 45, 16], [12, 34, 56]],
    'sparse_values': [[0.5, 0.5, 0.2], [0.3, 0.4, 0.1]]
}
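
# `create_sparse_dense_dict` (used below) comes from DataLoader. For reference, a
# minimal sketch of what it presumably does: turn a dataframe into Pinecone hybrid
# records, where each sparse vector is a dict with 'indices' and 'values'. Column
# names here follow the toy frame above (later cells use 'indices' instead of
# 'sparse_indices'); the real helper may differ.
def _sketch_create_sparse_dense_dict(df):
    return [
        {
            'id': row['id'],
            'values': row['values'],
            'metadata': row['metadata'],
            'sparse_values': {
                'indices': row['sparse_indices'],
                'values': row['sparse_values'],
            },
        }
        for _, row in df.iterrows()
    ]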
pc.create_index(
    name="oc-hybrid-index",
    dimension=3,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
index = pc.Index("oc-hybrid-index")
df = pd.DataFrame(data)
vecs = create_sparse_dense_dict(df)
index.upsert(vecs, namespace="example-namespace")
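# optional sanity check: the namespace should now report two vectors
index.describe_index_stats()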
######################## Indicator Test Data
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
pc.delete_index("oc-hybrid-index")  # drop the toy index before recreating it at full dimension
pc.create_index(
    name="oc-hybrid-index",
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
index = pc.Index("oc-hybrid-index")
## Upsert Indicator Test Data: keep only the top three rows for a quick test
df = pd.read_csv('data/Indicator_Test.csv')
df = df.head(3)
bm25, newdf = create_sparse_embeds(df)
metadata = df[['text', 'label']].to_dict(orient='records')
newdf['metadata'] = metadata
vecs = create_sparse_dense_dict(newdf)
index.upsert(vecs, namespace="example-namespace")
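
# `create_sparse_embeds` also comes from DataLoader. A minimal sketch under the
# assumption that it fits a BM25 encoder on the text column and attaches per-row
# sparse encodings (the real helper may differ):
def _sketch_create_sparse_embeds(df):
    bm25 = BM25Encoder()
    texts = df['text'].tolist()
    bm25.fit(texts)                          # learn corpus term statistics
    encoded = bm25.encode_documents(texts)   # list of {'indices': [...], 'values': [...]}
    out = df.copy()
    out['indices'] = [e['indices'] for e in encoded]
    out['sparse_values'] = [e['values'] for e in encoded]
    return bm25, out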
## Query the hybrid index
querytext = "immigrants are invading the border"
queryembed = query_embed(pc, "multilingual-e5-large", querytext)
query_sparse_vector = bm25.encode_queries(querytext)  # queries use the query-side BM25 encoding
query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=queryembed,
    sparse_vector=query_sparse_vector,
    include_metadata=True
)
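
# `query_embed` presumably wraps Pinecone's hosted inference endpoint; a sketch
# under that assumption (pc.inference.embed is the SDK call; the real DataLoader
# helper may differ). Note input_type "query", the query-side counterpart of the
# "passage" encoding used at indexing time.
def _sketch_query_embed(pc, model, text):
    resp = pc.inference.embed(
        model=model,
        inputs=[text],
        parameters={"input_type": "query", "truncate": "END"},
    )
    return resp[0]['values']   # dense vector for the single input

# inspect the hit that came back
for match in query_response.matches:
    print(match.id, match.score, match.metadata)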
## Now create dense embeddings for the dataframe (reusing the client created above)
model = "multilingual-e5-large"
DataLoader.chunk_and_embed(pc, model, df)
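
# DataLoader.chunk_and_embed presumably writes dense embeddings back onto df as an
# 'Embeddings' column. A rough sketch under that assumption (chunking omitted; the
# real helper may differ):
def _sketch_chunk_and_embed(pc, model, df):
    resp = pc.inference.embed(
        model=model,
        inputs=df['text'].tolist(),
        parameters={"input_type": "passage", "truncate": "END"},
    )
    df['Embeddings'] = [e['values'] for e in resp]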
#df['Embeddings'] = [np.random.random(4) for x in range(len(df))]
# rename embeddings to values
df.rename(columns={'Embeddings': 'values'}, inplace=True)
#df['id'] = [sqids.encode([i, i+1, i+2]) for i in range(len(df))]
## build a metadata column from every column except id, values, indices, and sparse_values
df['metadata'] = df.drop(columns=['id', 'values', 'indices', 'sparse_values']).to_dict(orient='records')
# only keep ids, values, metadata, indices, and sparse_values
df = df[['id', 'values', 'metadata', 'indices', 'sparse_values']]
vecs = create_sparse_dense_dict(df)
pc.create_index(
    name="oc-hybrid-indexv2",
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
index = pc.Index("oc-hybrid-indexv2")
index.upsert(vecs, namespace="example-namespace")
## QUERY (generic example; the dense vector must match the index dimension, 1024 here)
query_response = index.query(
    top_k=10,
    namespace="example-namespace",
    vector=np.random.random(1024).tolist(),  # placeholder dense vector
    sparse_vector={
        'indices': [10, 45, 16],
        'values': [0.5, 0.5, 0.2]
    }
)
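
# To tune the dense/sparse balance at query time, a common trick (as in Pinecone's
# hybrid search examples) is a convex combination: scale the dense vector by alpha
# and the sparse values by 1 - alpha. This works because dotproduct scoring is
# linear in each component. A sketch:
def hybrid_scale(dense, sparse, alpha):
    """alpha=1.0 is pure dense search, alpha=0.0 pure sparse."""
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must be between 0 and 1")
    hsparse = {
        'indices': sparse['indices'],
        'values': [v * (1 - alpha) for v in sparse['values']],
    }
    hdense = [v * alpha for v in dense]
    return hdense, hsparse
# e.g. hdense, hsparse = hybrid_scale(queryembed, query_sparse_vector, alpha=0.8)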
################
query = "test border patrol"
query_sparse_vector = encode_query(bm25, query)
query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=np.random.random(1024).tolist(),  # random dense placeholder: BM25 drives this match
    sparse_vector=query_sparse_vector
)
################
query = "ireland"
query_sparse_vector = encode_query(bm25, query)
query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=np.random.random(1024).tolist(),
    sparse_vector=query_sparse_vector
)
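
# `encode_query` presumably just applies the query-side BM25 encoding. A minimal
# sketch (the real DataLoader helper may differ):
def _sketch_encode_query(bm25, text):
    return bm25.encode_queries(text)   # {'indices': [...], 'values': [...]}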
################## Batch loop to create sparse and dense vectors and upsert them
from tqdm.auto import tqdm
# Remove columns you don't want to encode
df = pd.read_csv('data/Indicator_Test.csv')
metadata = df
batch_size = 200
# convert all columns to string
metadata = metadata.astype(str)
cols_to_remove = ['channelID', 'MessageID', 'AccountID', 'label', 'contexts', 'topics', 'weak topics', 'indicators']
for i in tqdm(range(0, len(df), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(df))
    # extract metadata batch
    meta_batch = metadata.iloc[i:i_end]
    meta_dict = meta_batch.to_dict(orient="records")
    # concatenate all metadata fields except the excluded columns into one string per row
    meta_batch = [" ".join(x) for x in meta_batch.loc[:, ~meta_batch.columns.isin(cols_to_remove)].values.tolist()]
    # create sparse BM25 vectors
    sparse_embeds = bm25.encode_documents([text for text in meta_batch])
    # create dense vectors from the same text (assumes `model` is an encoder object
    # exposing .encode(), e.g. a SentenceTransformer, not the model-name string above)
    dense_embeds = model.encode(meta_batch).tolist()
    # create unique IDs
    ids = [str(x) for x in range(i, i_end)]
    upserts = []
    # loop through the data and create dictionaries for uploading documents to the pinecone index
    for _id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, meta_dict):
        upserts.append({
            'id': _id,
            'sparse_values': sparse,
            'values': dense,
            'metadata': meta
        })
    # upload the documents to the new hybrid index
    index.upsert(upserts)
# Create an upsert function for hybrid vectors
def upsert_hybrid_vectors(index, df, model, bm25, batch_size=200, cols_to_remove=['id', 'year']):
    metadata = df.astype(str)
    for i in tqdm(range(0, len(df), batch_size)):
        i_end = min(i + batch_size, len(df))
        meta_batch = metadata.iloc[i:i_end]
        meta_dict = meta_batch.to_dict(orient="records")
        meta_batch = [" ".join(x) for x in meta_batch.loc[:, ~meta_batch.columns.isin(cols_to_remove)].values.tolist()]
        sparse_embeds = bm25.encode_documents([text for text in meta_batch])
        dense_embeds = model.encode(meta_batch).tolist()
        ids = [str(x) for x in range(i, i_end)]
        upserts = []
        for _id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, meta_dict):
            upserts.append({
                'id': _id,
                'sparse_values': sparse,
                'values': dense,
                'metadata': meta
            })
        index.upsert(upserts)
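
# Hypothetical usage, assuming `model` is a text encoder exposing .encode() (e.g. a
# SentenceTransformer) rather than the model-name string used earlier:
# upsert_hybrid_vectors(index, df, model, bm25, batch_size=200,
#                       cols_to_remove=['channelID', 'MessageID', 'AccountID', 'label'])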
# show index description after uploading the documents
index.describe_index_stats()