### Example: build a Pinecone hybrid (sparse + dense) index and query it.
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
import sys

sys.path.append('src/python')
import DataLoader

# BUG FIX: the client was used before it was constructed (NameError), and the
# original hard-coded an API key in source. Read the key from the environment
# so the secret never lands in version control.
pc = Pinecone(os.environ["PINECONE_API_KEY"])

####### VERY MINIMAL NONSENSE DATA
data = {
    'id': ['vec1', 'vec2'],
    'values': [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]],
    'metadata': [{'text': 'drama'}, {'text': 'action'}],
    'sparse_indices': [[10, 45, 16], [12, 34, 56]],
    'sparse_values': [[0.5, 0.5, 0.2], [0.3, 0.4, 0.1]],
}
# BUG FIX: `df` was referenced before it existed; build it from the toy data.
df = pd.DataFrame(data)

pc.create_index(
    name="oc-hybrid-index",
    dimension=3,             # matches the 3-element toy vectors above
    metric="dotproduct",     # dotproduct is required for hybrid (sparse+dense) search
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)
# BUG FIX: the accessor is `Index` (capital I); `pc.index` raised AttributeError.
index = pc.Index("oc-hybrid-index")

vecs = create_sparse_dense_dict(df)
index.upsert(vecs, namespace="example-namespace")

######################## Indicator Test Data
# Rebuild the index at the real embedding dimension (1024 for e5-large).
pc.delete_index("oc-hybrid-index")
pc.create_index(
    name="oc-hybrid-index",
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)
index = pc.Index("oc-hybrid-index")

## Upsert Indicator Test Data: top three rows only.
# (The original loaded and truncated the CSV twice; once is enough.)
df = pd.read_csv('data/Indicator_Test.csv')
df = df.head(3)

bm25, newdf = create_sparse_embeds(df)
newdf['metadata'] = df[['text', 'label']].to_dict(orient='records')
vecs = create_sparse_dense_dict(newdf)
index.upsert(vecs, namespace="example-namespace")

## Query the hybrid index.
querytext = "immigrants are invading the border"
queryembed = query_embed(pc, "multilingual-e5-large", querytext)
# BUG FIX: queries are encoded with `encode_queries`, not `encode_documents` --
# BM25 weights query terms differently from document terms.
query_sparse_vector = bm25.encode_queries(querytext)
query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=queryembed,
    sparse_vector=query_sparse_vector,
    include_metadata=True,
)

## Now create embeddings with the REST client (shadows the gRPC alias on purpose).
from pinecone import Pinecone
pc = Pinecone(os.environ["PINECONE_API_KEY"])
model = "multilingual-e5-large"
## Build a v2 index from model embeddings, query it, then batch-upsert.

# Embed the dataframe text (adds an 'Embeddings' column in place).
DataLoader.chunk_and_embed(pc, model, df)

# Pinecone expects the dense vector under 'values'.
df.rename(columns={'Embeddings': 'values'}, inplace=True)

# Fold every column that is not part of the vector record into 'metadata',
# then keep only the fields the upsert format understands.
df['metadata'] = df.drop(columns=['id', 'values', 'indices', 'sparse_values']).to_dict(orient='records')
df = df[['id', 'values', 'metadata', 'indices', 'sparse_values']]
vecs = create_sparse_dense_dict(df)

pc.create_index(
    name="oc-hybrid-indexv2",
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)
index = pc.Index("oc-hybrid-indexv2")
index.upsert(vecs, namespace="example-namespace")

## QUERY
# NOTE(review): this toy 3-d vector will be rejected by a 1024-d index --
# kept from the original; confirm whether this smoke test is still wanted.
query_response = index.query(
    top_k=10,
    vector=[0.1, 0.2, 0.3],
    sparse_vector={'indices': [10, 45, 16], 'values': [0.5, 0.5, 0.2]},
)

################ Sparse-weighted queries with random dense vectors.
query = "test border patrol"
query_sparse_vector = encode_query(bm25, query)
query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=np.random.random(1024).tolist(),
    sparse_vector=query_sparse_vector,
)

################
query = "ireland"
query_sparse_vector = encode_query(bm25, query)
query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=np.random.random(1024).tolist(),
    sparse_vector=query_sparse_vector,
)

################## Create sparse and dense vectors in batches.
from tqdm.auto import tqdm

df = pd.read_csv('data/Indicator_Test.csv')
batch_size = 200
# Pinecone metadata values are kept as strings here.
metadata = df.astype(str)
# BUG FIX: cols_to_remove was commented out but is referenced inside the
# loop below, which raised NameError on the first iteration.
cols_to_remove = ['channelID', 'MessageID', 'AccountID', 'label',
                  'contexts', 'topics', 'weak topics', 'indicators']

for i in tqdm(range(0, len(df), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(df))
    # extract metadata batch, both as records and as one text string per row
    meta_batch = metadata.iloc[i:i_end]
    meta_dict = meta_batch.to_dict(orient="records")
    meta_batch = [" ".join(x) for x in
                  meta_batch.loc[:, ~meta_batch.columns.isin(cols_to_remove)].values.tolist()]
    # create sparse BM25 vectors
    sparse_embeds = bm25.encode_documents(meta_batch)
    # BUG FIX: the original encoded `images[i:i_end]`, but no `images`
    # variable exists anywhere in this script (loop looks pasted from an
    # image-search example); encode the concatenated row text instead --
    # TODO(review) confirm the intended dense-embedding source. Note that
    # `model` is still the string "multilingual-e5-large" at this point,
    # so an encoder object must be bound to `model` before this loop runs.
    dense_embeds = model.encode(meta_batch).tolist()
    # create unique IDs (row offsets within the dataframe)
    ids = [str(x) for x in range(i, i_end)]
    upserts = [
        {'id': _id, 'sparse_values': sparse, 'values': dense, 'metadata': meta}
        for _id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, meta_dict)
    ]
    # upload the documents to the new hybrid index
    index.upsert(upserts)


def upsert_hybrid_vectors(index, df, model, bm25, batch_size=200, cols_to_remove=None):
    """Upsert hybrid (BM25 sparse + dense) vectors for *df* into *index* in batches.

    Parameters
    ----------
    index : Pinecone index handle to upsert into.
    df : pandas.DataFrame of records; the non-excluded columns of each row are
        concatenated into the text that gets embedded.
    model : dense encoder exposing ``encode(texts)`` returning an array-like.
    bm25 : fitted BM25Encoder used for the sparse vectors.
    batch_size : number of rows per upsert call.
    cols_to_remove : columns excluded from the embedded text (defaults to
        ``['id', 'year']``); all remaining columns still land in the metadata
        payload.
    """
    # BUG FIX: avoid the mutable-default-argument pitfall; default is built here.
    if cols_to_remove is None:
        cols_to_remove = ['id', 'year']
    # BUG FIX: `df.remove_columns("image")` is the HuggingFace `datasets` API,
    # not pandas (AttributeError); drop the column only if it exists.
    metadata = df.drop(columns=['image'], errors='ignore')
    for start in tqdm(range(0, len(df), batch_size)):
        end = min(start + batch_size, len(df))
        meta_batch = metadata.iloc[start:end]
        meta_dict = meta_batch.to_dict(orient="records")
        # One concatenated text per row from the non-excluded columns.
        texts = [" ".join(row) for row in
                 meta_batch.loc[:, ~meta_batch.columns.isin(cols_to_remove)].values.tolist()]
        sparse_embeds = bm25.encode_documents(texts)
        # BUG FIX: the original set `text_batch = df` and so re-encoded the
        # entire dataframe on every batch; encode only this batch's text.
        dense_embeds = model.encode(texts).tolist()
        ids = [str(x) for x in range(start, end)]
        upserts = [
            {'id': _id, 'sparse_values': sparse, 'values': dense, 'metadata': meta}
            for _id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, meta_dict)
        ]
        index.upsert(upserts)


# show index description after uploading the documents
index.describe_index_stats()