## Upload Telegram 300K to hybrid-sparse

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
import os
import sys
import numpy as np
import pandas as pd

sys.path.append('src/python')
import DataLoader

pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0")

# Recreate the index from scratch; only delete it if it already exists
if "oc-hybrid-300k-index" in pc.list_indexes().names():
    pc.delete_index("oc-hybrid-300k-index")

# Hybrid (sparse-dense) indexes require the dotproduct metric; dimension 1024
# matches the multilingual-e5-large embeddings
pc.create_index(
    name="oc-hybrid-300k-index",
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

## Upsert Indicator Test Data
df = pd.read_csv('data/random_300k.csv')
## Get top three rows (useful for a quick smoke test)
#df = df.head(3)

# Get text and MessageID, then build the dense and BM25 sparse embeddings
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)

# Attach the original text as metadata (assumes newdf rows stay aligned with df)
metadata = df[['text']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata

## Take a look at rows where sparse_values is an empty array
sparse_lengths = [len(x) for x in newdf['sparse_values']]
# Pull out the rows where the sparse length is 0
#edf = newdf[pd.Series(sparse_lengths) == 0]
## Drop newdf rows where the sparse length is 0 (empty sparse vectors are rejected on upsert)
newdf = newdf[pd.Series(sparse_lengths) != 0]

vecs = DataLoader.create_sparse_dense_dict(newdf)
index = pc.Index("oc-hybrid-300k-index")

# Batch upsert the vectors, 400 at a time
for i in range(0, len(vecs), 400):
    end_index = min(i + 400, len(vecs))
    index.upsert(vecs[i:end_index], namespace="telegram-300k")

## Querying the index
df = pd.read_csv('data/random_300k.csv')
corpus = df['text'].tolist()
# Rebuild the BM25 encoder from the same corpus so query-side weights match the upserted documents
vector, bm25 = DataLoader.encode_documents(corpus)
index = pc.Index("oc-hybrid-300k-index")

querytext = "satanic"
queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)
# Use encode_queries (not encode_documents) for query-side BM25 weights
query_sparse_vector = bm25.encode_queries(querytext)
query_response = index.query(
    top_k=5,
    namespace="telegram-300k",
    vector=queryembed,
    sparse_vector=query_sparse_vector,
    include_metadata=True
)
print(query_response)
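
## For reference: the per-record structure that index.upsert() expects for a hybrid
## (sparse-dense) index, following Pinecone's documented upsert format. This is a
## sketch only: create_sparse_dense_dict() in DataLoader is assumed to emit dicts of
## this shape, and the id, values, and weights below are illustrative placeholders.
example_vec = {
    "id": "msg-0",                                  # unique vector id (e.g. the MessageID)
    "values": [0.011, -0.024, 0.087],               # dense embedding; length must equal the index dimension (1024)
    "sparse_values": {                              # BM25 sparse representation
        "indices": [10, 45, 16],                    # token hash positions
        "values": [0.46, 0.11, 0.92],               # corresponding BM25 weights
    },
    "metadata": {"text": "original message text"},  # payload returned with include_metadata=True
}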
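
## Optional: weight the dense and sparse halves of the query before sending it.
## This convex-combination helper follows the pattern shown in Pinecone's hybrid
## search examples; alpha=1.0 is pure dense (semantic) search, alpha=0.0 is pure
## sparse (keyword) search. The function name and alpha value are illustrative and
## not part of DataLoader; it assumes queryembed is a flat list of floats and
## query_sparse_vector is a dict with "indices" and "values" keys.
def hybrid_scale(dense, sparse, alpha):
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must be between 0 and 1")
    scaled_sparse = {
        "indices": sparse["indices"],
        "values": [v * (1 - alpha) for v in sparse["values"]],
    }
    scaled_dense = [v * alpha for v in dense]
    return scaled_dense, scaled_sparse

# Example: lean 80% on the dense multilingual-e5-large signal, 20% on BM25
#weighted_dense, weighted_sparse = hybrid_scale(queryembed, query_sparse_vector, alpha=0.8)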