stefanjwojcik's picture
Upload 5 files
734777a verified
## Upload Telegram 300K to hybrid-sparse
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
import sys
sys.path.append('src/python')
import DataLoader
# Name of the serverless index that this script rebuilds from scratch.
INDEX_NAME = "oc-hybrid-300k-index"

# Read the API key from the environment instead of committing a secret to
# source control.
# NOTE(review): the key that was previously hard-coded on this line must be
# treated as leaked and rotated in the Pinecone console.
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running this script."
    )
pc = Pinecone(api_key=api_key)

# Start from a clean index. Only delete when the index already exists so the
# very first run (or a run after a manual delete) does not fail.
if INDEX_NAME in pc.list_indexes().names():
    pc.delete_index(INDEX_NAME)

# Hybrid (sparse + dense) querying is done against a dotproduct index.
# dimension=1024 presumably matches the dense embedding model used by
# DataLoader -- TODO confirm.
pc.create_index(
    name=INDEX_NAME,
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)
## Upsert Indicator Test Data
# Load the source rows; the 'text' column is read below for metadata.
df = pd.read_csv('data/random_300k.csv')
# For a quick smoke test, restrict the input first, e.g. df = df.head(3)

# Build the embeddings via the project helper. It returns the BM25 encoder and
# a frame that has a 'sparse_values' column (read below).
# NOTE(review): assumes newdf has one row per row of df, in the same order --
# confirm against DataLoader.create_sparse_embeds.
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)

# Attach the original text as per-row metadata: one {'text': ...} dict per row.
metadata = df[['text']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata

## Drop rows whose sparse representation is empty
# (presumably because an empty sparse vector cannot be upserted -- confirm).
sparse_lengths = [len(x) for x in newdf['sparse_values']]
# Use a positional NumPy mask. A pd.Series mask would carry its own 0..n-1
# index and be aligned by label, which breaks when newdf's index is not a
# default RangeIndex.
has_sparse_terms = np.asarray(sparse_lengths) != 0
newdf = newdf[has_sparse_terms]

vecs = DataLoader.create_sparse_dense_dict(newdf)
index = pc.Index("oc-hybrid-300k-index")

# Batch upsert the vectors. Slicing past the end of the sequence is clamped
# automatically, so no explicit min() on the upper bound is needed.
UPSERT_BATCH_SIZE = 400
for start in range(0, len(vecs), UPSERT_BATCH_SIZE):
    batch = vecs[start:start + UPSERT_BATCH_SIZE]
    index.upsert(batch, namespace="telegram-300k")
################# Querying the index
# Reload the corpus so that this section can be run on its own.
df = pd.read_csv('data/random_300k.csv')
corpus = df['text'].tolist()

# The project helper returns the encoded corpus and the BM25 encoder.
# NOTE(review): assumes bm25 is a fitted pinecone_text BM25Encoder -- confirm
# against DataLoader.encode_documents.
vector, bm25 = DataLoader.encode_documents(corpus)
index = pc.Index("oc-hybrid-300k-index")

querytext = "satanic"
# Dense side of the hybrid query.
queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)
# Sparse side of the hybrid query. Queries must be encoded with
# encode_queries, not encode_documents: BM25 weights query terms differently
# from document terms, and the two encodings are meant to be paired.
query_sparse_vector = bm25.encode_queries(querytext)

query_response = index.query(
    top_k=5,
    namespace="telegram-300k",
    vector=queryembed,
    sparse_vector=query_sparse_vector,
    include_metadata=True,
)
# A bare expression only displays in a notebook/REPL; print so that running
# the file as a script also shows the matches.
print(query_response)