File size: 1,906 Bytes
734777a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
## Upload Telegram 300K to hybrid-sparse
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
import sys
sys.path.append('src/python')
import DataLoader
# Connect to Pinecone. The API key is read from the environment so that no
# secret lives in source control. NOTE(review): the key that was previously
# hard-coded here should be treated as leaked and rotated.
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this script."
    )
pc = Pinecone(api_key=api_key)

# Start from a clean index. Only delete when the index actually exists, so a
# first run (or a run after a manual delete) does not fail on a missing index.
index_name = "oc-hybrid-300k-index"
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# Recreate the index as a serverless index on AWS us-east-1.
# dimension=1024 presumably matches the dense embedding model used by
# DataLoader -- TODO confirm against the embedding helper.
pc.create_index(
    name=index_name,
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)
## Upsert Indicator Test Data
df = pd.read_csv('data/random_300k.csv')
## get top three rows
#df = df.head(3)
# Build the embeddings frame via the project helper; it returns an encoder
# object and a DataFrame that carries a 'sparse_values' column.
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)

# Attach the source text as per-row metadata.
# NOTE(review): assumes newdf has one row per df row, in the same order --
# confirm against DataLoader.create_sparse_embeds.
metadata = df[['text']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata

## Take a look at rows where sparse values is an empty array
# Build the mask from newdf itself so it shares newdf's index; a fresh
# pd.Series(...) would carry a 0..n-1 index and could misalign with newdf.
has_sparse_values = newdf['sparse_values'].map(len) != 0
# Pull out the rows where sparse length is 0 (handy for debugging)
#edf = newdf[~has_sparse_values]
## Drop newdf rows where sparse length is 0
newdf = newdf[has_sparse_values]

vecs = DataLoader.create_sparse_dense_dict(newdf)
index = pc.Index("oc-hybrid-300k-index")

# Batch upsert the vectors, 400 per request. Slicing past the end of the
# sequence is safe, so the final (short) batch needs no explicit clamp.
batch_size = 400
for start in range(0, len(vecs), batch_size):
    index.upsert(vecs[start:start + batch_size], namespace="telegram-300k")
################# Querying the index
# Re-read the corpus and rebuild the encoder via the project helper so that
# this section can be run on its own.
df = pd.read_csv('data/random_300k.csv')
corpus = df['text'].tolist()
vector, bm25 = DataLoader.encode_documents(corpus)
index = pc.Index("oc-hybrid-300k-index")

querytext = "satanic"
# Dense query embedding from the project helper.
queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)
# Sparse query vector. Queries must be encoded with encode_queries, not
# encode_documents: the two apply different BM25 weighting.
# NOTE(review): assumes bm25 is a pinecone_text BM25Encoder -- confirm
# against DataLoader.encode_documents.
query_sparse_vector = bm25.encode_queries(querytext)

# Hybrid query: top 5 matches from the namespace the vectors were upserted to.
query_response = index.query(
    top_k=5,
    namespace="telegram-300k",
    vector=queryembed,
    sparse_vector=query_sparse_vector,
    include_metadata=True,
)
# A bare expression displays nothing when run as a script; print the result.
print(query_response)