Spaces:
Sleeping
Sleeping
## Upload Telegram 300K to hybrid-sparse
from pinecone.grpc import PineconeGRPC as Pinecone | |
import os | |
import pandas as pd | |
import numpy as np | |
from pinecone import ServerlessSpec | |
from pinecone_text.sparse import BM25Encoder | |
import sys | |
sys.path.append('src/python') | |
import DataLoader | |
## Connect to Pinecone and (re)create the hybrid index from scratch.
# The API key is read from the PINECONE_API_KEY environment variable so the
# secret is not committed together with the script (raises KeyError if unset).
# NOTE(review): a key used to be hard-coded on this line; treat it as leaked
# and rotate it.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

INDEX_NAME = "oc-hybrid-300k-index"

# Only delete the index when it already exists, so that a first run (or a run
# after a manual cleanup) does not fail on a missing index.
if INDEX_NAME in pc.list_indexes().names():
    pc.delete_index(INDEX_NAME)

pc.create_index(
    name=INDEX_NAME,
    # presumably the output size of the dense embedding model
    # ("multilingual-e5-large" is used for the query embedding) - TODO confirm
    dimension=1024,
    # Pinecone's hybrid-search docs call for dotproduct on sparse-dense indexes.
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)
## Upsert Indicator Test Data
# Load the corpus, build sparse + dense embeddings for it, drop rows without
# any sparse values and upsert the remainder in fixed-size batches.
df = pd.read_csv('data/random_300k.csv')
## get top three rows
#df = df.head(3)
# get text and MessageID
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)

# One {'text': ...} record per input row, stored as the vector metadata.
# NOTE(review): assumes newdf has exactly one row per row of df, in the same
# order - confirm against DataLoader.create_sparse_embeds.
metadata = df[['text']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata

## Take a look at rows where sparse values is an empty array
# NOTE(review): if each entry is a dict such as {'indices': ..., 'values': ...}
# then len() counts its keys and is never 0 - verify the element type.
sparse_lengths = [len(x) for x in newdf['sparse_values']]
# Pull out the values where sparse length is 0
#edf = newdf[pd.Series(sparse_lengths) == 0]
## Drop newdf rows where sparse length is 0
# A NumPy boolean mask is applied positionally. The previous pd.Series mask
# carried its own 0..n-1 index and was aligned by label, which breaks (or
# filters the wrong rows) whenever newdf does not have a default index.
newdf = newdf[np.asarray(sparse_lengths) != 0]

vecs = DataLoader.create_sparse_dense_dict(newdf)
index = pc.Index("oc-hybrid-300k-index")

# Batch upsert the vectors
BATCH_SIZE = 400
for start in range(0, len(vecs), BATCH_SIZE):
    # Slicing already clamps at the end of the sequence, so the final
    # (possibly shorter) batch needs no explicit bound.
    index.upsert(vecs[start:start + BATCH_SIZE], namespace="telegram-300k")
################# Querying the index
# Rebuild the BM25 encoder from the corpus, then run one hybrid
# (dense + sparse) query against the uploaded namespace.
df = pd.read_csv('data/random_300k.csv')
corpus = df['text'].tolist()
# `vector` is not used in the visible part of this script; only the returned
# encoder is needed for the query below.
vector, bm25 = DataLoader.encode_documents(corpus)
index = pc.Index("oc-hybrid-300k-index")

querytext = "satanic"
queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)
# Queries must be encoded with encode_queries(): encode_documents() produces
# document-side BM25 weights, so using it for the query skews hybrid scores.
# NOTE(review): assumes bm25 is a pinecone_text BM25Encoder - confirm against
# DataLoader.encode_documents.
query_sparse_vector = bm25.encode_queries(querytext)

query_response = index.query(
    top_k=5,
    namespace="telegram-300k",
    vector=queryembed,
    sparse_vector=query_sparse_vector,
    include_metadata=True,
)
# A bare expression shows nothing when this file is run as a script, so print
# the result explicitly.
print(query_response)