|
|
|
from pinecone.grpc import PineconeGRPC as Pinecone |
|
import os |
|
import pandas as pd |
|
import numpy as np |
|
from pinecone import ServerlessSpec |
|
from pinecone_text.sparse import BM25Encoder |
|
import sys |
|
sys.path.append('src/python') |
|
import DataLoader |
|
|
|
# --- Pinecone client and index (re)creation ---------------------------------
# Name of the serverless index this script rebuilds from scratch.
INDEX_NAME = "oc-hybrid-300k-index"

# Credentials must never live in source control: read the key from the
# environment instead. NOTE(review): the key that used to be hard-coded here
# has been exposed and should be revoked/rotated in the Pinecone console.
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this script."
    )

pc = Pinecone(api_key=api_key)

# Start from a clean index. Only delete when it exists so that a first run
# (or a run after a manual cleanup) does not fail on a missing index.
if INDEX_NAME in pc.list_indexes().names():
    pc.delete_index(INDEX_NAME)

# The index is created with the "dotproduct" metric and 1024 dimensions on
# AWS us-east-1 serverless. The queries further down send both a dense and a
# sparse vector to this index.
pc.create_index(
    name=INDEX_NAME,
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)
|
|
|
|
|
# --- Build the sparse/dense records to upload --------------------------------
# Load the source corpus; the 'text' column is used below as record metadata.
df = pd.read_csv('data/random_300k.csv')

# DataLoader is a project-local module. From its use here it returns an
# encoder object plus a DataFrame that has a 'sparse_values' column.
# NOTE(review): assumes newdf has one row per row of df, in the same order,
# otherwise the metadata assignment below pairs texts with the wrong rows.
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)

# One {'text': ...} dict per source row, stored alongside each vector.
metadata = df[['text']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata

# Rows whose sparse representation is empty are dropped before upload.
sparse_lengths = [len(x) for x in newdf['sparse_values']]

# Build the mask on newdf's own index. A bare pd.Series(...) would carry a
# fresh 0..n-1 index and misalign (or raise) if newdf is not range-indexed.
has_sparse_values = pd.Series(sparse_lengths, index=newdf.index) != 0
newdf = newdf[has_sparse_values]

# Convert the filtered rows into the structure passed to index.upsert().
vecs = DataLoader.create_sparse_dense_dict(newdf)
|
# --- Upload the records in batches -------------------------------------------
index = pc.Index("oc-hybrid-300k-index")

# Number of records sent per upsert request; vecs is uploaded in slices of
# this size rather than in a single call.
UPSERT_BATCH_SIZE = 400

# Slicing clamps at the end of the sequence, so the final (possibly shorter)
# batch needs no explicit min() bound.
for i in range(0, len(vecs), UPSERT_BATCH_SIZE):
    index.upsert(vecs[i:i + UPSERT_BATCH_SIZE], namespace="telegram-300k")
|
|
|
|
|
|
|
# --- Hybrid (dense + sparse) query against the uploaded namespace ------------
# Reload the corpus and obtain the encoder used for the sparse query vector.
# 'vector' is not used by the query below; only the encoder is needed here.
df = pd.read_csv('data/random_300k.csv')
corpus = df['text'].tolist()
vector, bm25 = DataLoader.encode_documents(corpus)

index = pc.Index("oc-hybrid-300k-index")

querytext = "satanic"

# Dense embedding of the query text via the project-local helper.
queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)

# Queries must be encoded with encode_queries, not encode_documents: BM25
# weights query terms differently from document terms, so using the document
# encoder here skews the sparse part of the hybrid score.
# NOTE(review): assumes bm25 is a pinecone_text BM25Encoder - confirm against
# DataLoader.encode_documents.
query_sparse_vector = bm25.encode_queries(querytext)

query_response = index.query(
    top_k=5,
    namespace="telegram-300k",
    vector=queryembed,
    sparse_vector=query_sparse_vector,
    include_metadata=True,
)

# A bare expression is a no-op when this file runs as a script; print the
# result so the matches are visible outside a notebook as well.
print(query_response)