## Upload Telegram 300K to hybrid-sparse
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
import sys
sys.path.append('src/python')
import DataLoader
# Read the API key from the environment rather than hardcoding it
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
# Recreate the index from scratch; guard the delete so a missing index doesn't raise
if "oc-hybrid-library-index" in pc.list_indexes().names():
    pc.delete_index("oc-hybrid-library-index")
pc.create_index(
    name="oc-hybrid-library-index",
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
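# Note: hybrid (dense + sparse) queries require an index created with the
# dotproduct metric, and 1024 is the output dimension of multilingual-e5-large.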
## Upsert Indicator Data
df = pd.read_csv('data/google_fact_checks2024-11-14.csv')
# Drop rows with duplicate text values
df = df.drop_duplicates(subset=['text'])
## For a quick smoke test, keep only the top three rows
#df = df.head(3)
# Build sparse (BM25) and dense embeddings for each row's text
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)
metadata = df[['text', 'category', 'claimReviewTitle', 'claimReviewUrl']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata
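# DataLoader.create_sparse_embeds lives in src/python/DataLoader.py and is not
# shown here. A minimal sketch of what it is assumed to do: fit BM25 on the
# text column and call Pinecone's hosted inference API. The signature matches
# the call above, but the column names and sparse storage format are
# assumptions, not the actual implementation.
def _create_sparse_embeds_sketch(pc, df):
    texts = df['text'].tolist()
    bm25 = BM25Encoder()
    bm25.fit(texts)                        # learn corpus term statistics
    sparse = bm25.encode_documents(texts)  # one {"indices", "values"} dict per row
    dense = pc.inference.embed(            # hosted embedding model (SDK v5+); real code would batch inputs
        model="multilingual-e5-large",
        inputs=texts,
        parameters={"input_type": "passage"}
    )
    newdf = pd.DataFrame({
        'id': df.index.astype(str),
        'values': [e['values'] for e in dense],
        'sparse_values': sparse,
    })
    return bm25, newdf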
## Take a look at rows where sparse_values is an empty array
sparse_lengths = [len(x) for x in newdf['sparse_values']]
## Drop newdf rows where the sparse length is 0
#newdf = newdf[pd.Series(sparse_lengths) != 0]
# Create a dictionary of sparse and dense vectors for each category value in the dataframe
#for category in df['category'].unique():
#    category_df = newdf[df['category'] == category]
#    vecs = DataLoader.create_sparse_dense_dict(category_df)
#    index = pc.Index("oc-hybrid-library-index")
#    for i in range(0, len(vecs), 400):
#        end_index = min(i + 400, len(vecs))
#        index.upsert(vecs[i:end_index], namespace=category)
#    print(f"Upserted {category} vectors")
vecs = DataLoader.create_sparse_dense_dict(newdf)
index = pc.Index("oc-hybrid-library-index")
# Upsert in batches of 400 to stay under Pinecone's request size limits
for i in range(0, len(vecs), 400):
    end_index = min(i + 400, len(vecs))
    index.upsert(vecs[i:end_index], namespace="fact-checks")
print("Upserted fact-check vectors")
################# Querying the index
df = pd.read_csv('data/google_fact_checks2024-11-14.csv')
corpus = df['text'].tolist()
# Re-fit the BM25 encoder on the corpus so query-side weights match the upserted vectors
vector, bm25 = DataLoader.encode_documents(corpus)
index = pc.Index("oc-hybrid-library-index")
querytext = "satanic"
queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)
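# DataLoader.query_embed is assumed to wrap Pinecone's hosted inference API
# with the query-side input type; a sketch under that assumption (the real
# helper is in src/python/DataLoader.py):
def _query_embed_sketch(pc, model, text):
    result = pc.inference.embed(
        model=model,
        inputs=[text],
        parameters={"input_type": "query"}  # e5 models embed queries and passages differently
    )
    return result[0]['values']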
# Use encode_queries (not encode_documents) for query-side BM25 term weighting
query_sparse_vector = bm25.encode_queries(querytext)
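# Optional: convex-combination weighting between dense and sparse scores, the
# pattern from Pinecone's hybrid-search docs (not part of DataLoader).
# alpha=1.0 is pure dense, alpha=0.0 pure sparse.
def hybrid_score_norm(dense, sparse, alpha):
    if alpha < 0 or alpha > 1:
        raise ValueError("alpha must be between 0 and 1")
    hsparse = {
        "indices": sparse["indices"],
        "values": [v * (1 - alpha) for v in sparse["values"]],
    }
    hdense = [v * alpha for v in dense]
    return hdense, hsparse

# e.g. queryembed, query_sparse_vector = hybrid_score_norm(queryembed, query_sparse_vector, alpha=0.8)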
# Query with both vectors; the namespace must match the one used at upsert
# time ("fact-checks" above, or a category name such as "immigration" if the
# per-category variant was used instead)
query_response = index.query(
    top_k=5,
    namespace="fact-checks",
    vector=queryembed,
    sparse_vector=query_sparse_vector,
    include_metadata=True
)
print(query_response)
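# Each match carries an id, the hybrid dotproduct score, and the metadata
# attached at upsert time (e.g. claimReviewTitle for the fact-check rows)
for match in query_response.matches:
    print(match.id, round(match.score, 3), match.metadata)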
## Upload Expansive Claims Library (LLM-expanded)
df = pd.read_csv('data/expansive_claims_library_expanded.csv')
df['text'] = df['ExpandedClaim']
## For a quick smoke test, keep only the top three rows
#df = df.head(3)
# Build sparse (BM25) and dense embeddings for each row's text
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)
metadata = df[['Narrative', 'Model', 'Policy']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata
## Take a look at rows where sparse_values is an empty array
sparse_lengths = [len(x) for x in newdf['sparse_values']]
## Drop newdf rows where the sparse length is 0
newdf = newdf[pd.Series(sparse_lengths) != 0]
vecs = DataLoader.create_sparse_dense_dict(newdf)
index = pc.Index("oc-hybrid-library-index")
# Upsert in batches of 400, this time into the "narratives" namespace
for i in range(0, len(vecs), 400):
    end_index = min(i + 400, len(vecs))
    index.upsert(vecs[i:end_index], namespace="narratives")
print("Upserted narrative vectors")