# misinfo_detection_app/src/deprecated/upload_library_hybrid-sparse.py
## Upload claim libraries (fact checks and LLM-expanded narratives) to the hybrid sparse-dense index
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
import sys
sys.path.append('src/python')
import DataLoader
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])  # read the API key from the environment rather than hard-coding it
if "oc-hybrid-library-index" in pc.list_indexes().names():
    pc.delete_index("oc-hybrid-library-index")  # drop any existing index before recreating it
pc.create_index(
    name="oc-hybrid-library-index",
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
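# Hybrid sparse-dense vectors require the dotproduct metric, and dimension=1024
# matches the multilingual-e5-large model used for the dense query embedding below.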
## Upsert Indicator Data
df = pd.read_csv('data/google_fact_checks2024-11-14.csv')
# Drop non-unique text values
df = df.drop_duplicates(subset=['text'])
## get top three rows
#df = df.head(3)
# get text and MessageID
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)
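# create_sparse_embeds is assumed to fit a BM25Encoder on df['text'] and return it
# along with a dataframe of ids, dense embeddings ('values'), and BM25 'sparse_values'.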
metadata = df[['text', 'category', 'claimReviewTitle', 'claimReviewUrl']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata
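# Pinecone metadata values must be strings, numbers, booleans, or lists of strings;
# NaN/None entries in these columns would make the upsert fail.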
## Take a look at rows where sparse_values is an empty array
sparse_lengths = [len(x) for x in newdf['sparse_values']]
## Drop newdf rows where sparse length is 0
#newdf = newdf[pd.Series(sparse_lengths) != 0]
# Create a dictionary of sparse and dense vectors for each category value in the dataframe
#for category in df['category'].unique():
#    category_df = newdf[df['category'] == category]
#    vecs = DataLoader.create_sparse_dense_dict(category_df)
#    index = pc.Index("oc-hybrid-library-index")
#    for i in range(0, len(vecs), 400):
#        end_index = min(i + 400, len(vecs))
#        index.upsert(vecs[i:end_index], namespace=category)
#    print(f"Upserted {category} vectors")
vecs = DataLoader.create_sparse_dense_dict(newdf)
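# create_sparse_dense_dict is assumed to return upsert-ready records of the form
# {"id": ..., "values": [...dense...], "sparse_values": {"indices": [...], "values": [...]}, "metadata": {...}}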
index = pc.Index("oc-hybrid-library-index")
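# Batching at 400 records keeps each upsert request comfortably under Pinecone's ~2MB request limit.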
for i in range(0, len(vecs), 400):
    end_index = min(i + 400, len(vecs))
    index.upsert(vecs[i:end_index], namespace="fact-checks")
print(f"Upserted {len(vecs)} vectors to the 'fact-checks' namespace")
################# Querying the index
df = pd.read_csv('data/google_fact_checks2024-11-14.csv')
corpus = df['text'].tolist()
vector, bm25 = DataLoader.encode_documents(corpus)
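# encode_documents is assumed to embed the corpus and return a BM25Encoder fitted on the same
# corpus, so query-time sparse encodings share the term statistics of the upserted vectors.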
index = pc.Index("oc-hybrid-library-index")
querytext = "satanic"
queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)
# Encode the query with the BM25 query encoding (queries and documents are weighted differently).
query_sparse_vector = bm25.encode_queries(querytext)
query_response = index.query(
    top_k=5,
    namespace="fact-checks",
    vector=queryembed,
    sparse_vector=query_sparse_vector,
    include_metadata=True
)
print(query_response)
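# Optional: the relative weight of dense vs. sparse retrieval can be tuned with a convex
# combination of the two query vectors. A minimal sketch, assuming the bm25 encoder fitted
# above and pinecone_text's hybrid_convex_scale helper; alpha and the namespace are
# illustrative values, not settings taken from this project.
from pinecone_text.hybrid import hybrid_convex_scale

alpha = 0.8  # 1.0 = pure dense, 0.0 = pure sparse
hdense, hsparse = hybrid_convex_scale(queryembed, query_sparse_vector, alpha)
hybrid_response = index.query(
    top_k=5,
    namespace="fact-checks",
    vector=hdense,
    sparse_vector=hsparse,
    include_metadata=True
)
print(hybrid_response)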
## Upload LLM-expanded claims library
df = pd.read_csv('data/expansive_claims_library_expanded.csv')
df['text'] = df['ExpandedClaim']  # the embedding helpers read claim text from the 'text' column
## get top three rows
#df = df.head(3)
# get text and MessageID
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)
metadata = df[['Narrative', 'Model', 'Policy']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata
## Take a look at rows where sparse_values is an empty array
sparse_lengths = [len(x) for x in newdf['sparse_values']]
## Drop newdf rows where the sparse vector is empty (empty sparse_values cannot be upserted)
newdf = newdf[np.array(sparse_lengths) != 0]
# Create a dictionary of sparse and dense vectors for each category value in the dataframe
#for category in df['category'].unique():
#    category_df = newdf[df['category'] == category]
#    vecs = DataLoader.create_sparse_dense_dict(category_df)
#    index = pc.Index("oc-hybrid-library-index")
#    for i in range(0, len(vecs), 400):
#        end_index = min(i + 400, len(vecs))
#        index.upsert(vecs[i:end_index], namespace=category)
#    print(f"Upserted {category} vectors")
vecs = DataLoader.create_sparse_dense_dict(newdf)
index = pc.Index("oc-hybrid-library-index")
for i in range(0, len(vecs), 400):
    end_index = min(i + 400, len(vecs))
    index.upsert(vecs[i:end_index], namespace="narratives")
print(f"Upserted {len(vecs)} vectors to the 'narratives' namespace")
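# A quick sanity check (sketch): confirm both namespaces received vectors.
stats = index.describe_index_stats()
print(stats.namespaces)  # expect entries for 'fact-checks' and 'narratives' with vector counts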