## Upload Telegram 300K to hybrid-sparse
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import sys
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder

sys.path.append('src/python')
import DataLoader

# Read the API key from the environment rather than hardcoding it
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Rebuild the index from scratch; guard the delete so a fresh run doesn't fail
if "oc-hybrid-library-index" in pc.list_indexes().names():
    pc.delete_index("oc-hybrid-library-index")
pc.create_index(
    name="oc-hybrid-library-index",
    dimension=1024,
    metric="dotproduct",  # dotproduct is required for hybrid sparse-dense queries
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

## Upsert Indicator Data
df = pd.read_csv('data/google_fact_checks2024-11-14.csv')
# Drop non-unique text values
df = df.drop_duplicates(subset=['text'])
## For debugging, keep only the top three rows
#df = df.head(3)
# Build sparse (BM25) and dense embeddings for each row's text
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)
metadata = df[['text', 'category', 'claimReviewTitle', 'claimReviewUrl']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata

## Take a look at rows where sparse_values is an empty array
sparse_lengths = [len(x) for x in newdf['sparse_values']]
## Drop newdf rows where sparse length is 0 (Pinecone rejects empty sparse vectors)
newdf = newdf[np.asarray(sparse_lengths) != 0]

# Alternative: upsert into one namespace per category value in the dataframe
#for category in df['category'].unique():
#    category_df = newdf[df['category'] == category]
#    vecs = DataLoader.create_sparse_dense_dict(category_df)
#    index = pc.Index("oc-hybrid-library-index")
#    for i in range(0, len(vecs), 400):
#        end_index = min(i + 400, len(vecs))
#        index.upsert(vecs[i:end_index], namespace=category)
#    print(f"Upserted {category} vectors")

# Upsert everything into a single namespace, in batches of 400
vecs = DataLoader.create_sparse_dense_dict(newdf)
index = pc.Index("oc-hybrid-library-index")
for i in range(0, len(vecs), 400):
    end_index = min(i + 400, len(vecs))
    index.upsert(vecs[i:end_index], namespace="fact-checks")
print(f"Upserted {len(vecs)} fact-check vectors")

## Querying the index
df = pd.read_csv('data/google_fact_checks2024-11-14.csv')
corpus = df['text'].tolist()
# Refit BM25 on the corpus so the query-side sparse encoding matches the documents
vector, bm25 = DataLoader.encode_documents(corpus)
index = pc.Index("oc-hybrid-library-index")
querytext = "satanic"
queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)
query_sparse_vector = bm25.encode_queries(querytext)
query_response = index.query(
    top_k=5,
    namespace="fact-checks",  # must match the namespace used at upsert time
    vector=queryembed,
    sparse_vector=query_sparse_vector,
    include_metadata=True
)
print(query_response)
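## Optional: weight the dense and sparse signals at query time. With a
## dotproduct index, the usual approach is a convex combination of the two
## vectors. A minimal sketch: `hybrid_scale` and `alpha` are illustrative
## names, not part of DataLoader, and assume `queryembed` and
## `query_sparse_vector` from above.
def hybrid_scale(dense, sparse, alpha):
    """alpha=1.0 is pure dense search; alpha=0.0 is pure sparse (BM25)."""
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must be between 0 and 1")
    scaled_sparse = {
        'indices': sparse['indices'],
        'values': [v * (1 - alpha) for v in sparse['values']],
    }
    scaled_dense = [v * alpha for v in dense]
    return scaled_dense, scaled_sparse

hdense, hsparse = hybrid_scale(queryembed, query_sparse_vector, alpha=0.75)
weighted_response = index.query(
    top_k=5,
    namespace="fact-checks",
    vector=hdense,
    sparse_vector=hsparse,
    include_metadata=True
)
print(weighted_response)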
## UPLOAD Expansive LLM's
df = pd.read_csv('data/expansive_claims_library_expanded.csv')
# Embed the expanded claim text
df['text'] = df['ExpandedClaim']
## For debugging, keep only the top three rows
#df = df.head(3)
# Build sparse (BM25) and dense embeddings for each row's text
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)
metadata = df[['Narrative', 'Model', 'Policy']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata

## Take a look at rows where sparse_values is an empty array
sparse_lengths = [len(x) for x in newdf['sparse_values']]
## Drop newdf rows where sparse length is 0 (Pinecone rejects empty sparse vectors)
newdf = newdf[np.asarray(sparse_lengths) != 0]

# Upsert everything into a single namespace, in batches of 400
vecs = DataLoader.create_sparse_dense_dict(newdf)
index = pc.Index("oc-hybrid-library-index")
for i in range(0, len(vecs), 400):
    end_index = min(i + 400, len(vecs))
    index.upsert(vecs[i:end_index], namespace="narratives")
print(f"Upserted {len(vecs)} narrative vectors")
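## Optional sanity check: query the freshly upserted "narratives" namespace.
## A sketch reusing the encoders fitted above; the query string is a
## placeholder, not from the source data.
narrative_query = "example narrative claim"
narrative_dense = DataLoader.query_embed(pc, "multilingual-e5-large", narrative_query)
narrative_sparse = bm25.encode_queries(narrative_query)
narrative_response = index.query(
    top_k=5,
    namespace="narratives",
    vector=narrative_dense,
    sparse_vector=narrative_sparse,
    include_metadata=True
)
print(narrative_response)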