## SCRIPT TO UPDATE THE FACT CHECK DATA #######################################
"""Rebuild the expanded-fact-checks namespace in the Pinecone hybrid index,
then time top-1 retrieval quality over a TruthSeeker subsample.

Requires the ``PINECONE_API_KEY`` environment variable to be set; the key
must never be hard-coded in source control.
"""
import os
import sys
import time

import numpy as np
import pandas as pd
from pinecone import ServerlessSpec            # kept: may be used elsewhere
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone_text.sparse import BM25Encoder   # kept: may be used elsewhere

sys.path.append('src/python')
import DataLoader

# SECURITY FIX: the API key was previously hard-coded (twice) in this file.
# Read it once from the environment; fail fast with a KeyError if unset.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]

UPSERT_BATCH_SIZE = 400  # upsert in chunks to stay under Pinecone payload limits

pc = Pinecone(api_key=PINECONE_API_KEY, pool_threads=50)

# ---------------------------------------------------------------------------
# Build and upsert the fact-check corpus
# ---------------------------------------------------------------------------
df = pd.read_csv('data/fact_check_latest.csv')

# Drop non-unique text values, then rows where text is NaN.
df = df.drop_duplicates(subset=['text'])
df = df.dropna(subset=['text'])

# Fill NaN review URLs with an empty string so the metadata payload is
# always JSON-serializable.
df['claimReviewUrl'] = df['claimReviewUrl'].fillna('')

# Sparse (BM25) embeddings; newdf carries one row per retained text.
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)

metadata = df[['text', 'claimReviewUrl']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata

# Drop rows whose sparse vector came back empty (text that produced no
# BM25 terms cannot be upserted as a hybrid vector).
sparse_lengths = np.array([len(x) for x in newdf['sparse_values']])
newdf = newdf[sparse_lengths != 0].reset_index(drop=True)

vecs = DataLoader.create_sparse_dense_dict(newdf)

index = pc.Index("oc-hybrid-library-index")

# Upsert in fixed-size batches; the slice end is clamped automatically.
for i in range(0, len(vecs), UPSERT_BATCH_SIZE):
    index.upsert(vecs[i:i + UPSERT_BATCH_SIZE], namespace="expanded-fact-checks")
print("Upserted vectors")

#####################################
### Querying performance for TruthSeeker Subset
df = pd.read_csv('data/truthseeker_subsample.csv')
corpus = df['claim'].tolist()


def get_score_title_link(querytext, pc, index):
    """Query the hybrid index for *querytext* and return the best match.

    Dense-only query (the sparse vector is empty) against the
    ``expanded-fact-checks`` namespace, taking the single top hit.

    Example:
        get_score_title_link(corpus[0], pc, index)

    Parameters
    ----------
    querytext : str
        The claim text to embed and search for.
    pc : Pinecone client used by ``DataLoader.query_embed``.
    index : Pinecone index handle to query.

    Returns
    -------
    pd.Series
        Indexed by ['score', 'title', 'link'] so it can be assigned to
        three DataFrame columns at once via ``df['claim'].apply(...)``.
        If the index returns no matches, score is NaN and title/link
        are empty strings instead of raising IndexError.
    """
    queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)
    empty_sparse = DataLoader.empty_sparse_vector()
    res = index.query(
        top_k=1,
        namespace="expanded-fact-checks",
        vector=queryembed,
        sparse_vector=empty_sparse,
        include_metadata=True,
    )
    matches = res['matches']
    if not matches:
        # ROBUSTNESS FIX: the original indexed matches[0] unconditionally
        # and crashed on an empty result set.
        return pd.Series([np.nan, '', ''], index=['score', 'title', 'link'])
    best = matches[0]
    return pd.Series(
        [best['score'], best['metadata']['text'], best['metadata']['claimReviewUrl']],
        index=['score', 'title', 'link'],
    )


## Get score, title, link for each querytext in corpus.
# Fresh client/index with a thread pool for the timed batch run.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(
    name="oc-hybrid-library-index",
    pool_threads=50,
)

### TIMING
start_time = time.time()
# Send the claim column to be scored; each apply call yields a 3-element
# Series that fans out into the score/title/link columns.
df[['score', 'title', 'link']] = df['claim'].apply(
    get_score_title_link, args=(pc, index)
)
elapsed_time = time.time() - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")
######## END TIMING