## SCRIPT TO UPDATE THE FACT CHECK DATA
#######################################
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
import sys
sys.path.append('src/python')
import DataLoader
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"], pool_threads=50)  # <-- make sure pool_threads is set
##############################
df = pd.read_csv('data/fact_check_latest.csv')
# Drop non-unique text values
df = df.drop_duplicates(subset=['text'])
# Skip rows where text is NaN
df = df.dropna(subset=['text'])
## For the 'claimReviewUrl' column, fill NaN with an empty string
df['claimReviewUrl'] = df['claimReviewUrl'].fillna('')
## Build a BM25 sparse embedding for each fact-check text
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)
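# NOTE: DataLoader.create_sparse_embeds is project-specific. A minimal
# sketch of what it plausibly does, using the BM25Encoder imported above
# (signature simplified -- the real helper also receives the Pinecone
# client -- and the fit corpus and return shape are assumptions):
def _sketch_create_sparse_embeds(df):
    bm25 = BM25Encoder()
    bm25.fit(df['text'].tolist())  # learn corpus-wide term statistics
    # encode_documents yields one {'indices': [...], 'values': [...]} dict per text
    return bm25, df.assign(sparse_values=bm25.encode_documents(df['text'].tolist()))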
# metadata = df[['text', 'category', 'claimReviewTitle', 'claimReviewUrl']].to_dict(orient='records')
metadata = df[['text', 'claimReviewUrl']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata
## Take a look at rows where sparse_values is an empty array
sparse_lengths = [len(x) for x in newdf['sparse_values']]
## Drop newdf rows where the sparse vector is empty
newdf = newdf[np.array(sparse_lengths) != 0].reset_index(drop=True)
vecs = DataLoader.create_sparse_dense_dict(newdf)
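# DataLoader.create_sparse_dense_dict is also project-specific; the upsert
# below suggests it emits records in Pinecone's hybrid upsert format,
# roughly like this (field names per Pinecone docs, contents illustrative):
_example_record = {
    "id": "fact-check-0",                                          # assumed id scheme
    "values": [0.01, -0.02],                                       # dense embedding
    "sparse_values": {"indices": [10, 42], "values": [0.5, 1.2]},  # BM25 weights
    "metadata": {"text": "...", "claimReviewUrl": "..."},
}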
index = pc.Index("oc-hybrid-library-index")
for i in range(0, len(vecs), 400):
    end_index = min(i + 400, len(vecs))
    index.upsert(vecs[i:end_index], namespace="expanded-fact-checks")
    print(f"Upserted vectors {i}:{end_index}")
#####################################
### Querying performance for the TruthSeeker subset
df = pd.read_csv('data/truthseeker_subsample.csv')
corpus = df['claim'].tolist()
""" | |
## Function query, return score, title, link | |
Example: get_score_title_link(corpus[0], pc, index) | |
""" | |
def get_score_title_link(querytext, pc, index): | |
queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext) | |
empty_sparse = DataLoader.empty_sparse_vector() | |
res = index.query( | |
top_k=1, | |
namespace="expanded-fact-checks", | |
vector=queryembed, | |
sparse_vector=empty_sparse, | |
include_metadata=True | |
) | |
score = res['matches'][0]['score'] | |
title = res['matches'][0]['metadata']['text'] | |
link = res['matches'][0]['metadata']['claimReviewUrl'] | |
return pd.Series([score, title, link], index=['score', 'title', 'link']) | |
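# DataLoader.empty_sparse_vector is project-specific; a guess at its shape
# (an assumption, not the actual implementation). A single zero-weight
# entry keeps the hybrid query valid while contributing nothing, so the
# query above is effectively dense-only:
def _sketch_empty_sparse_vector():
    return {"indices": [0], "values": [0.0]}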
## Get score, title, link for each querytext in corpus
import time
from pinecone.grpc import PineconeGRPC
pc = PineconeGRPC(api_key=os.environ["PINECONE_API_KEY"])  # <-- make sure the key is set
index = pc.Index(
    name="oc-hybrid-library-index",
    pool_threads=50,  # <-- make sure to set this
)
### TIMING
start_time = time.time()
df[['score', 'title', 'link']] = df['claim'].apply(get_score_title_link, args=(pc, index))  # score each claim
elapsed_time = time.time() - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")
######## END TIMING
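# df.apply issues the queries serially; the index's pool_threads only pays
# off with concurrent callers. A hedged sketch of a threaded variant
# (worker count chosen to match pool_threads above; not benchmarked here):
def score_claims_threaded(claims, pc, index, workers=50):
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=workers) as ex:
        rows = list(ex.map(lambda c: get_score_title_link(c, pc, index), claims))
    return pd.DataFrame(rows)  # columns: score, title, link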