## SCRIPT TO UPDATE THE FACT CHECK DATA
#######################################
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
import sys
sys.path.append('src/python')
import DataLoader
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"], pool_threads=50)  # <-- make sure PINECONE_API_KEY is set
##############################
df = pd.read_csv('data/fact_check_latest.csv')
# Drop non-unique text values
df = df.drop_duplicates(subset=['text'])
# skip rows where text is NaN
df = df.dropna(subset=['text'])
## Fill NaN in the 'claimReviewUrl' column with an empty string
## (Pinecone metadata values cannot be NaN)
df['claimReviewUrl'] = df['claimReviewUrl'].fillna('')
# Build sparse (BM25) and dense embeddings for each claim text
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)
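# For reference: a minimal sketch of what DataLoader.create_sparse_embeds is
# assumed to do (the module lives in src/python and is not shown here) -- fit
# a BM25 encoder on the claim texts, encode them as sparse vectors, and embed
# the same texts densely with Pinecone's hosted multilingual-e5-large model.
# Names and the column layout below are assumptions, not the actual module code.
#
# def create_sparse_embeds(pc, df):
#     texts = df['text'].tolist()
#     bm25 = BM25Encoder()
#     bm25.fit(texts)
#     sparse = bm25.encode_documents(texts)  # [{"indices": [...], "values": [...]}, ...]
#     dense = pc.inference.embed(
#         model="multilingual-e5-large",
#         inputs=texts,
#         parameters={"input_type": "passage"},
#     )
#     return bm25, pd.DataFrame({
#         'id': df.index.astype(str),
#         'values': [d['values'] for d in dense],
#         'sparse_values': sparse,
#     })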
#metadata = df[['text', 'category', 'claimReviewTitle', 'claimReviewUrl']].to_dict(orient='records')
metadata = df[['text', 'claimReviewUrl']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata
## Take a look at rows where 'sparse_values' is empty
sparse_lengths = [len(x) for x in newdf['sparse_values']]
## Drop newdf rows where the sparse encoding is empty (length 0)
newdf = newdf[np.array(sparse_lengths) != 0].reset_index(drop=True)
vecs = DataLoader.create_sparse_dense_dict(newdf)
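# For reference: DataLoader.create_sparse_dense_dict is assumed to flatten the
# DataFrame into the record format Pinecone expects when upserting to a hybrid
# (dense + sparse) index. A sketch under that assumption:
#
# def create_sparse_dense_dict(newdf):
#     return [
#         {
#             "id": str(row['id']),
#             "values": row['values'],                # dense embedding
#             "sparse_values": row['sparse_values'],  # {"indices": [...], "values": [...]}
#             "metadata": row['metadata'],
#         }
#         for _, row in newdf.iterrows()
#     ]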
index = pc.Index("oc-hybrid-library-index")
# Upsert in batches of 400 vectors
for i in range(0, len(vecs), 400):
    end_index = min(i + 400, len(vecs))
    index.upsert(vecs[i:end_index], namespace="expanded-fact-checks")
    print(f"Upserted vectors {i}-{end_index}")
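# Note: pool_threads=50 only helps if requests are in flight concurrently; the
# loop above sends batches one at a time. A sketch of a parallel variant using
# the gRPC client's async_req flag, which returns futures to wait on:
#
# futures = [
#     index.upsert(vecs[i:min(i + 400, len(vecs))],
#                  namespace="expanded-fact-checks", async_req=True)
#     for i in range(0, len(vecs), 400)
# ]
# for f in futures:
#     f.result()  # block until each batch is acknowledged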
#####################################
### Querying performance for TruthSeeker Subset
df = pd.read_csv('data/truthseeker_subsample.csv')
corpus = df['claim'].tolist()
"""
## Function query, return score, title, link
Example: get_score_title_link(corpus[0], pc, index)
"""
def get_score_title_link(querytext, pc, index):
queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)
empty_sparse = DataLoader.empty_sparse_vector()
res = index.query(
top_k=1,
namespace="expanded-fact-checks",
vector=queryembed,
sparse_vector=empty_sparse,
include_metadata=True
)
score = res['matches'][0]['score']
title = res['matches'][0]['metadata']['text']
link = res['matches'][0]['metadata']['claimReviewUrl']
return pd.Series([score, title, link], index=['score', 'title', 'link'])
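# For reference: sketches of the two DataLoader helpers used above, as they
# are assumed to behave. query_embed presumably embeds the query text with the
# hosted model using input_type "query"; empty_sparse_vector presumably
# returns a placeholder sparse vector so the hybrid index can be queried
# dense-only. Both bodies are assumptions; the real module may differ.
#
# def query_embed(pc, model, text):
#     res = pc.inference.embed(model=model, inputs=[text],
#                              parameters={"input_type": "query"})
#     return res[0]['values']
#
# def empty_sparse_vector():
#     # a single zero-weight entry stands in for "no sparse signal"
#     return {"indices": [0], "values": [0.0]}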
## Get score, title, link for each querytext in corpus
import time
# Re-create the client and index handle, this time with a thread pool on the
# index for concurrent queries (Pinecone is the PineconeGRPC alias imported above)
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])  # <-- make sure PINECONE_API_KEY is set
index = pc.Index(
    name="oc-hybrid-library-index",
    pool_threads=50,  # <-- make sure to set this
)
### TIMING
start_time = time.time()
df[['score', 'title', 'link']] = df['claim'].apply(get_score_title_link, args=(pc, index))  # score each claim against the index
elapsed_time = time.time() - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")
######## END TIMING
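# df.apply issues the queries sequentially, so the pool_threads=50 handle sits
# mostly idle here. A sketch of a threaded variant (everything below is local
# to this sketch, not part of the original script):
#
# from concurrent.futures import ThreadPoolExecutor
#
# start_time = time.time()
# with ThreadPoolExecutor(max_workers=50) as ex:
#     rows = list(ex.map(lambda c: get_score_title_link(c, pc, index), corpus))
# df[['score', 'title', 'link']] = pd.DataFrame(rows, index=df.index)
# print(f"Time taken: {time.time() - start_time:.2f} seconds")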