## SCRIPT TO UPDATE THE FACT CHECK DATA 
#######################################
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
import sys
sys.path.append('src/python')
import DataLoader
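# DataLoader (src/python/DataLoader.py) provides the embedding helpers used
# below: create_sparse_embeds, create_sparse_dense_dict, query_embed, and
# empty_sparse_vector.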
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"], pool_threads=50) # <-- make sure pool_threads is set
##############################

df = pd.read_csv('data/fact_check_latest.csv')
# Drop rows with duplicate text values
df = df.drop_duplicates(subset=['text'])
# Drop rows where text is NaN
df = df.dropna(subset=['text'])
## Fill NaN in the 'claimReviewUrl' column with empty strings (Pinecone
## metadata fields cannot hold nulls)
df['claimReviewUrl'] = df['claimReviewUrl'].fillna('')
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)
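# create_sparse_embeds is assumed to fit a BM25 sparse encoder on the text
# column (hence the BM25Encoder import above) and compute dense embeddings,
# returning the fitted encoder plus a dataframe with a 'sparse_values' column
# that the filtering below relies on.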
#metadata = df[['text', 'category', 'claimReviewTitle', 'claimReviewUrl']].to_dict(orient='records')
metadata = df[['text', 'claimReviewUrl']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata

## Take a look at rows where 'sparse_values' is an empty array
sparse_lengths = [len(x) for x in newdf['sparse_values']]
## Drop newdf rows where sparse length is 0
newdf = newdf[np.array(sparse_lengths) != 0].reset_index(drop=True)
vecs = DataLoader.create_sparse_dense_dict(newdf)
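# create_sparse_dense_dict is assumed to emit records in Pinecone's hybrid
# upsert format, roughly:
#   {'id': ..., 'values': [<dense floats>],
#    'sparse_values': {'indices': [...], 'values': [...]}, 'metadata': {...}}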
index = pc.Index("oc-hybrid-library-index")
# Upsert in batches of 400 vectors per request
for i in range(0, len(vecs), 400):
    end_index = min(i + 400, len(vecs))
    index.upsert(vecs[i:end_index], namespace="expanded-fact-checks")
    print(f"Upserted vectors {i}-{end_index} of {len(vecs)}")
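# A hedged sketch (not run here): with pool_threads=50 set on the client, the
# batches could be sent concurrently via async_req and the returned futures:
#   async_results = [
#       index.upsert(vecs[i:i + 400], namespace="expanded-fact-checks", async_req=True)
#       for i in range(0, len(vecs), 400)
#   ]
#   [r.result() for r in async_results]  # block until every batch lands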

#####################################
### Querying performance on the TruthSeeker subsample
df = pd.read_csv('data/truthseeker_subsample.csv')
corpus = df['claim'].tolist()

"""
## Function query, return score, title, link
Example: get_score_title_link(corpus[0], pc, index)
"""
def get_score_title_link(querytext, pc, index):
    queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)
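    # empty_sparse_vector is assumed to return a minimal placeholder in
    # Pinecone's sparse format ({'indices': [...], 'values': [...]}) so the
    # hybrid index can be queried on the dense vector alone.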
    empty_sparse = DataLoader.empty_sparse_vector()
    res = index.query(
        top_k=1,
        namespace="expanded-fact-checks",
        vector=queryembed,
        sparse_vector=empty_sparse,
        include_metadata=True
    )
    score = res['matches'][0]['score']
    title = res['matches'][0]['metadata']['text']
    link = res['matches'][0]['metadata']['claimReviewUrl']
    return pd.Series([score, title, link], index=['score', 'title', 'link'])

## Get score, title, link for each querytext in corpus
import time
from pinecone.grpc import PineconeGRPC
pc = PineconeGRPC(api_key=os.environ["PINECONE_API_KEY"]) # <-- make sure the API key is set
index = pc.Index(
    name="oc-hybrid-library-index",
    pool_threads=50, # <-- make sure to set this
)

### TIMING
start_time = time.time()

df[['score', 'title', 'link']] = df['claim'].apply(get_score_title_link, args=(pc, index))  # score each claim; .apply issues the queries one at a time

elapsed_time = time.time() - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")


######## END TIMING