|
|
|
|
|
import os
import sys
import time

import numpy as np
import pandas as pd
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone_text.sparse import BM25Encoder

# DataLoader is imported from src/python, so extend the import path first.
sys.path.append('src/python')

import DataLoader
|
# --- Build the hybrid vectors and upload them to Pinecone -------------------

# The API key is read from the environment instead of being committed to
# source control. A missing variable raises KeyError right here, before any
# data is loaded.
# NOTE(review): the key that used to be hard-coded on this line should be
# treated as leaked and rotated.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"], pool_threads=50)

# Number of vectors sent per upsert request.
UPSERT_BATCH_SIZE = 400

# Load the fact-check table, keeping one row per distinct, non-null 'text'.
df = pd.read_csv('data/fact_check_latest.csv')
df = df.drop_duplicates(subset=['text'])
df = df.dropna(subset=['text'])

# Missing review URLs become empty strings so every metadata record has a
# string value under 'claimReviewUrl'.
df['claimReviewUrl'] = df['claimReviewUrl'].fillna('')

# DataLoader.create_sparse_embeds returns a pair; the second element is the
# frame that the rest of this script works on.
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)

# One metadata dict per row of df, attached to newdf.
# NOTE(review): this assumes newdf has the same number of rows as df, in the
# same order - confirm against DataLoader.create_sparse_embeds.
metadata = df[['text', 'claimReviewUrl']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata

# Drop rows whose 'sparse_values' entry is empty.
sparse_lengths = [len(x) for x in newdf['sparse_values']]
newdf = newdf[np.array(sparse_lengths) != 0].reset_index(drop=True)

vecs = DataLoader.create_sparse_dense_dict(newdf)

index = pc.Index("oc-hybrid-library-index")

# Upload in fixed-size batches. Slicing past the end of the sequence is
# clamped automatically, so no explicit min() on the upper bound is needed.
for i in range(0, len(vecs), UPSERT_BATCH_SIZE):
    index.upsert(vecs[i:i + UPSERT_BATCH_SIZE], namespace="expanded-fact-checks")

# NOTE(review): printed once after all batches; confirm this was not meant to
# be a per-batch progress message.
print("Upserted vectors")
|
|
|
|
|
|
|
# Load the claims to be checked. This rebinds `df`, replacing whatever frame
# the name referred to before this point.
df = pd.read_csv('data/truthseeker_subsample.csv')

# Plain Python list of the 'claim' column.
claims = df['claim']
corpus = claims.to_list()
|
|
|
""" |
|
Query the index for a single claim and return the score, title and link of the best match.

Example: get_score_title_link(corpus[0], pc, index)
|
""" |
|
def get_score_title_link(querytext, pc, index):
    """Return the best match for *querytext* as a (score, title, link) Series.

    The query text is embedded with ``DataLoader.query_embed`` (called with
    ``"multilingual-e5-large"``), then the single closest vector is fetched
    from the ``"expanded-fact-checks"`` namespace together with its metadata.

    Args:
        querytext: Text to look up.
        pc: Pinecone client, passed through to ``DataLoader.query_embed``.
        index: Pinecone index handle exposing ``query``.

    Returns:
        A ``pd.Series`` indexed by ``'score'``, ``'title'`` and ``'link'``.
        ``title`` and ``link`` come from the match's ``'text'`` and
        ``'claimReviewUrl'`` metadata fields. If the query returns no
        matches, the score is NaN and title/link are None, so that one
        unmatched row does not abort a whole ``DataFrame.apply``.
    """
    fields = ['score', 'title', 'link']

    queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext)
    empty_sparse = DataLoader.empty_sparse_vector()

    res = index.query(
        top_k=1,
        namespace="expanded-fact-checks",
        vector=queryembed,
        sparse_vector=empty_sparse,
        include_metadata=True,
    )

    matches = res['matches']
    if not matches:
        # Nothing came back (e.g. the namespace is empty): return a
        # placeholder row instead of raising IndexError on matches[0].
        return pd.Series([float('nan'), None, None], index=fields)

    best = matches[0]
    best_metadata = best['metadata']
    return pd.Series(
        [best['score'], best_metadata['text'], best_metadata['claimReviewUrl']],
        index=fields,
    )
|
|
|
|
|
# --- Look up every claim in the index and time the run ----------------------

# Fresh client and index handle for querying. The API key is read from the
# environment rather than being committed to source control; a missing
# variable raises KeyError before any query is sent.
# NOTE(review): the key that used to be hard-coded on this line should be
# treated as leaked and rotated.
pc = PineconeGRPC(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index(
    name="oc-hybrid-library-index",
    pool_threads=50,
)

# perf_counter() is a monotonic clock intended for measuring elapsed time, so
# the reading is not affected by system clock adjustments.
start_time = time.perf_counter()

# One query per row of df['claim']; the returned Series is expanded into the
# three new columns.
df[['score', 'title', 'link']] = df['claim'].apply(get_score_title_link, args=(pc, index))

elapsed_time = time.perf_counter() - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")
|
|
|
|
|
|
|
|