### Example code
import os
import sys

import numpy as np
import pandas as pd
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone_text.sparse import BM25Encoder

sys.path.append('src/python')
import DataLoader
####### VERY MINIMAL NONSENSE DATA
data = {
    'id': ['vec1', 'vec2'],
    'values': [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]],
    'metadata': [{'text': 'drama'}, {'text': 'action'}],
    'sparse_indices': [[10, 45, 16], [12, 34, 56]],
    'sparse_values': [[0.5, 0.5, 0.2], [0.3, 0.4, 0.1]]
}
df = pd.DataFrame(data)
# read the API key from the environment rather than hardcoding it
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
pc.create_index(
    name="oc-hybrid-index",
    dimension=3,
    metric="dotproduct",  # hybrid (dense + sparse) queries require dotproduct
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
index = pc.Index("oc-hybrid-index")  # note: the accessor is pc.Index, not pc.index
vecs = create_sparse_dense_dict(df)  # helper from DataLoader (see sketch below)
index.upsert(vecs, namespace="example-namespace")
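# `create_sparse_dense_dict` is imported from DataLoader and its definition is not
# shown in this file. As a minimal sketch (the name and column assumptions below are
# illustrative, not DataLoader's actual code), it likely reshapes DataFrame rows into
# the dict format Pinecone's upsert expects:
def _create_sparse_dense_dict_sketch(df):
    vecs = []
    for _, row in df.iterrows():
        vecs.append({
            'id': row['id'],
            'values': row['values'],
            'sparse_values': {
                'indices': row['sparse_indices'] if 'sparse_indices' in row else row['indices'],
                'values': row['sparse_values']
            },
            'metadata': row['metadata']
        })
    return vecs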
######################## Indicator Test Data
# reuse the client created above; recreate the index at the real embedding dimension
pc.delete_index("oc-hybrid-index")
pc.create_index(
    name="oc-hybrid-index",
    dimension=1024,  # multilingual-e5-large produces 1024-dimensional vectors
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
index = pc.Index("oc-hybrid-index")
## Upsert Indicator Test Data
df = pd.read_csv('data/Indicator_Test.csv')
df = df.head(3)  # keep only the top three rows for a quick test
bm25, newdf = create_sparse_embeds(df)  # helper from DataLoader (see sketch below)
# use the text and label columns as metadata
metadata = df[['text', 'label']].to_dict(orient='records')
newdf['metadata'] = metadata
vecs = create_sparse_dense_dict(newdf)
index.upsert(vecs, namespace="example-namespace")
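# `create_sparse_embeds` also lives in DataLoader. A plausible sketch, assuming it
# fits a BM25 encoder on the 'text' column and returns the fitted encoder plus a
# DataFrame of ids and sparse vectors (the real helper presumably also attaches the
# dense 'values' column; that step is omitted here):
def _create_sparse_embeds_sketch(df):
    bm25 = BM25Encoder()
    bm25.fit(df['text'].tolist())
    sparse = bm25.encode_documents(df['text'].tolist())
    newdf = pd.DataFrame({
        'id': df.index.astype(str),
        'indices': [s['indices'] for s in sparse],
        'sparse_values': [s['values'] for s in sparse]
    })
    return bm25, newdf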
## Query the hybrid index
querytext = "immigrants are invading the border"
queryembed = query_embed(pc, "multilingual-e5-large", querytext)  # helper from DataLoader (see sketch below)
query_sparse_vector = bm25.encode_queries(querytext)  # use encode_queries, not encode_documents, at query time
query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=queryembed,
    sparse_vector=query_sparse_vector,
    include_metadata=True
)
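# `query_embed` is assumed to wrap Pinecone's hosted inference endpoint. A sketch
# using the documented pc.inference.embed call (the wrapper shape is an assumption):
def _query_embed_sketch(pc, model, text):
    result = pc.inference.embed(
        model=model,
        inputs=[text],
        parameters={"input_type": "query", "truncate": "END"}
    )
    return result[0]['values']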
## Now create embeddings
# (no need to re-import or re-instantiate the client; `pc` above is reused)
model = "multilingual-e5-large"
DataLoader.chunk_and_embed(pc, model, df)  # adds an 'Embeddings' column (see sketch below)
#df['Embeddings'] = [np.random.random(4) for x in range(len(df))]
# rename Embeddings to values, the field name Pinecone expects for dense vectors
df.rename(columns={'Embeddings': 'values'}, inplace=True)
#df['id'] = [sqids.encode([i, i+1, i+2]) for i in range(len(df))]
# create a metadata column from every column except id, values, indices, and sparse_values
df['metadata'] = df.drop(columns=['id', 'values', 'indices', 'sparse_values']).to_dict(orient='records')
# keep only the columns needed for upserting
df = df[['id', 'values', 'metadata', 'indices', 'sparse_values']]
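# `DataLoader.chunk_and_embed` is likewise not shown. A rough sketch of the assumed
# behavior: embed each row's text with the hosted model and store the vectors in an
# 'Embeddings' column (chunking of long texts is omitted in this sketch):
def _chunk_and_embed_sketch(pc, model, df):
    embeddings = pc.inference.embed(
        model=model,
        inputs=df['text'].tolist(),
        parameters={"input_type": "passage", "truncate": "END"}
    )
    df['Embeddings'] = [e['values'] for e in embeddings]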
vecs = create_sparse_dense_dict(df)
pc.create_index(
    name="oc-hybrid-indexv2",
    dimension=1024,
    metric="dotproduct",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
index = pc.Index("oc-hybrid-indexv2")
index.upsert(vecs, namespace="example-namespace")
## QUERY
# note: the dense query vector must match the index dimension (1024 here)
query_response = index.query(
    top_k=10,
    vector=np.random.random(1024).tolist(),  # placeholder dense vector
    sparse_vector={
        'indices': [10, 45, 16],
        'values': [0.5, 0.5, 0.2]
    }
)
################
query = "test border patrol"
query_sparse_vector = encode_query(bm25, query)  # helper from DataLoader (see sketch below)
query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=np.random.random(1024).tolist(),
    sparse_vector=query_sparse_vector
)
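# `encode_query` is presumably a thin wrapper over BM25Encoder.encode_queries
# (the helper name is assumed; the method itself is pinecone_text's documented
# query-side call):
def _encode_query_sketch(bm25, query):
    return bm25.encode_queries(query)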
################
query = "ireland"
query_sparse_vector = encode_query(bm25, query)
query_response = index.query(
    top_k=1,
    namespace="example-namespace",
    vector=np.random.random(1024).tolist(),
    sparse_vector=query_sparse_vector
)
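# Optional: weight the dense and sparse contributions before querying. This follows
# Pinecone's documented convex-combination pattern for hybrid search; `alpha` is an
# illustrative parameter (1.0 = pure dense, 0.0 = pure sparse):
def hybrid_score_norm(dense, sparse, alpha=0.75):
    hs = {
        'indices': sparse['indices'],
        'values': [v * (1 - alpha) for v in sparse['values']]
    }
    return [v * alpha for v in dense], hs

# example: down-weight the sparse signal before issuing the query
# hdense, hsparse = hybrid_score_norm(queryembed, query_sparse_vector, alpha=0.8)
# index.query(top_k=1, vector=hdense, sparse_vector=hsparse, namespace="example-namespace")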
################## Function to create sparse and dense vectors
from tqdm.auto import tqdm

# Remove columns you don't want to encode
df = pd.read_csv('data/Indicator_Test.csv')
metadata = df.astype(str)  # convert all columns to string so fields can be joined
batch_size = 200
cols_to_remove = ['channelID', 'MessageID', 'AccountID', 'label', 'contexts', 'topics', 'weak topics', 'indicators']
# assumes `bm25` from create_sparse_embeds above is still in scope
for i in tqdm(range(0, len(df), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(df))
    # extract metadata batch
    meta_batch = metadata.iloc[i:i_end]
    meta_dict = meta_batch.to_dict(orient="records")
    # concatenate all metadata fields except the excluded columns into a single string per row
    meta_batch = [" ".join(x) for x in meta_batch.loc[:, ~meta_batch.columns.isin(cols_to_remove)].values.tolist()]
    # create sparse BM25 vectors
    sparse_embeds = bm25.encode_documents(meta_batch)
    # create dense vectors; assumes `model` is an encoder object exposing .encode()
    # (e.g. a SentenceTransformer), not the model-name string used earlier
    dense_embeds = model.encode(meta_batch).tolist()
    # create unique IDs
    ids = [str(x) for x in range(i, i_end)]
    upserts = []
    # build upsert dictionaries for the hybrid index
    for _id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, meta_dict):
        upserts.append({
            'id': _id,
            'sparse_values': sparse,
            'values': dense,
            'metadata': meta
        })
    # upload the batch to the hybrid index
    index.upsert(upserts)
# Create an upsert function for hybrid vectors
def upsert_hybrid_vectors(index, df, model, bm25, batch_size=200, cols_to_remove=['id', 'year']):
    metadata = df.astype(str)  # stringify columns so they can be joined and stored as metadata
    for i in tqdm(range(0, len(df), batch_size)):
        i_end = min(i + batch_size, len(df))
        meta_batch = metadata.iloc[i:i_end]
        meta_dict = meta_batch.to_dict(orient="records")
        # join every non-excluded column into one text string per row
        text_batch = [" ".join(x) for x in meta_batch.loc[:, ~meta_batch.columns.isin(cols_to_remove)].values.tolist()]
        sparse_embeds = bm25.encode_documents(text_batch)
        dense_embeds = model.encode(text_batch).tolist()
        ids = [str(x) for x in range(i, i_end)]
        upserts = []
        for _id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, meta_dict):
            upserts.append({
                'id': _id,
                'sparse_values': sparse,
                'values': dense,
                'metadata': meta
            })
        index.upsert(upserts)

# show index description after uploading the documents
index.describe_index_stats()
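# Example usage of the helper above (assumes `model` exposes .encode(), e.g. a
# SentenceTransformer, and `bm25` has already been fit on the corpus):
# upsert_hybrid_vectors(index, df, model, bm25, batch_size=100)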