|
|
|
|
|
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
from sqids import Sqids
import os

import numpy as np
import pandas as pd

# Sqids generates short, unique string ids from lists of integers.
sqids = Sqids()
|
|
|
|
|
|
|
|
|
|
|
""" |
|
## Embed in the inference API |
|
df = pd.read_csv('data/Indicator_Test.csv') |
|
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0") |
|
model = "multilingual-e5-large" |
|
embeddings = bulk_embed(pc, model, df[1:96]) |
|
|
|
""" |
|
def bulk_embed(pc, model, data, textcol='text'):
    """Embed one batch of texts with the Pinecone Inference API."""
    embeddings = pc.inference.embed(
        model,
        inputs=[x for x in data[textcol]],
        parameters={"input_type": "passage"}
    )
    return embeddings
|
|
|
|
|
def join_chunked_results(embeddings):
    """Flatten a list of embedding batches into one list of vectors."""
    result = []
    for chunk in embeddings:
        for emblist in chunk.data:
            result.append(emblist["values"])
    return result
|
|
|
""" |
|
## Chunk and embed in the inference API |
|
df = pd.read_csv('data/climate_test.csv') |
|
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0") |
|
model = "multilingual-e5-large" |
|
embeddings = chunk_and_embed(pc, model, df) |
|
## Upgrade this function to return a dataframe with the Embeddings as a new column |
|
|
|
""" |
|
def chunk_and_embed(pc, model, data, chunk_size=96, textcol='text'):
    """Embed a DataFrame in batches (the Inference API caps batch size),
    then attach the vectors and generated ids as new columns."""
    embeddings = []
    for i in range(0, len(data), chunk_size):
        chunk = data[i:min(i + chunk_size, len(data))]
        embeddings.append(bulk_embed(pc, model, chunk, textcol))
    data['Embeddings'] = join_chunked_results(embeddings)
    # Encode each row index into a short unique string id.
    data['id'] = [sqids.encode([i, i + 1, i + 2]) for i in range(len(data))]
    return data
|
|
|
""" |
|
## Query the embeddings |
|
query = "What is the impact of climate change on the economy?" |
|
embeddings = query_embed(pc, model, query) |
|
""" |
|
def query_embed(pc, model, query):
    """Embed a single query string and return its dense vector."""
    embeddings = pc.inference.embed(
        model,
        inputs=[query],
        parameters={"input_type": "query"}
    )
    return embeddings[0]['values']
|
|
|
""" |
|
### Sparse vector encoding |
|
- write a function to embed |
|
from pinecone_text.sparse import BM25Encoder |
|
|
|
corpus = ["The quick brown fox jumps over the lazy dog", |
|
"The lazy dog is brown", |
|
"The fox is brown"] |
|
|
|
# Initialize BM25 and fit the corpus. |
|
bm25 = BM25Encoder() |
|
#bm25.fit(corpus) |
|
#bm25 = BM25Encoder.default() |
|
doc_sparse_vector = bm25.encode_documents("The brown fox is quick") |
|
|
|
vector, bm25 = encode_documents(corpus) |
|
""" |
|
def encode_documents(corpus):
    """Fit a BM25 encoder on the corpus and return (sparse vectors, fitted encoder)."""
    bm25 = BM25Encoder()
    bm25.fit(corpus)
    doc_sparse_vector = bm25.encode_documents(corpus)
    return doc_sparse_vector, bm25


def encode_query(bm25, query):
    """Encode a query with an already-fitted BM25 encoder."""
    query_sparse_vector = bm25.encode_queries(query)
    return query_sparse_vector
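
"""
# Example usage (illustrative, continuing the corpus example above): fit on
# the documents, then encode a query with the same fitted encoder.
sparse_docs, bm25 = encode_documents(corpus)
sparse_query = encode_query(bm25, "brown fox")
"""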
|
|
|
""" |
|
## Generate format of sparse-dense vectors |
|
# Example usage |
|
df = pd.read_csv('data/Indicator_Test.csv') |
|
df = df.head(3) |
|
newdf = create_sparse_embeds(df) |
|
newdf['metadata'] = newdf.metadata.to_list() |
|
|
|
""" |
|
def create_sparse_embeds(pc, df, textcol='text', idcol='id', model="multilingual-e5-large"):
    """Add dense (Inference API) and sparse (BM25) vectors to df, collecting
    all remaining columns into a per-row 'metadata' dict.
    Returns (fitted BM25 encoder, formatted DataFrame)."""
    endocs, bm25 = encode_documents(df[textcol].to_list())
    chunk_and_embed(pc, model, df)  # adds 'Embeddings' and 'id' columns in place

    df.rename(columns={'Embeddings': 'values'}, inplace=True)
    df['sparse_values'] = [x['values'] for x in endocs]
    df['indices'] = [x['indices'] for x in endocs]
    df['metadata'] = df.drop(columns=[idcol, 'values', 'indices', 'sparse_values']).to_dict(orient='records')
    df = df[[idcol, 'values', 'metadata', 'indices', 'sparse_values']]
    return bm25, df
|
|
|
""" |
|
## Generate format of sparse-dense vectors |
|
# Example usage |
|
data = { |
|
'id': ['vec1', 'vec2'], |
|
'values': [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]], |
|
'metadata': [{'genre': 'drama', 'text': 'this'}, {'genre': 'action'}], |
|
'sparse_indices': [[10, 45, 16], [12, 34, 56]], |
|
'sparse_values': [[0.5, 0.5, 0.2], [0.3, 0.4, 0.1]] |
|
} |
|
|
|
df = pd.DataFrame(data) |
|
sparse_dense_dicts = create_sparse_dense_dict(df) |
|
vecs = create_sparse_dense_vectors_from_df(df) |
|
index.upsert(vecs, namespace="example-namespace") |
|
|
|
|
|
# Example usage |
|
df = pd.read_csv('data/Indicator_Test.csv') |
|
df = df.head(3) |
|
newdf = create_sparse_embeds(df) |
|
metadata = df[['text', 'label']].to_dict(orient='records') |
|
newdf['metadata'] = metadata |
|
vecs = create_sparse_dense_dict(newdf) |
|
index.upsert(vecs, namespace="example-namespace") |
|
|
|
""" |
|
def create_sparse_dense_dict(df, id_col='id', values_col='values', metadata_col='metadata', sparse_indices_col='indices', sparse_values_col='sparse_values'):
    """Convert DataFrame rows into the sparse-dense vector dicts Pinecone expects."""
    result = []

    for _, row in df.iterrows():
        vector_dict = {
            'id': row[id_col],
            'values': row[values_col],
            'metadata': row[metadata_col],
            'sparse_values': {
                'indices': row[sparse_indices_col],
                'values': row[sparse_values_col]
            }
        }
        result.append(vector_dict)

    return result
|
|
|
|
|
|
|
|
|
def create_index(pc, name, dimension, metric, cloud, region):
    """Create a serverless Pinecone index; dimension must match the
    embedding model (1024 for multilingual-e5-large)."""
    pc.create_index(
        name=name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(
            cloud=cloud,
            region=region
        )
    )
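
"""
# Example usage (illustrative names; metric must be "dotproduct" for
# sparse-dense queries, and dimension must match the embedding model):
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
create_index(pc, "test-index", 1024, "dotproduct", "aws", "us-east-1")
"""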
|
|
|
|
|
|
|
|
|
|
|
""" |
|
## Create vectors from a DataFrame to be uploaded to Pinecone |
|
import pandas as pd |
|
|
|
# Create a sample DataFrame |
|
data = { |
|
'Embeddings': [ |
|
[0.1, 0.2, 0.3, 0.4], |
|
[0.2, 0.3, 0.4, 0.5] |
|
], |
|
'id': ['vec1', 'vec2'], |
|
'genre': ['drama', 'action'] |
|
} |
|
df = pd.DataFrame(data) |
|
|
|
vecs = create_vectors_from_df(df) |
|
|
|
# Upload the vectors to Pinecone |
|
index.upsert( |
|
vectors=vecs, |
|
namespace="example-namespace" |
|
) |
|
""" |
|
def create_vectors_from_df(df):
    """Build (id, values, metadata) tuples; every column other than
    'Embeddings' and 'id' becomes metadata."""
    vectors = []
    for _, row in df.iterrows():
        vectors.append((row['id'], row['Embeddings'], row.drop(['Embeddings', 'id']).to_dict()))
    return vectors


def chunk_upload_vectors(index, vectors, namespace="example-namespace", chunk_size=1000):
    """Upsert vectors in batches to stay under Pinecone's request size limits."""
    for i in range(0, len(vectors), chunk_size):
        chunk = vectors[i:min(i + chunk_size, len(vectors))]
        index.upsert(
            vectors=chunk,
            namespace=namespace
        )
|
|
|
""" |
|
## Working Example 2 |
|
|
|
df = pd.read_csv('data/Indicator_Test.csv') |
|
dfe = DataLoader.chunk_and_embed(pc, model, df) |
|
# Keep only text, embeddings, id |
|
dfmin = dfe[['text', 'Embeddings', 'id', 'label']] |
|
DataLoader.chunk_df_and_upsert(index, dfmin, namespace="indicator-test-namespace", chunk_size=96) |
|
|
|
""" |
|
def chunk_df_and_upsert(index, df, namespace="new-namespace", chunk_size=1000):
    """Convert a DataFrame to vectors and upsert them in batches."""
    vectors = create_vectors_from_df(df)
    chunk_upload_vectors(index, vectors, namespace, chunk_size)
|
|
|
|
|
""" |
|
namespace = "namespace" |
|
vector = [0.1, 0.2, 0.3, 0.4] |
|
top_k = 3 |
|
include_values = True |
|
""" |
|
def query_data(index, namespace, vector, top_k=3, include_values=True):
    """Query an index with a dense NumPy vector."""
    out = index.query(
        namespace=namespace,
        vector=vector.tolist(),  # index.query expects a plain list
        top_k=top_k,
        include_values=include_values
    )
    return out
|
|
|
""" |
|
Example: |
|
|
|
""" |
|
def query_data_with_sparse(index, namespace, vector, sparse_vector, top_k=5, include_values=True, include_metadata=True): |
|
out = index.query( |
|
namespace=namespace, |
|
vector=vector, |
|
sparse_vector=sparse_vector, |
|
top_k=top_k, |
|
include_metadata=include_metadata, |
|
include_values=include_values |
|
) |
|
return out |
|
|
|
|
|
def empty_sparse_vector():
    """Placeholder sparse vector (a single zero weight) for running a
    dense-only query against a sparse-dense index."""
    return {
        'indices': [1],
        'values': [0.0]
    }
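
"""
# Example usage (illustrative): a dense-only query against a sparse-dense
# index, passing the placeholder so sparse values contribute nothing.
out = query_data_with_sparse(index, "example-namespace", dense, empty_sparse_vector())
"""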
|
|
|
|
|
""" |
|
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0") |
|
index = pc.Index("test-index") |
|
namespace = "test-namespace" |
|
vector = np.random.rand(1024) |
|
top_k = 3 |
|
include_values = True |
|
filter={ |
|
"label": {"$lt": 2} |
|
} |
|
query_data_with_filter(index, namespace, vector, top_k, include_values, filter) |
|
""" |
|
def query_data_with_filter(index, namespace, vector, top_k=3, include_values=True, filter=None):
    """Query with an optional metadata filter (Pinecone filter syntax,
    e.g. {"label": {"$lt": 2}})."""
    out = index.query(
        namespace=namespace,
        vector=vector.tolist(),
        top_k=top_k,
        include_values=include_values,
        filter=filter
    )
    return out
|
|
|
""" |
|
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0") |
|
ids = ["UkfgLgeYW9wo", "GkkzUYYOcooB"] |
|
indexname = "ostreacultura-v1" |
|
namespace = "cards-data" |
|
index = pc.Index(indexname) |
|
DL.fetch_data(index, ids, namespace) |
|
|
|
""" |
|
def fetch_data(index, ids, namespace):
    """Fetch vectors by id from a namespace."""
    out = index.fetch(ids=ids, namespace=namespace)
    return out


def get_all_ids_from_namespace(index, namespace):
    """Return a generator over pages of vector ids in a namespace."""
    ids = index.list(namespace=namespace)
    return ids
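
"""
# Example usage (illustrative): index.list() yields pages (lists) of ids,
# so flatten the generator to collect every id in the namespace.
all_ids = [i for page in get_all_ids_from_namespace(index, "test-namespace") for i in page]
"""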
|
|
|
""" |
|
## Hybrid search weighting - Alpa is equal to the weight of the dense vector |
|
dense = [0.1, 0.2, 0.3, 0.4] |
|
sparse_vector={ |
|
'indices': [10, 45, 16], |
|
'values': [0.5, 0.5, 0.2] |
|
} |
|
dense, sparse = hybrid_score_norm(dense, sparse, alpha=1.0) |
|
""" |
|
def hybrid_score_norm(dense, sparse, alpha: float):
    """Hybrid score using a convex combination

    alpha * dense + (1 - alpha) * sparse

    Args:
        dense: list of floats representing the dense vector
        sparse: a dict of `indices` and `values`
        alpha: scale between 0 and 1
    """
    if alpha < 0 or alpha > 1:
        raise ValueError("Alpha must be between 0 and 1")
    hs = {
        'indices': sparse['indices'],
        'values': [v * (1 - alpha) for v in sparse['values']]
    }
    return [v * alpha for v in dense], hs
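
"""
# Example usage (illustrative): weight dense scores 80/20 over sparse,
# then run a hybrid query with the scaled vectors.
dense = query_embed(pc, model, "climate change")
sparse = encode_query(bm25, "climate change")
hdense, hsparse = hybrid_score_norm(dense, sparse, alpha=0.8)
out = query_data_with_sparse(index, "example-namespace", hdense, hsparse)
"""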
|
|
|
|