# NOTE(review): the lines "Spaces: / Sleeping / Sleeping" were a hosting-page
# header captured during extraction, not Python — kept here as a comment.
# pip install pinecone[grpc]
#from pinecone import Pinecone
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder
## ID generation
from sqids import Sqids
sqids = Sqids()
#######
#import protobuf_module_pb2
# NOTE(review): a Pinecone API key appears verbatim in the commented examples
# throughout this file — rotate it and load credentials from the environment.
#pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0")
##### EMBEDDINGS AND ENCODINGS
"""
## Embed in the inference API
df = pd.read_csv('data/Indicator_Test.csv')
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0")
model = "multilingual-e5-large"
embeddings = bulk_embed(pc, model, df[1:96])
"""
def bulk_embed(pc, model, data, textcol='text'):
    """Embed one batch of passages via the Pinecone inference API.

    Args:
        pc: Pinecone client instance.
        model: name of the embedding model to use.
        data: mapping/DataFrame whose ``textcol`` column holds the texts.
        textcol: column holding the passage text (default ``'text'``).

    Returns:
        The raw embeddings response from ``pc.inference.embed``.
    """
    passages = list(data[textcol])
    return pc.inference.embed(
        model,
        inputs=passages,
        parameters={"input_type": "passage"},
    )
def join_chunked_results(embeddings):
    """Flatten per-chunk embedding responses into one list of dense vectors.

    Args:
        embeddings: iterable of response objects, each exposing a ``.data``
            sequence of dicts with a ``"values"`` entry.

    Returns:
        A flat list containing every ``"values"`` vector, in order.
    """
    return [
        item["values"]
        for chunk in embeddings
        for item in chunk.data
    ]
""" | |
## Chunk and embed in the inference API | |
df = pd.read_csv('data/climate_test.csv') | |
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0") | |
model = "multilingual-e5-large" | |
embeddings = chunk_and_embed(pc, model, df) | |
## Upgrade this function to return a dataframe with the Embeddings as a new column | |
""" | |
def chunk_and_embed(pc, model, data, chunk_size=96, textcol='text'): | |
embeddings = [] | |
for i in range(0, len(data), chunk_size): | |
chunk = data[i:min(i + chunk_size, len(data))] | |
embeddings.append(bulk_embed(pc, model, chunk, textcol)) | |
chunked_embeddings = join_chunked_results(embeddings) | |
data['Embeddings'] = chunked_embeddings | |
data['id'] = [sqids.encode([i, i+1, i+2]) for i in range(len(data))] | |
return data | |
""" | |
## Query the embeddings | |
query = "What is the impact of climate change on the economy?" | |
embeddings = query_embed(pc, model, query) | |
""" | |
def query_embed(pc, model, query): | |
embeddings = pc.inference.embed( | |
model, | |
inputs=query, | |
parameters={ | |
"input_type": "query" | |
} | |
) | |
return embeddings[0]['values'] | |
""" | |
### Sparse vector encoding | |
- write a function to embed | |
from pinecone_text.sparse import BM25Encoder | |
corpus = ["The quick brown fox jumps over the lazy dog", | |
"The lazy dog is brown", | |
"The fox is brown"] | |
# Initialize BM25 and fit the corpus. | |
bm25 = BM25Encoder() | |
#bm25.fit(corpus) | |
#bm25 = BM25Encoder.default() | |
doc_sparse_vector = bm25.encode_documents("The brown fox is quick") | |
vector, bm25 = encode_documents(corpus) | |
""" | |
def encode_documents(corpus): | |
bm25 = BM25Encoder() | |
bm25.fit(corpus) | |
doc_sparse_vector = bm25.encode_documents(corpus) | |
return doc_sparse_vector, bm25 | |
def encode_query(bm25, query):
    """Sparse-encode a query string with an already-fitted BM25 encoder.

    Args:
        bm25: a fitted ``BM25Encoder`` (e.g. from ``encode_documents``).
        query: query text.

    Returns:
        The sparse query vector produced by ``bm25.encode_queries``.
    """
    return bm25.encode_queries(query)
""" | |
## Generate format of sparse-dense vectors | |
# Example usage | |
df = pd.read_csv('data/Indicator_Test.csv') | |
df = df.head(3) | |
newdf = create_sparse_embeds(df) | |
newdf['metadata'] = newdf.metadata.to_list() | |
""" | |
def create_sparse_embeds(pc, df, textcol='text', idcol='id', model="multilingual-e5-large"): | |
endocs, bm25 = encode_documents(df[textcol].to_list()) | |
chunk_and_embed(pc, model, df) # this is an in-place operation | |
# rename Embeddings to values | |
df.rename(columns={'Embeddings': 'values'}, inplace=True) | |
df['sparse_values'] = [x['values'] for x in endocs] | |
df['indices'] = [x['indices'] for x in endocs] | |
df['metadata'] = df.drop(columns=[idcol, 'values', 'indices', 'sparse_values']).to_dict(orient='records') | |
df = df[[idcol, 'values', 'metadata', 'indices', 'sparse_values']] | |
return bm25, df | |
""" | |
## Generate format of sparse-dense vectors | |
# Example usage | |
data = { | |
'id': ['vec1', 'vec2'], | |
'values': [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]], | |
'metadata': [{'genre': 'drama', 'text': 'this'}, {'genre': 'action'}], | |
'sparse_indices': [[10, 45, 16], [12, 34, 56]], | |
'sparse_values': [[0.5, 0.5, 0.2], [0.3, 0.4, 0.1]] | |
} | |
df = pd.DataFrame(data) | |
sparse_dense_dicts = create_sparse_dense_dict(df) | |
vecs = create_sparse_dense_vectors_from_df(df) | |
index.upsert(vecs, namespace="example-namespace") | |
# Example usage | |
df = pd.read_csv('data/Indicator_Test.csv') | |
df = df.head(3) | |
newdf = create_sparse_embeds(df) | |
metadata = df[['text', 'label']].to_dict(orient='records') | |
newdf['metadata'] = metadata | |
vecs = create_sparse_dense_dict(newdf) | |
index.upsert(vecs, namespace="example-namespace") | |
""" | |
def create_sparse_dense_dict(df, id_col='id', values_col='values', metadata_col='metadata', sparse_indices_col='indices', sparse_values_col='sparse_values'): | |
result = [] | |
for _, row in df.iterrows(): | |
vector_dict = { | |
'id': row[id_col], | |
'values': row[values_col], | |
'metadata': row[metadata_col], | |
'sparse_values': { | |
'indices': row[sparse_indices_col], | |
'values': row[sparse_values_col] | |
} | |
} | |
result.append(vector_dict) | |
return result | |
############ UPSERTING DATA
def create_index(pc, name, dimension, metric, cloud, region):
    """Create a serverless Pinecone index.

    Args:
        pc: Pinecone client instance.
        name: index name.
        dimension: dense vector dimension.
        metric: similarity metric (e.g. ``'cosine'``, ``'dotproduct'``).
        cloud: serverless cloud provider (e.g. ``'aws'``).
        region: serverless region (e.g. ``'us-east-1'``).
    """
    spec = ServerlessSpec(cloud=cloud, region=region)
    pc.create_index(name=name, dimension=dimension, metric=metric, spec=spec)
#pc.delete_index("example-index")
#index = pc.Index("test-index")
""" | |
## Create vectors from a DataFrame to be uploaded to Pinecone | |
import pandas as pd | |
# Create a sample DataFrame | |
data = { | |
'Embeddings': [ | |
[0.1, 0.2, 0.3, 0.4], | |
[0.2, 0.3, 0.4, 0.5] | |
], | |
'id': ['vec1', 'vec2'], | |
'genre': ['drama', 'action'] | |
} | |
df = pd.DataFrame(data) | |
vecs = create_vectors_from_df(df) | |
# Upload the vectors to Pinecone | |
index.upsert( | |
vectors=vecs, | |
namespace="example-namespace" | |
) | |
""" | |
def create_vectors_from_df(df): | |
vectors = [] | |
for _, row in df.iterrows(): | |
vectors.append((row['id'], row['Embeddings'], row.drop(['Embeddings', 'id']).to_dict())) | |
return vectors | |
def chunk_upload_vectors(index, vectors, namespace="example-namespace", chunk_size=1000):
    """Upsert ``vectors`` into ``index`` in batches of ``chunk_size``.

    Args:
        index: Pinecone index handle.
        vectors: full list of vectors/tuples to upsert.
        namespace: target namespace.
        chunk_size: vectors per upsert call.
    """
    total = len(vectors)
    for start in range(0, total, chunk_size):
        index.upsert(
            vectors=vectors[start:start + chunk_size],
            namespace=namespace,
        )
""" | |
## Working Example 2 | |
df = pd.read_csv('data/Indicator_Test.csv') | |
dfe = DataLoader.chunk_and_embed(pc, model, df) | |
# Keep only text, embeddings, id | |
dfmin = dfe[['text', 'Embeddings', 'id', 'label']] | |
DataLoader.chunk_df_and_upsert(index, dfmin, namespace="indicator-test-namespace", chunk_size=96) | |
""" | |
def chunk_df_and_upsert(index, df, namespace="new-namespace", chunk_size=1000): | |
vectors = create_vectors_from_df(df) | |
chunk_upload_vectors(index, vectors, namespace, chunk_size) | |
#### QUERYING DATA
"""
namespace = "namespace"
vector = [0.1, 0.2, 0.3, 0.4]
top_k = 3
include_values = True
"""
def query_data(index, namespace, vector, top_k=3, include_values=True):
    """Run a dense similarity query against a namespace.

    Args:
        index: Pinecone index handle.
        namespace: namespace to search.
        vector: query vector — a numpy array OR a plain list of floats.
        top_k: number of matches to return.
        include_values: whether to return the stored vector values.

    Returns:
        The query response from ``index.query``.
    """
    # Bug fix: the usage example above passes a plain list, but the old code
    # unconditionally called vector.tolist(), which only exists on numpy
    # arrays. Accept either form.
    dense = vector.tolist() if hasattr(vector, 'tolist') else list(vector)
    return index.query(
        namespace=namespace,
        vector=dense,
        top_k=top_k,
        include_values=include_values,
    )
""" | |
Example: | |
""" | |
def query_data_with_sparse(index, namespace, vector, sparse_vector, top_k=5, include_values=True, include_metadata=True): | |
out = index.query( | |
namespace=namespace, | |
vector=vector, | |
sparse_vector=sparse_vector, | |
top_k=top_k, | |
include_metadata=include_metadata, | |
include_values=include_values | |
) | |
return out | |
# create sparse vector with zero weighting
def empty_sparse_vector():
    """Return a placeholder sparse vector with a single zero-weight entry.

    Useful when an API call requires a sparse vector but no sparse signal
    should contribute to the score.
    """
    return dict(indices=[1], values=[0.0])
""" | |
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0") | |
index = pc.Index("test-index") | |
namespace = "test-namespace" | |
vector = np.random.rand(1024) | |
top_k = 3 | |
include_values = True | |
filter={ | |
"label": {"$lt": 2} | |
} | |
query_data_with_filter(index, namespace, vector, top_k, include_values, filter) | |
""" | |
def query_data_with_filter(index, namespace, vector, top_k=3, include_values=True, filter=None): | |
out = index.query( | |
namespace=namespace, | |
vector=vector.tolist(), | |
top_k=top_k, | |
include_values=include_values, | |
filter=filter | |
) | |
return out | |
""" | |
pc = Pinecone("5faec954-a6c5-4af5-a577-89dbd2e4e5b0") | |
ids = ["UkfgLgeYW9wo", "GkkzUYYOcooB"] | |
indexname = "ostreacultura-v1" | |
namespace = "cards-data" | |
index = pc.Index(indexname) | |
DL.fetch_data(index, ids, namespace) | |
""" | |
def fetch_data(index, ids, namespace): | |
out = index.fetch(ids=ids, namespace=namespace) | |
return out | |
def get_all_ids_from_namespace(index, namespace):
    """Return the id listing for every vector in ``namespace``.

    Args:
        index: Pinecone index handle.
        namespace: namespace to enumerate.

    Returns:
        Whatever ``index.list`` yields for the namespace (the client may
        return a generator of id pages — consume it accordingly).
    """
    return index.list(namespace=namespace)
""" | |
## Hybrid search weighting - Alpa is equal to the weight of the dense vector | |
dense = [0.1, 0.2, 0.3, 0.4] | |
sparse_vector={ | |
'indices': [10, 45, 16], | |
'values': [0.5, 0.5, 0.2] | |
} | |
dense, sparse = hybrid_score_norm(dense, sparse, alpha=1.0) | |
""" | |
def hybrid_score_norm(dense, sparse, alpha: float): | |
"""Hybrid score using a convex combination | |
alpha * dense + (1 - alpha) * sparse | |
Args: | |
dense: Array of floats representing | |
sparse: a dict of `indices` and `values` | |
alpha: scale between 0 and 1 | |
""" | |
if alpha < 0 or alpha > 1: | |
raise ValueError("Alpha must be between 0 and 1") | |
hs = { | |
'indices': sparse['indices'], | |
'values': [v * (1 - alpha) for v in sparse['values']] | |
} | |
return [v * alpha for v in dense], hs | |
############# |