# pip install "pinecone[grpc]"
# from pinecone import Pinecone  # non-gRPC client, kept for reference
from pinecone.grpc import PineconeGRPC as Pinecone
import os
import pandas as pd
import numpy as np
from pinecone import ServerlessSpec
from pinecone_text.sparse import BM25Encoder

## ID generation
from sqids import Sqids
sqids = Sqids()

#######
# import protobuf_module_pb2
# NOTE(security): never hard-code API keys; read them from the environment.
# pc = Pinecone(os.environ["PINECONE_API_KEY"])

##### EMBEDDINGS AND ENCODINGS

"""
## Embed in the inference API
df = pd.read_csv('data/Indicator_Test.csv')
pc = Pinecone(os.environ["PINECONE_API_KEY"])
model = "multilingual-e5-large"
embeddings = bulk_embed(pc, model, df[1:96])
"""
def bulk_embed(pc, model, data, textcol='text'):
    """Embed one batch of passages with the Pinecone inference API.

    The inference API caps batches (96 inputs for multilingual-e5-large);
    use chunk_and_embed() for larger frames.

    Args:
        pc: Pinecone client instance.
        model: inference model name, e.g. "multilingual-e5-large".
        data: DataFrame holding the texts to embed.
        textcol: name of the column containing passage text.

    Returns:
        The raw embeddings response object from pc.inference.embed.
    """
    embeddings = pc.inference.embed(
        model,
        inputs=list(data[textcol]),
        parameters={"input_type": "passage"},
    )
    return embeddings


def join_chunked_results(embeddings):
    """Flatten a list of per-chunk embedding responses into one list of vectors."""
    result = []
    for chunk in embeddings:
        for emblist in chunk.data:
            result.append(emblist["values"])
    return result


"""
## Chunk and embed in the inference API
df = pd.read_csv('data/climate_test.csv')
pc = Pinecone(os.environ["PINECONE_API_KEY"])
model = "multilingual-e5-large"
embeddings = chunk_and_embed(pc, model, df)
"""
def chunk_and_embed(pc, model, data, chunk_size=96, textcol='text'):
    """Embed an arbitrarily long DataFrame in API-sized chunks, IN PLACE.

    Adds an 'Embeddings' column (dense vectors) and an 'id' column of
    sqids-encoded row ids to `data`.

    NOTE: this mutates and returns `data`; create_sparse_embeds() relies on
    the in-place behavior, so do not change it to copy.

    Args:
        pc: Pinecone client instance.
        model: inference model name.
        data: DataFrame to embed (mutated in place).
        chunk_size: rows per inference-API call (API batch limit is 96).
        textcol: name of the text column.

    Returns:
        The same DataFrame, with 'Embeddings' and 'id' columns added.
    """
    embeddings = []
    for start in range(0, len(data), chunk_size):
        # Slicing clamps at the end of the frame; no min() needed.
        chunk = data[start:start + chunk_size]
        embeddings.append(bulk_embed(pc, model, chunk, textcol))
    data['Embeddings'] = join_chunked_results(embeddings)
    data['id'] = [sqids.encode([i, i + 1, i + 2]) for i in range(len(data))]
    return data


"""
## Query the embeddings
query = "What is the impact of climate change on the economy?"
embeddings = query_embed(pc, model, query)
"""
def query_embed(pc, model, query):
    """Embed a single query string and return its dense vector (list of floats)."""
    embeddings = pc.inference.embed(
        model,
        inputs=query,
        parameters={"input_type": "query"},
    )
    return embeddings[0]['values']


"""
### Sparse vector encoding example
corpus = ["The quick brown fox jumps over the lazy dog",
          "The lazy dog is brown",
          "The fox is brown"]
# Initialize BM25, fit the corpus, and encode it.
doc_sparse_vector, bm25 = encode_documents(corpus)
"""
def encode_documents(corpus):
    """Fit a BM25 encoder on `corpus` and sparse-encode every document.

    Returns:
        (doc_sparse_vector, bm25): the list of sparse vectors (dicts with
        'indices' and 'values') and the fitted encoder, which is needed
        later to encode queries consistently.
    """
    bm25 = BM25Encoder()
    bm25.fit(corpus)
    doc_sparse_vector = bm25.encode_documents(corpus)
    return doc_sparse_vector, bm25


def encode_query(bm25, query):
    """Sparse-encode a query with a BM25 encoder previously fitted on the corpus."""
    query_sparse_vector = bm25.encode_queries(query)
    return query_sparse_vector


"""
## Generate sparse-dense embeddings for a DataFrame
df = pd.read_csv('data/Indicator_Test.csv')
df = df.head(3)
bm25, newdf = create_sparse_embeds(pc, df)
"""
def create_sparse_embeds(pc, df, textcol='text', idcol='id', model="multilingual-e5-large"):
    """Build a sparse-dense upsert frame from a raw text DataFrame.

    Dense vectors come from the inference API (chunk_and_embed, in place),
    sparse vectors from a BM25 encoder fitted on the frame's own text. All
    columns other than id/vector columns are folded into 'metadata'.

    Returns:
        (bm25, df): the fitted BM25 encoder (keep it to encode queries) and
        a DataFrame with columns [idcol, 'values', 'metadata', 'indices',
        'sparse_values'].
    """
    endocs, bm25 = encode_documents(df[textcol].to_list())
    chunk_and_embed(pc, model, df)  # in-place: adds 'Embeddings' and 'id'
    # Rename the dense column to Pinecone's expected 'values' key.
    df.rename(columns={'Embeddings': 'values'}, inplace=True)
    df['sparse_values'] = [x['values'] for x in endocs]
    df['indices'] = [x['indices'] for x in endocs]
    # Everything that is not an id or a vector column becomes metadata.
    df['metadata'] = df.drop(
        columns=[idcol, 'values', 'indices', 'sparse_values']
    ).to_dict(orient='records')
    df = df[[idcol, 'values', 'metadata', 'indices', 'sparse_values']]
    return bm25, df


"""
## Build upsert-ready sparse-dense dicts
data = {
    'id': ['vec1', 'vec2'],
    'values': [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]],
    'metadata': [{'genre': 'drama', 'text': 'this'}, {'genre': 'action'}],
    'indices': [[10, 45, 16], [12, 34, 56]],
    'sparse_values': [[0.5, 0.5, 0.2], [0.3, 0.4, 0.1]],
}
df = pd.DataFrame(data)
vecs = create_sparse_dense_dict(df)
index.upsert(vecs, namespace="example-namespace")
"""
def create_sparse_dense_dict(df, id_col='id', values_col='values',
                             metadata_col='metadata',
                             sparse_indices_col='indices',
                             sparse_values_col='sparse_values'):
    """Convert a sparse-dense DataFrame into the list-of-dicts upsert format.

    Each row becomes {'id', 'values', 'metadata', 'sparse_values': {'indices',
    'values'}} as expected by Index.upsert for hybrid vectors.
    """
    result = []
    for _, row in df.iterrows():
        vector_dict = {
            'id': row[id_col],
            'values': row[values_col],
            'metadata': row[metadata_col],
            'sparse_values': {
                'indices': row[sparse_indices_col],
                'values': row[sparse_values_col],
            },
        }
        result.append(vector_dict)
    return result


############ UPSERTING DATA

def create_index(pc, name, dimension, metric, cloud, region):
    """Create a serverless Pinecone index with the given dimension and metric."""
    pc.create_index(
        name=name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(
            cloud=cloud,
            region=region,
        ),
    )

# pc.delete_index("example-index")
# index = pc.Index("test-index")

"""
## Create (id, values, metadata) tuples from a DataFrame
data = {
    'Embeddings': [[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]],
    'id': ['vec1', 'vec2'],
    'genre': ['drama', 'action'],
}
df = pd.DataFrame(data)
vecs = create_vectors_from_df(df)
index.upsert(vectors=vecs, namespace="example-namespace")
"""
def create_vectors_from_df(df):
    """Build (id, dense_values, metadata) upsert tuples from a DataFrame.

    All columns except 'id' and 'Embeddings' are packed into the metadata dict.
    """
    vectors = []
    for _, row in df.iterrows():
        vectors.append((
            row['id'],
            row['Embeddings'],
            row.drop(['Embeddings', 'id']).to_dict(),
        ))
    return vectors


def chunk_upload_vectors(index, vectors, namespace="example-namespace", chunk_size=1000):
    """Upsert `vectors` in chunks to stay under Pinecone's per-request limits."""
    for i in range(0, len(vectors), chunk_size):
        chunk = vectors[i:i + chunk_size]  # slicing clamps at the end
        index.upsert(
            vectors=chunk,
            namespace=namespace,
        )


"""
## Working Example 2
df = pd.read_csv('data/Indicator_Test.csv')
dfe = DataLoader.chunk_and_embed(pc, model, df)
# Keep only text, embeddings, id
dfmin = dfe[['text', 'Embeddings', 'id', 'label']]
DataLoader.chunk_df_and_upsert(index, dfmin,
                               namespace="indicator-test-namespace",
                               chunk_size=96)
"""
def chunk_df_and_upsert(index, df, namespace="new-namespace", chunk_size=1000):
    """Convert a DataFrame to upsert tuples and upload them in chunks."""
    vectors = create_vectors_from_df(df)
    chunk_upload_vectors(index, vectors, namespace, chunk_size)


#### QUERYING DATA

"""
namespace = "namespace"
vector = [0.1, 0.2, 0.3, 0.4]
top_k = 3
include_values = True
"""
def query_data(index, namespace, vector, top_k=3, include_values=True):
    """Dense query against a namespace.

    Args:
        index: Pinecone Index handle.
        namespace: namespace to search.
        vector: dense query vector — numpy array or plain list.
        top_k: number of matches to return.
        include_values: return the matched vectors' values too.
    """
    # Generalized: the original unconditionally called vector.tolist(),
    # which crashed on plain Python lists.
    if hasattr(vector, 'tolist'):
        vector = vector.tolist()
    out = index.query(
        namespace=namespace,
        vector=vector,
        top_k=top_k,
        include_values=include_values,
    )
    return out


def query_data_with_sparse(index, namespace, vector, sparse_vector, top_k=5,
                           include_values=True, include_metadata=True):
    """Hybrid (dense + sparse) query against a namespace."""
    out = index.query(
        namespace=namespace,
        vector=vector,
        sparse_vector=sparse_vector,
        top_k=top_k,
        include_metadata=include_metadata,
        include_values=include_values,
    )
    return out


def empty_sparse_vector():
    """Sparse vector with zero weight — use to run a dense-only hybrid query."""
    return {
        'indices': [1],
        'values': [0.0],
    }


"""
pc = Pinecone(os.environ["PINECONE_API_KEY"])
index = pc.Index("test-index")
namespace = "test-namespace"
vector = np.random.rand(1024)
top_k = 3
include_values = True
filter = {"label": {"$lt": 2}}
query_data_with_filter(index, namespace, vector, top_k, include_values, filter)
"""
def query_data_with_filter(index, namespace, vector, top_k=3,
                           include_values=True, filter=None):
    """Dense query with an optional metadata filter.

    `vector` may be a numpy array or a plain list (see query_data).
    """
    if hasattr(vector, 'tolist'):
        vector = vector.tolist()
    out = index.query(
        namespace=namespace,
        vector=vector,
        top_k=top_k,
        include_values=include_values,
        filter=filter,
    )
    return out


"""
pc = Pinecone(os.environ["PINECONE_API_KEY"])
ids = ["UkfgLgeYW9wo", "GkkzUYYOcooB"]
indexname = "ostreacultura-v1"
namespace = "cards-data"
index = pc.Index(indexname)
DL.fetch_data(index, ids, namespace)
"""
def fetch_data(index, ids, namespace):
    """Fetch vectors by id from a namespace."""
    out = index.fetch(ids=ids, namespace=namespace)
    return out


def get_all_ids_from_namespace(index, namespace):
    """Return the (paginated) id listing generator for a namespace."""
    ids = index.list(namespace=namespace)
    return ids


"""
## Hybrid search weighting — alpha is the weight of the dense vector
dense = [0.1, 0.2, 0.3, 0.4]
sparse = {'indices': [10, 45, 16], 'values': [0.5, 0.5, 0.2]}
dense, sparse = hybrid_score_norm(dense, sparse, alpha=1.0)
"""
def hybrid_score_norm(dense, sparse, alpha: float):
    """Hybrid score using a convex combination alpha * dense + (1 - alpha) * sparse.

    Args:
        dense: list of floats, the dense query vector.
        sparse: dict with 'indices' and 'values' keys (sparse vector).
        alpha: weight of the dense vector, between 0 and 1 (1.0 = dense only).

    Returns:
        (scaled_dense, scaled_sparse): the dense list scaled by alpha and the
        sparse dict with values scaled by (1 - alpha).

    Raises:
        ValueError: if alpha is outside [0, 1].
    """
    if alpha < 0 or alpha > 1:
        raise ValueError("Alpha must be between 0 and 1")
    hs = {
        'indices': sparse['indices'],
        'values': [v * (1 - alpha) for v in sparse['values']],
    }
    return [v * alpha for v in dense], hs

#############