"""Create OpenAI embeddings for e-bike descriptions and upload them to Pinecone.

Reads product records from ``data.json``, embeds each description with
``text-embedding-3-small``, and upserts the vectors (with product metadata)
into a serverless Pinecone index named ``ebikes-search``.
"""

import json
import os

import numpy as np
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

# Load environment variables from a local .env file, if present.
load_dotenv()

# Both API keys are required; fail fast with a clear message if either is missing.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY environment variable not set")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable not set")

# Shared OpenAI client used by the embedder below.
openai_client = OpenAI(api_key=OPENAI_API_KEY)


class OpenAIEmbedder:
    """Thin wrapper around the OpenAI embeddings API.

    Mimics the ``encode`` interface of sentence-transformers-style encoders:
    accepts a single string or a list of strings and returns a 2-D numpy
    array of embeddings.
    """

    def __init__(self, model_name="text-embedding-3-small"):
        self.model_name = model_name
        self.client = openai_client
        # Output dimension of text-embedding-3-small (must match the index).
        self.embedding_dimension = 1536

    def encode(self, texts):
        """Return embeddings for *texts* as a numpy array of shape (n, dim).

        Args:
            texts: A single string (treated as a one-element batch) or a
                list of strings.

        Returns:
            ``np.ndarray`` of shape ``(len(texts), embedding_dimension)``.
        """
        if isinstance(texts, str):
            texts = [texts]
        if not texts:
            # The OpenAI API rejects an empty input list; short-circuit
            # instead of making a doomed API call.
            return np.empty((0, self.embedding_dimension))
        response = self.client.embeddings.create(
            input=texts,
            model=self.model_name,
        )
        embeddings = [item.embedding for item in response.data]
        return np.array(embeddings)


def initialize_pinecone():
    """Connect to Pinecone, creating the ``ebikes-search`` index if needed.

    Returns:
        A Pinecone ``Index`` handle for the ``ebikes-search`` index.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)
    index_name = "ebikes-search"

    # Only create the index on first run; creation is idempotent-guarded.
    existing_indexes = pc.list_indexes().names()
    if index_name not in existing_indexes:
        # Dimension must match the embedding model (text-embedding-3-small).
        pc.create_index(
            name=index_name,
            dimension=1536,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-west-2"),
        )
        print(f"Created new index: {index_name}")

    return pc.Index(index_name)


def load_ebikes_data(file_path="data.json"):
    """Load the product records from *file_path*.

    Args:
        file_path: Path to the JSON file containing a top-level
            ``pogo-cycles-data`` list.

    Returns:
        The list of product dicts, or ``[]`` if the key is absent.
    """
    # Explicit encoding avoids platform-dependent default-codec surprises.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data.get('pogo-cycles-data', [])


def create_and_upload_embeddings(ebikes_data, encoder, pinecone_index):
    """Embed each product description and upsert the vectors into Pinecone.

    Args:
        ebikes_data: List of product dicts with ``id``, ``name``, ``type``,
            ``category``, and ``description`` keys.
        encoder: Object with an ``encode(list[str]) -> np.ndarray`` method.
        pinecone_index: Pinecone ``Index`` to upsert into.
    """
    if not ebikes_data:
        print("No records to upload")
        return

    ids = []
    descriptions = []
    metadata = []
    for bike in ebikes_data:
        ids.append(bike['id'])
        descriptions.append(bike['description'])
        metadata.append({
            "id": bike["id"],
            "name": bike["name"],
            "product_type": bike["type"],  # or "escooter"
            "category": bike["category"],  # mountain / folding / cargo ...
            "description": bike["description"],
        })

    embeddings = encoder.encode(descriptions)

    vectors_to_upsert = [
        {
            'id': vec_id,
            'values': embedding.tolist(),
            'metadata': meta,
        }
        for vec_id, embedding, meta in zip(ids, embeddings, metadata)
    ]

    # Upsert in batches of 100 — Pinecone's recommended batch size — to
    # stay under the per-request payload limit on larger catalogs.
    batch_size = 100
    for start in range(0, len(vectors_to_upsert), batch_size):
        pinecone_index.upsert(vectors=vectors_to_upsert[start:start + batch_size])

    print(f"Uploaded {len(vectors_to_upsert)} embeddings to Pinecone")


def main():
    """Run the full pipeline: embed the catalog and upload it to Pinecone."""
    encoder = OpenAIEmbedder()
    pinecone_index = initialize_pinecone()
    ebikes_data = load_ebikes_data()
    create_and_upload_embeddings(ebikes_data, encoder, pinecone_index)
    print("Embedding creation and upload completed successfully!")


if __name__ == "__main__":
    main()