File size: 3,760 Bytes
e6bfb30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
707fd94
e6bfb30
 
e82a347
e6bfb30
 
 
 
 
 
 
 
 
 
 
 
e82a347
 
 
 
 
e6bfb30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import json
import os
from pinecone import Pinecone, ServerlessSpec
import numpy as np
from openai import OpenAI

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Read both service credentials from the process environment
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Fail fast at import time when either credential is missing or empty
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY environment variable not set")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable not set")

# Module-wide OpenAI client, reused by OpenAIEmbedder
openai_client = OpenAI(api_key=OPENAI_API_KEY)

# Define the embedding model using OpenAI
class OpenAIEmbedder:
    """Thin wrapper that turns text into embedding vectors via the OpenAI API.

    Attributes:
        model_name: Name of the OpenAI embedding model sent with each request.
        client: The module-level ``openai_client`` used to issue requests.
        embedding_dimension: Expected length of each embedding vector for
            ``model_name`` (falls back to 1536 for models not listed below).
    """

    # Default output sizes of the OpenAI embedding models (per OpenAI docs).
    _KNOWN_DIMENSIONS = {
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
    }
    _DEFAULT_DIMENSION = 1536

    # The embeddings endpoint rejects input arrays longer than this, so
    # larger workloads are split across several requests.
    _MAX_INPUTS_PER_REQUEST = 2048

    def __init__(self, model_name="text-embedding-3-small"):
        """Bind the embedder to ``model_name`` and the shared OpenAI client."""
        self.model_name = model_name
        self.client = openai_client
        self.embedding_dimension = self._KNOWN_DIMENSIONS.get(
            model_name, self._DEFAULT_DIMENSION
        )

    def encode(self, texts):
        """Return the embeddings of ``texts`` as a 2-D NumPy array.

        Args:
            texts: A single string or an iterable of strings.

        Returns:
            ``numpy.ndarray`` with one row per input text, in input order.
            An empty input yields an array of shape
            ``(0, self.embedding_dimension)`` without contacting the API.
        """
        if isinstance(texts, str):
            texts = [texts]
        else:
            # Materialise generators/tuples so they can be measured and sliced.
            texts = list(texts)

        # The API refuses an empty input array; answer locally instead.
        if not texts:
            return np.empty((0, self.embedding_dimension))

        limit = self._MAX_INPUTS_PER_REQUEST
        embeddings = []
        for start in range(0, len(texts), limit):
            response = self.client.embeddings.create(
                input=texts[start:start + limit],
                model=self.model_name,
            )
            embeddings.extend(item.embedding for item in response.data)

        return np.array(embeddings)

# Initialize Pinecone client
def initialize_pinecone():
    """Return a handle to the "ebikes-search" Pinecone index.

    When no index with that name is listed for the account, a cosine-metric,
    1536-dimension serverless index (AWS, us-west-2) is created first and a
    confirmation line is printed.
    """
    target = "ebikes-search"
    client = Pinecone(api_key=PINECONE_API_KEY)

    # Only create the index when it is not already listed for this account
    if target not in client.list_indexes().names():
        client.create_index(
            name=target,
            dimension=1536,  # vector size used for text-embedding-3-small
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-west-2"),
        )
        print(f"Created new index: {target}")

    return client.Index(target)

# Load the e-bikes data
def load_ebikes_data(file_path="data.json"):
    """Load the product records stored under "pogo-cycles-data" in a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The value of the top-level "pogo-cycles-data" key, or an empty list
        when that key is absent.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: the platform default encoding would garble
    # or reject non-ASCII product text on systems with a non-UTF-8 locale.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data.get('pogo-cycles-data', [])

# Create embeddings and upload to Pinecone
def create_and_upload_embeddings(ebikes_data, encoder, pinecone_index, batch_size=100):
    """Embed every record's description and upsert the vectors into Pinecone.

    Args:
        ebikes_data: Iterable of dicts, each providing the keys "id", "name",
            "type", "category" and "description".
        encoder: Object whose ``encode(list_of_str)`` returns one embedding
            per input, each supporting ``.tolist()``.
        pinecone_index: Object exposing ``upsert(vectors=...)``.
        batch_size: Maximum number of vectors sent per upsert call, so a
            large dataset is not pushed as a single oversized request.

    Raises:
        ValueError: If ``batch_size`` is less than 1, or the encoder returns
            a different number of embeddings than there are records.
        KeyError: If a record lacks one of the required keys.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    records = list(ebikes_data)
    if not records:
        # Nothing to embed; skip the encoder and upsert calls entirely.
        print("No records to embed; nothing uploaded to Pinecone")
        return

    # Build ids and metadata before encoding so a malformed record fails
    # before any embedding request is made.
    # Vector ids are coerced to str; the metadata keeps the original value.
    ids = [str(bike["id"]) for bike in records]
    metadata = [
        {
            "id": bike["id"],
            "name": bike["name"],
            "product_type": bike["type"],       # or "escooter"
            "category": bike["category"],        # mountain / folding / cargo ...
            "description": bike["description"],
        }
        for bike in records
    ]

    embeddings = encoder.encode([meta["description"] for meta in metadata])
    if len(embeddings) != len(records):
        raise ValueError(
            f"Encoder returned {len(embeddings)} embeddings for {len(records)} records"
        )

    vectors_to_upsert = [
        {"id": vector_id, "values": embedding.tolist(), "metadata": meta}
        for vector_id, embedding, meta in zip(ids, embeddings, metadata)
    ]

    # Upload in bounded batches rather than one request for everything.
    for start in range(0, len(vectors_to_upsert), batch_size):
        pinecone_index.upsert(vectors=vectors_to_upsert[start:start + batch_size])
    print(f"Uploaded {len(vectors_to_upsert)} embeddings to Pinecone")

# Main function to run the embedding creation process
def main():
    """Run the full pipeline: build the embedder, open the index, load, upload."""
    embedder = OpenAIEmbedder()
    index = initialize_pinecone()

    # Records are read from the default data file, then embedded and pushed
    create_and_upload_embeddings(load_ebikes_data(), embedder, index)

    print("Embedding creation and upload completed successfully!")


# Run the pipeline only when executed as a script, not on import
if __name__ == "__main__":
    main()