# uttertuple / embed_data.py
# Omkar192002's picture
# Update embed_data.py
# 707fd94 verified
import json
import os
from pinecone import Pinecone, ServerlessSpec
import numpy as np
from openai import OpenAI
# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Read both service credentials from the process environment (load_dotenv()
# has already run at this point).
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Fail fast at import time: a missing or empty key would otherwise only
# surface later as an error from the remote service.
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY environment variable not set")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable not set")

# Module-wide OpenAI client, shared by the embedder defined below.
openai_client = OpenAI(api_key=OPENAI_API_KEY)
# Define the embedding model using OpenAI
class OpenAIEmbedder:
    """Thin wrapper that turns text into vectors via the OpenAI embeddings API.

    Attributes:
        model_name: Name of the OpenAI embedding model to query.
        client: The module-level ``openai_client`` used to send requests.
        embedding_dimension: Length of each vector produced by ``model_name``
            (1536 when the model is not in the known table).
    """

    # Default output sizes of the OpenAI embedding models.
    _MODEL_DIMENSIONS = {
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
    }
    # Maximum number of inputs sent in a single embeddings request; longer
    # lists are split into several requests.
    _MAX_INPUTS_PER_REQUEST = 2048

    def __init__(self, model_name="text-embedding-3-small"):
        self.model_name = model_name
        self.client = openai_client
        # Look the size up instead of assuming every model yields 1536 values.
        self.embedding_dimension = self._MODEL_DIMENSIONS.get(model_name, 1536)

    def encode(self, texts):
        """Embed one string or an iterable of strings.

        Args:
            texts: A single string, or an iterable of strings.

        Returns:
            A 2-D ``numpy`` array with one row per input text, in input
            order. An empty input yields an array of shape
            ``(0, embedding_dimension)`` without contacting the API.
        """
        if isinstance(texts, str):
            texts = [texts]
        else:
            # Materialize so generators/tuples can be sliced into batches.
            texts = list(texts)

        # Nothing to embed: skip the request entirely.
        if not texts:
            return np.empty((0, self.embedding_dimension))

        embeddings = []
        step = self._MAX_INPUTS_PER_REQUEST
        for start in range(0, len(texts), step):
            response = self.client.embeddings.create(
                input=texts[start:start + step],
                model=self.model_name,
            )
            embeddings.extend(item.embedding for item in response.data)
        return np.array(embeddings)
# Initialize Pinecone client
def initialize_pinecone(index_name="ebikes-search", dimension=1536):
    """Return a handle to the Pinecone index, creating the index if needed.

    Args:
        index_name: Name of the index to connect to (and create when absent).
        dimension: Vector size used when the index has to be created; the
            default of 1536 matches text-embedding-3-small. Ignored when the
            index already exists.

    Returns:
        The index object obtained from ``Pinecone.Index(index_name)``.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Only create the index when it is not already present in the project.
    existing_indexes = pc.list_indexes().names()
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-west-2"),
        )
        print(f"Created new index: {index_name}")

    # Connect to the index
    return pc.Index(index_name)
# Load the e-bikes data
def load_ebikes_data(file_path="data.json"):
    """Read the product records stored under ``'pogo-cycles-data'``.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        The value stored under the ``'pogo-cycles-data'`` key, or an empty
        list when that key is absent.
    """
    # JSON is UTF-8 by specification; without an explicit encoding the
    # platform default is used, which can fail on non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data.get('pogo-cycles-data', [])
# Create embeddings and upload to Pinecone
def create_and_upload_embeddings(ebikes_data, encoder, pinecone_index, batch_size=100):
    """Embed each record's description and upsert the vectors to Pinecone.

    Args:
        ebikes_data: Sequence of dicts, each with the keys ``id``, ``name``,
            ``type``, ``category`` and ``description``.
        encoder: Object whose ``encode(list_of_str)`` returns one vector per
            text; each vector must support ``.tolist()``.
        pinecone_index: Object exposing ``upsert(vectors=...)``.
        batch_size: Maximum number of vectors sent per upsert request.

    Raises:
        KeyError: If a record lacks one of the required keys (raised before
            any embedding request is made).
        ValueError: If ``batch_size`` is below 1, or the encoder returns a
            different number of vectors than there are records.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Nothing to do: avoid an embedding request and an upsert with no vectors.
    if not ebikes_data:
        print("No e-bike records to embed; nothing uploaded to Pinecone")
        return

    # Build ids/metadata first so a malformed record fails before any
    # (billable) embedding request is sent.
    records = [
        (
            str(bike['id']),  # Pinecone record ids are strings
            {
                "id": bike["id"],
                "name": bike["name"],
                "product_type": bike["type"],  # or "escooter"
                "category": bike["category"],  # mountain / folding / cargo ...
                "description": bike["description"],
            },
        )
        for bike in ebikes_data
    ]

    # Create embeddings
    embeddings = encoder.encode([meta["description"] for _, meta in records])
    if len(embeddings) != len(records):
        raise ValueError(
            f"Encoder returned {len(embeddings)} embeddings for {len(records)} records"
        )

    # Prepare vectors for Pinecone
    vectors_to_upsert = [
        {'id': record_id, 'values': embedding.tolist(), 'metadata': meta}
        for (record_id, meta), embedding in zip(records, embeddings)
    ]

    # Upsert in bounded batches so a large catalogue is not sent as one
    # oversized request.
    for start in range(0, len(vectors_to_upsert), batch_size):
        pinecone_index.upsert(vectors=vectors_to_upsert[start:start + batch_size])
    print(f"Uploaded {len(vectors_to_upsert)} embeddings to Pinecone")
# Main function to run the embedding creation process
def main():
    """Run the pipeline end to end: embedder, index, data load, upload."""
    embedder = OpenAIEmbedder()
    index = initialize_pinecone()
    bikes = load_ebikes_data()

    # Embed every loaded record and push the vectors to the index.
    create_and_upload_embeddings(bikes, embedder, index)
    print("Embedding creation and upload completed successfully!")


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()