Chat_with_Krishna / store_index.py
NandanData's picture
Update store_index.py
17dcd41 verified
from src.helper import load_pdf, text_split, download_hugging_face_embeddings
import os
from pinecone import Pinecone, ServerlessSpec
# Build (if needed) and populate the "bhagavadgita" Pinecone index from the
# PDFs under data/.
#
# Configuration comes from the environment:
#   PINECONE_API_KEY  - required; never hard-code the key in this file.
#   PINECONE_API_ENV  - optional serverless region, defaults to "us-east-1".
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    # Fail fast with an actionable message instead of sending a placeholder
    # key and surfacing an opaque authentication error from the first call.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running store_index.py."
    )
PINECONE_API_ENV = os.getenv("PINECONE_API_ENV", "us-east-1")

# Initialize the Pinecone client.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when it does not already exist.
index_name = "bhagavadgita"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must match the embedding vector length -- confirm against the model
        # returned by download_hugging_face_embeddings().
        dimension=384,
        metric='euclidean',
        spec=ServerlessSpec(
            cloud='aws',
            region=PINECONE_API_ENV,
        ),
    )

# Connect to the index.
index = pc.Index(index_name)

# Load the PDFs, split them into chunks, and embed every chunk.
extracted_data = load_pdf("data/")
text_chunks = text_split(extracted_data)
embeddings = download_hugging_face_embeddings()
vectors = embeddings.embed_documents([t.page_content for t in text_chunks])

# IDs are positional ("doc_0", "doc_1", ...), so re-running the script
# reuses the same IDs for the same chunk positions.
ids = [f"doc_{i}" for i in range(len(text_chunks))]

# Upsert in slices of at most batch_size so that no single request carries
# the whole corpus.
batch_size = 1000  # Adjust batch size as needed
for i in range(0, len(vectors), batch_size):
    batch_ids = ids[i:i + batch_size]
    batch_vectors = vectors[i:i + batch_size]
    # Each record is an (id, values) tuple; no metadata is attached.
    index.upsert(vectors=list(zip(batch_ids, batch_vectors)))

print("Indexing completed.")