# store_index.py — one-off script: embed PDF chunks and upsert them into Pinecone.
"""Build a Pinecone serverless index from PDF documents.

Loads PDFs from ``data/``, splits them into text chunks, embeds each
chunk with a HuggingFace sentence-transformer (384-dim), and upserts
the vectors into the ``bhagavadgita`` Pinecone index in batches.
"""
import os

from pinecone import Pinecone, ServerlessSpec

from src.helper import load_pdf, text_split, download_hugging_face_embeddings

# Credentials come from the environment. Fail fast on a missing key
# instead of falling back to a placeholder that only errors out later
# with a confusing authentication failure at the first API call.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError("PINECONE_API_KEY environment variable is not set")
PINECONE_API_ENV = os.getenv("PINECONE_API_ENV", "us-east-1")

# Initialize the Pinecone client.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index on first run; dimension 384 matches the embedding
# model returned by download_hugging_face_embeddings().
index_name = "bhagavadgita"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        # NOTE(review): cosine is the usual metric for sentence
        # embeddings — confirm 'euclidean' is intentional.
        metric="euclidean",
        spec=ServerlessSpec(cloud="aws", region=PINECONE_API_ENV),
    )

# Connect to the (possibly just-created) index.
index = pc.Index(index_name)

# Load PDFs, chunk them, and embed every chunk.
extracted_data = load_pdf("data/")
text_chunks = text_split(extracted_data)
embeddings = download_hugging_face_embeddings()
vectors = embeddings.embed_documents([t.page_content for t in text_chunks])
ids = [f"doc_{i}" for i in range(len(text_chunks))]

# Upsert in batches to stay under Pinecone's per-request payload limit.
batch_size = 1000
for start in range(0, len(vectors), batch_size):
    batch = list(zip(ids[start:start + batch_size],
                     vectors[start:start + batch_size]))
    index.upsert(vectors=batch)

print("Indexing completed.")