Spaces:
Sleeping
Sleeping
File size: 1,493 Bytes
b4086c2 17dcd41 b4086c2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
from src.helper import load_pdf, text_split, download_hugging_face_embeddings
import os
from pinecone import Pinecone, ServerlessSpec
# Pinecone credentials and region are read from the environment.
# The API key deliberately has no fallback value: a placeholder default is
# not a real credential, so a missing key is reported here with a clear
# message instead of being passed on to the Pinecone client.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running this script."
    )
# Despite its name, this value is used as the serverless *region* below.
PINECONE_API_ENV = os.getenv("PINECONE_API_ENV", "us-east-1")

# Initialize the Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when it does not exist yet, so the script can be
# re-run against an already provisioned index.
index_name = "bhagavadgita"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Must equal the length of the vectors produced below
        metric='euclidean',
        spec=ServerlessSpec(
            cloud='aws',
            region=PINECONE_API_ENV,
        ),
    )

# Connect to the index
index = pc.Index(index_name)

# Load the documents under data/ and split them into text chunks
extracted_data = load_pdf("data/")
text_chunks = text_split(extracted_data)
embeddings = download_hugging_face_embeddings()

# Embed the text of every chunk; ids are positional ("doc_0", "doc_1", ...),
# so re-running the script overwrites the records at the same positions.
vectors = embeddings.embed_documents([t.page_content for t in text_chunks])
ids = [f"doc_{i}" for i in range(len(text_chunks))]

# NOTE(review): only (id, vector) pairs are stored; the chunk text is not
# attached as metadata. Confirm that whatever queries this index can map
# "doc_<i>" ids back to their text.
records = list(zip(ids, vectors))

# Upsert in fixed-size batches to keep each request bounded
batch_size = 1000  # Adjust batch size as needed
for start in range(0, len(records), batch_size):
    index.upsert(vectors=records[start:start + batch_size])

print("Indexing completed.")
|