# store_index.py — one-off script: embed PDF chunks and upsert them into Pinecone.
"""Build a Pinecone serverless index from PDF documents.

Loads PDFs from ``data/``, splits them into text chunks, embeds each
chunk with a HuggingFace sentence-transformer (384-dim), and upserts
the vectors into the ``bhagavadgita`` Pinecone index in batches.
"""
import os

from pinecone import Pinecone, ServerlessSpec

from src.helper import load_pdf, text_split, download_hugging_face_embeddings

# Credentials come from the environment. Fail fast on a missing key
# instead of falling back to a placeholder that only errors out later
# with a confusing authentication failure at the first API call.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError("PINECONE_API_KEY environment variable is not set")
PINECONE_API_ENV = os.getenv("PINECONE_API_ENV", "us-east-1")

# Initialize the Pinecone client.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index on first run; dimension 384 matches the embedding
# model returned by download_hugging_face_embeddings().
index_name = "bhagavadgita"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        # NOTE(review): cosine is the usual metric for sentence
        # embeddings — confirm 'euclidean' is intentional.
        metric="euclidean",
        spec=ServerlessSpec(cloud="aws", region=PINECONE_API_ENV),
    )

# Connect to the (possibly just-created) index.
index = pc.Index(index_name)

# Load PDFs, chunk them, and embed every chunk.
extracted_data = load_pdf("data/")
text_chunks = text_split(extracted_data)
embeddings = download_hugging_face_embeddings()
vectors = embeddings.embed_documents([t.page_content for t in text_chunks])
ids = [f"doc_{i}" for i in range(len(text_chunks))]

# Upsert in batches to stay under Pinecone's per-request payload limit.
batch_size = 1000
for start in range(0, len(vectors), batch_size):
    batch = list(zip(ids[start:start + batch_size],
                     vectors[start:start + batch_size]))
    index.upsert(vectors=batch)

print("Indexing completed.")