File size: 1,493 Bytes
b4086c2
 
 
 
 
17dcd41
b4086c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import os
import time

from pinecone import Pinecone, ServerlessSpec

from src.helper import download_hugging_face_embeddings, load_pdf, text_split

# Pinecone configuration comes from the environment. Fail fast when the API
# key is missing instead of silently indexing against a placeholder key.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError("PINECONE_API_KEY environment variable is not set")
PINECONE_API_ENV = os.getenv("PINECONE_API_ENV", "us-east-1")

# Initialize the Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

# Check if the index exists, if not create it
index_name = "bhagavadgita"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding model's output dimension
        # NOTE(review): sentence-transformer embeddings are normally compared
        # with cosine similarity; confirm 'euclidean' is intentional.
        metric='euclidean',
        spec=ServerlessSpec(
            cloud='aws',
            region=PINECONE_API_ENV
        )
    )
    # A newly created serverless index is not immediately ready to accept
    # upserts; poll until Pinecone reports it ready.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# Connect to the index
index = pc.Index(index_name)

# Load PDFs from data/, split into chunks, and build the embedding model
extracted_data = load_pdf("data/")
text_chunks = text_split(extracted_data)
embeddings = download_hugging_face_embeddings()

# Embed every chunk and give each one a stable, positional ID
vectors = embeddings.embed_documents([chunk.page_content for chunk in text_chunks])
ids = [f"doc_{i}" for i in range(len(text_chunks))]

# Upsert in batches. Pinecone caps a single upsert at 1000 vectors AND 2 MB
# of payload; 100 per request is the recommended safe batch size.
batch_size = 100
for start in range(0, len(vectors), batch_size):
    batch_ids = ids[start:start + batch_size]
    batch_vectors = vectors[start:start + batch_size]
    index.upsert(vectors=list(zip(batch_ids, batch_vectors)))

print("Indexing completed.")