File size: 3,165 Bytes
0930d33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import sys
src_directory = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..", "src"))
sys.path.append(src_directory)
from pinecone import Pinecone, ServerlessSpec
import time
from model.clip_model import ClipModel
from data import request_images
from data import data_set
from config import config
from utils import logger

# Rebind the imported `config` module name to the value returned by
# load_config(); after this line `config` no longer refers to the module.
config = config.load_config()
# Likewise, `logger` now names the object returned by get_logger(),
# shadowing the imported `logger` module.
logger = logger.get_logger()

# Module-level ClipModel instance, created at import time and used by
# add_data_to_database() to compute image embeddings.
clip_model = ClipModel()

def create_index(pinecone, index_name, dimension=512, metric="cosine",
                 cloud="aws", region="us-east-1"):
    """Request creation of a serverless Pinecone index.

    The defaults reproduce the previously hard-coded settings, so existing
    two-argument calls behave exactly as before.

    Args:
        pinecone: Client object exposing ``create_index``.
        index_name: Name of the index to create.
        dimension: Vector dimension of the index. Defaults to 512
            (presumably the CLIP embedding size - confirm against ClipModel).
        metric: Similarity metric passed to the index. Defaults to "cosine".
        cloud: Cloud provider passed to ``ServerlessSpec``. Defaults to "aws".
        region: Region passed to ``ServerlessSpec``. Defaults to "us-east-1".
    """
    pinecone.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(cloud=cloud, region=region),
    )

def wait_till_index_loaded(pinecone, index_name, timeout=None):
    """Poll an index once per second until it reports ready, then return it.

    Args:
        pinecone: Client object exposing ``describe_index`` and ``Index``.
        index_name: Name of the index to wait for.
        timeout: Maximum number of seconds to wait. ``None`` (the default)
            waits indefinitely, which is the original behaviour.

    Returns:
        The object returned by ``pinecone.Index(index_name)``.

    Raises:
        TimeoutError: If ``timeout`` is given and the index is still not
            ready once that many seconds have elapsed.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        description = pinecone.describe_index(index_name)
        if description.status.get("ready", False):
            handle = pinecone.Index(index_name)
            logger.info(f"Index '{index_name}' is ready and is now accessible.")
            return handle
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Index '{index_name}' was not ready within {timeout} seconds."
            )
        logger.debug(f"Index '{index_name}' is not ready yet. Checking again in 1 second.")
        time.sleep(1)

def get_index():
    """Return a handle to the "imagesearch" Pinecone index, creating it if needed.

    The API key is read from ``config['pinecone_db']['pincone_api_key']``.
    If the index does not exist it is created and this function blocks
    until the index reports ready.

    Returns:
        The index handle, or ``None`` if any step failed (the error is
        logged with its traceback rather than raised).
    """
    # Bound before the try block so the final return can never hit an
    # unbound local, even when the config lookup or client creation fails.
    index = None
    index_name = "imagesearch"
    try:
        pincone_api_key = config['pinecone_db']['pincone_api_key']
        pc = Pinecone(api_key=pincone_api_key)
        logger.info(f"Checking if the index '{index_name}' exists...")
        if not pc.has_index(index_name):
            logger.info(f"Index '{index_name}' does not exist. Creating a new index...")
            create_index(pc, index_name)
            logger.info(f"Index '{index_name}' creation initiated. Waiting for it to be ready...")
            # Argument order is (client, name); it was previously swapped.
            index = wait_till_index_loaded(pc, index_name)
        else:
            index = pc.Index(index_name)
            logger.info(f"Index '{index_name}' already exists. Returning the existing index.")
    except Exception as e:
        # Deliberate best-effort boundary: callers receive None instead of
        # an exception, so record the failure at error level.
        logger.error(f"Error occurred while getting or creating the Pinecone index: {str(e)}", exc_info=True)
    return index
    
def upsert_data(index, embeddings, id, url):
    """Upsert one vector into the "image-search-dataset" namespace.

    Args:
        index: Object exposing ``upsert(vectors=..., namespace=...)``.
        embeddings: Values stored as the vector's "values" field.
        id: Identifier used as the vector id and as the "photo_id" metadata.
        url: Value stored under the "url" metadata key.

    Raises:
        Exception: Any error raised during the upsert is logged and re-raised.
    """
    record = {
        "id": id,
        "values": embeddings,
        "metadata": {"url": url, "photo_id": id},
    }
    try:
        logger.info("Started to upsert the data")
        index.upsert(vectors=[record], namespace="image-search-dataset")
        logger.info("Successfully upserted the data in database")
    except Exception as failure:
        logger.info(f"Unable to upsert the data {failure}")
        raise
    
def add_data_to_database(df):
    """Embed every image listed in ``df`` and upsert it into the index.

    For each row the image URL is passed to ``clip_model.get_image_embedding``
    and the result is stored via ``upsert_data`` under the row's photo id.

    Args:
        df: Object whose ``iterrows()`` yields rows with 'photo_image_url'
            and 'photo_id' entries (presumably a pandas DataFrame - confirm
            against the caller).

    Returns:
        None. This is best-effort: failures are logged, not raised, and
        processing stops at the first row that fails.
    """
    try:
        index = get_index()
        if index is None:
            # get_index() signals failure with None; stop before computing
            # embeddings that could never be stored.
            logger.error("Unable to add the data. Error : Pinecone index is unavailable")
            return
        logger.info("Starting to add the embeddings to the database")
        for _, row in df.iterrows():
            image_url = row['photo_image_url']
            photo_id = row['photo_id']
            embedding = clip_model.get_image_embedding(image_url)
            upsert_data(index, embedding, photo_id, image_url)
        logger.info("Added embeddings to the database successfully")
    except Exception as e:
        # The original message lacked the f-prefix, so the literal text
        # "{e}" was logged instead of the actual error.
        logger.error(f"Unable to add the data. Error : {e}", exc_info=True)


# df = data_set.get_df(8000,8500)     
# add_data_to_database(df)