File size: 2,214 Bytes
d82600f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import chromadb

# Shared Chroma client for this module, created once at import time.
# NOTE(review): presumably the default in-memory (non-persistent) client —
# confirm against the installed chromadb version.
chroma_client = chromadb.Client()
# Cache slot for the embedding model; stays None until get_model() loads it.
model = None

def get_model():
    """Return the shared SentenceTransformer, loading it on first use.

    The loaded instance is cached in the module-level ``model`` variable so
    later calls return it directly. If the import or the model construction
    fails, the error is printed, the cache is left as None, and None is
    returned; the next call will attempt the load again.
    """
    global model
    # Fast path: a previous call already loaded the model.
    if model is not None:
        return model

    try:
        # Imported lazily so that importing this module does not require
        # sentence_transformers to be importable.
        from sentence_transformers import SentenceTransformer as encoder_cls
        loaded = encoder_cls('all-MiniLM-L6-v2')
    except Exception as err:
        print(f"[ERROR] Could not load SentenceTransformer: {err}")
        loaded = None

    model = loaded
    return loaded

# Name of the single Chroma collection this module reads from and writes to.
COLLECTION_NAME = "documents"

# Fetch the collection, creating it if it does not exist yet.
# get_or_create_collection performs the existence check and the creation in
# one call, so the collection list is not fetched twice and this code does
# not depend on the element type returned by list_collections().
collection = chroma_client.get_or_create_collection(COLLECTION_NAME)

def add_to_vector_store(chunks, metadatas=None):
    """Embed text chunks and add them to the module's Chroma collection.

    This is best-effort: every failure is printed and swallowed so that a
    vector-store problem does not abort the caller's upload.

    Args:
        chunks: List of text strings to embed and store. A None or empty
            value only prints a warning.
        metadatas: Optional per-chunk metadata, passed through unchanged to
            ``collection.add``.

    Returns:
        None.
    """
    import uuid  # local import, same pattern get_model() uses

    try:
        if not chunks:
            print("[WARNING] No chunks provided to vector store")
            return
        model_instance = get_model()
        if model_instance is None:
            print("[ERROR] Embedding model not available.")
            return
        embeddings = model_instance.encode(chunks).tolist()
        # IDs must be unique across the whole collection, not just within
        # this call. Positional IDs ("chunk_0", "chunk_1", ...) restart on
        # every call, so a second upload would reuse IDs that are already
        # stored and its chunks would not be added.
        ids = [f"chunk_{uuid.uuid4().hex}" for _ in chunks]
        collection.add(documents=chunks, embeddings=embeddings, ids=ids, metadatas=metadatas)
        print(f"[INFO] Added {len(chunks)} chunks to vector store")
    except Exception as e:
        print(f"[ERROR] Failed to add chunks to vector store: {e}")
        # Don't raise the exception to prevent the entire upload from failing

def similarity_search(query, top_k=5):
    """Return the stored chunks closest to ``query`` in embedding space.

    This is best-effort: any failure is printed and an empty result is
    returned instead of raising.

    Args:
        query: Search text. None, empty, or whitespace-only text yields an
            empty result.
        top_k: Maximum number of matches to return. Values below 1 yield an
            empty result; values larger than the number of stored items are
            reduced to that number.

    Returns:
        The value returned by ``collection.query``, or a dict with the keys
        "documents", "metadatas" and "distances", each holding a single
        empty list, when there is nothing to return.
    """
    # Built per call so callers can never mutate a shared default.
    empty_result = {"documents": [[]], "metadatas": [[]], "distances": [[]]}
    try:
        if not query or not query.strip():
            return empty_result
        if top_k < 1:
            # Asking for zero or a negative number of results is a no-op.
            return empty_result
        model_instance = get_model()
        if model_instance is None:
            print("[ERROR] Embedding model not available.")
            return empty_result
        stored = collection.count()
        if stored == 0:
            # Nothing indexed yet; skip the embedding work entirely.
            return empty_result
        embedding = model_instance.encode([query]).tolist()[0]
        # Never request more neighbours than the collection holds; an
        # oversized n_results can make the query fail, which would turn a
        # valid search into an empty result.
        return collection.query(
            query_embeddings=[embedding],
            n_results=min(top_k, stored),
        )
    except Exception as e:
        print(f"[ERROR] Similarity search failed: {e}")
        # Return empty results instead of failing
        return empty_result