baderanas committed on
Commit
deca715
·
verified ·
1 Parent(s): ffc324a

Update chroma_operations/retrieve.py

Browse files
Files changed (1) hide show
  1. chroma_operations/retrieve.py +49 -49
chroma_operations/retrieve.py CHANGED
@@ -1,49 +1,49 @@
1
- import os
2
- import logging
3
- from typing import List, Optional
4
- import chromadb
5
- from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
6
- from dotenv import load_dotenv
7
-
8
- # Load environment variables
9
- load_dotenv()
10
-
11
- # Setup logging
12
- logging.basicConfig(level=logging.INFO)
13
- logger = logging.getLogger(__name__)
14
-
15
-
16
- def search_similar_chunks(
17
- query_text: str,
18
- document_name: str,
19
- collection_name: str = "rag_collection",
20
- top_k: int = 5,
21
- ):
22
- """Search for top-k chunks similar to query_text within a specific document (source_file)."""
23
- try:
24
- # Initialize embedding function and Chroma client
25
- embedding_function = OpenAIEmbeddingFunction(
26
- api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
27
- )
28
- client = chromadb.PersistentClient(path="./chroma_db")
29
-
30
- # Load the collection
31
- collection = client.get_collection(
32
- name=collection_name, embedding_function=embedding_function
33
- )
34
-
35
- # Query similar documents filtered by document_name
36
- results = collection.query(
37
- query_texts=[query_text],
38
- n_results=top_k,
39
- where={"source_file": document_name},
40
- )
41
-
42
- documents = results.get("documents", [[]])[0]
43
- metadatas = results.get("metadatas", [[]])[0]
44
-
45
- return documents
46
-
47
- except Exception as e:
48
- logger.error(f"Similarity search failed: {str(e)}")
49
- return []
 
1
+ import os
2
+ import logging
3
+ from typing import List, Optional
4
+ import chromadb
5
+ from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
6
+ from dotenv import load_dotenv
7
+
8
+ # Load environment variables
9
+ load_dotenv()
10
+
11
+ # Setup logging
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def search_similar_chunks(
    query_text: str,
    document_names: List[str],
    collection_name: str = "rag_collection",
    top_k: int = 5,
) -> List[str]:
    """Search for the top-k chunks similar to ``query_text`` within the given documents.

    Only chunks whose ``source_file`` metadata matches one of
    ``document_names`` are considered.

    Args:
        query_text: Text passed to the collection query as the search text.
        document_names: Source-file names to restrict the search to. A single
            name passed as a plain string is treated as a one-element list.
        collection_name: Name of the Chroma collection to query.
        top_k: Number of results requested from the collection.

    Returns:
        The chunk texts returned by the collection query for ``query_text``.
        An empty list is returned when no document names are given or when
        the search fails for any reason (the failure is logged).
    """
    # Accept a bare string so callers written against the earlier
    # single-document signature keep working.
    if isinstance(document_names, str):
        names = [document_names]
    else:
        names = list(document_names or [])

    # An empty "$in" list cannot match any chunk, so skip the embedding call
    # and the query entirely.
    if not names:
        return []

    try:
        # Initialize embedding function and Chroma client
        embedding_function = OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"),
            model_name="text-embedding-3-small",
        )
        client = chromadb.PersistentClient(path="./chroma_db")

        # Load the collection
        collection = client.get_collection(
            name=collection_name, embedding_function=embedding_function
        )

        # Query similar chunks, filtered to the requested source files
        results = collection.query(
            query_texts=[query_text],
            n_results=top_k,
            where={"source_file": {"$in": names}},
        )

        # One result list per query text; only one query text was sent.
        # Guard against a missing, None or empty "documents" entry.
        documents = results.get("documents") or [[]]
        return documents[0]

    except Exception as e:
        # Best-effort boundary: callers expect an empty list on failure.
        logger.exception("Similarity search failed: %s", e)
        return []