Spaces:
Sleeping
Sleeping
Update src/rag.py
Browse files- src/rag.py +94 -95
src/rag.py
CHANGED
@@ -66,147 +66,146 @@ def get_embedding(text: str) -> list[float]:
|
|
66 |
embedding = embedding_model.encode(text).tolist()
|
67 |
return embedding
|
68 |
|
69 |
-
def find_similar_documents_hybrid_search(
|
70 |
-
query_vector:
|
71 |
search_query: str,
|
72 |
limit: int = 10,
|
73 |
candidates: int = 20,
|
74 |
-
vector_search_index: str = "embedding_search",
|
75 |
atlas_search_index: str = "header_text"
|
76 |
-
) ->
|
77 |
"""
|
78 |
-
|
|
|
79 |
"""
|
80 |
all_results = []
|
81 |
collection = load_mongo_collection()
|
82 |
-
|
83 |
-
|
84 |
try:
|
85 |
vector_pipeline = [
|
86 |
-
{
|
87 |
-
"
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
'_id': 1,
|
98 |
-
'header' : 1,
|
99 |
-
'content': 1,
|
100 |
-
"vector_score": {"$meta": "vectorSearchScore"}
|
101 |
-
}
|
102 |
-
}
|
103 |
]
|
104 |
-
|
105 |
vector_results = list(collection.aggregate(vector_pipeline))
|
106 |
-
safe_log_info(f"Vector search
|
107 |
for doc in vector_results:
|
108 |
-
|
109 |
-
doc['combined_score'] = doc.get('vector_score', 0) * 0.6
|
110 |
return vector_results
|
111 |
except Exception as e:
|
112 |
-
|
113 |
-
|
114 |
|
115 |
-
|
116 |
-
|
117 |
if not search_query or not search_query.strip():
|
118 |
return []
|
119 |
-
|
120 |
try:
|
121 |
text_pipeline = [
|
122 |
-
{
|
123 |
-
"
|
124 |
-
|
125 |
-
"
|
126 |
-
|
127 |
-
{
|
128 |
-
"text": {
|
129 |
-
"query": search_query,
|
130 |
-
"path": ["header", "content"]
|
131 |
-
}
|
132 |
-
}
|
133 |
-
]
|
134 |
-
}
|
135 |
-
}
|
136 |
-
},
|
137 |
-
{
|
138 |
-
"$project": {
|
139 |
-
'_id': 1,
|
140 |
-
'header': 1,
|
141 |
-
'content': 1,
|
142 |
-
"text_score": {"$meta": "searchScore"}
|
143 |
}
|
144 |
-
}
|
|
|
|
|
|
|
|
|
145 |
]
|
146 |
-
|
147 |
text_results = list(collection.aggregate(text_pipeline))
|
148 |
-
safe_log_info(f"Text search
|
149 |
for doc in text_results:
|
150 |
-
|
151 |
-
doc['combined_score'] = doc.get('text_score', 0) * 0.4
|
152 |
return text_results
|
153 |
except Exception as e:
|
154 |
-
safe_log_warning(f"Text search
|
155 |
return []
|
156 |
|
157 |
try:
|
158 |
-
#
|
159 |
start_time = time.time()
|
160 |
with ThreadPoolExecutor(max_workers=2) as executor:
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
for future in as_completed(
|
166 |
try:
|
167 |
results = future.result()
|
168 |
all_results.extend(results)
|
169 |
except Exception as e:
|
170 |
-
safe_log_error(f"
|
171 |
|
172 |
search_time = time.time() - start_time
|
173 |
-
safe_log_info(f"
|
174 |
-
|
175 |
-
# 3. Merge và deduplicate results
|
176 |
-
seen_ids = set()
|
177 |
-
merged_results = []
|
178 |
|
|
|
|
|
179 |
for doc in all_results:
|
180 |
-
doc_id =
|
181 |
-
if doc_id not in
|
182 |
-
|
183 |
-
# Clean up the document for final result
|
184 |
-
final_doc = {
|
185 |
-
'_id': doc['_id'],
|
186 |
-
'content': doc.get('content', ''),
|
187 |
-
# 'uploader_username': doc.get('uploader_username', ''), # Removed
|
188 |
-
'header': doc.get('header', ''),
|
189 |
-
'score': doc.get('combined_score', 0)
|
190 |
-
}
|
191 |
-
merged_results.append(final_doc)
|
192 |
else:
|
193 |
-
#
|
194 |
-
|
195 |
-
if str(existing_doc['_id']) == doc_id:
|
196 |
-
existing_doc['score'] += doc.get('combined_score', 0) * 0.5
|
197 |
-
break
|
198 |
|
199 |
-
#
|
200 |
-
merged_results
|
|
|
|
|
|
|
201 |
|
202 |
-
# Return top results
|
203 |
final_results = merged_results[:limit]
|
204 |
-
safe_log_info(f"
|
205 |
|
206 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
|
208 |
except Exception as e:
|
209 |
-
safe_log_error(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
210 |
|
211 |
def rerank_documents(query: str, documents: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
212 |
"""
|
|
|
66 |
embedding = embedding_model.encode(text).tolist()
|
67 |
return embedding
|
68 |
|
69 |
+
def find_similar_documents_hybrid_search(
    query_vector: List[float],
    search_query: str,
    limit: int = 10,
    candidates: int = 20,
    vector_search_index: str = "embedding_search",
    atlas_search_index: str = "header_text"
) -> List[Dict[str, Any]]:
    """Run a hybrid MongoDB Atlas search combining vector and text search in parallel.

    Vector results are weighted 0.6 and text results 0.4; documents found by
    both searches have their weighted scores summed. If the hybrid path raises,
    falls back to a plain Atlas text search.

    Args:
        query_vector: Embedding of the query (same dimensionality as the
            documents' ``embedding`` field).
        search_query: Raw query string for Atlas text search. If empty/blank,
            the text leg is skipped.
        limit: Maximum number of documents returned.
        candidates: ``numCandidates`` for the ANN vector search stage.
        vector_search_index: Name of the Atlas vector search index.
        atlas_search_index: Name of the Atlas text search index.

    Returns:
        Up to ``limit`` dicts with keys ``_id``, ``header``, ``content``,
        ``uuid`` and ``score`` (combined weighted score), sorted by score
        descending. On total failure, an empty list.
    """
    all_results = []
    collection = load_mongo_collection()

    def perform_vector_search() -> list:
        """Vector-search leg; returns [] on any error (best effort)."""
        try:
            vector_pipeline = [
                {"$vectorSearch": {
                    "index": vector_search_index,
                    "path": "embedding",
                    "queryVector": query_vector,
                    "limit": limit,
                    "numCandidates": candidates
                }},
                {"$project": {
                    '_id': 1, 'header': 1, 'content': 1, 'uuid': 1,
                    "vector_score": {"$meta": "vectorSearchScore"}
                }}
            ]
            vector_results = list(collection.aggregate(vector_pipeline))
            safe_log_info(f"Vector search trả về {len(vector_results)} kết quả")
            for doc in vector_results:
                # Weight vector score at 0.6 (text search carries the other 0.4).
                doc['combined_score'] = doc.get('vector_score', 0) * 0.6
            return vector_results
        except Exception as e:
            safe_log_warning(f"Vector search thất bại: {e}")
            return []

    def perform_text_search() -> list:
        """Text-search leg; returns [] for blank queries or on any error."""
        if not search_query or not search_query.strip():
            return []
        try:
            text_pipeline = [
                {"$search": {
                    "index": atlas_search_index,
                    # Single "text" operator (no compound needed for one clause).
                    "text": {
                        "query": search_query,
                        # NOTE(review): fallback path below also searches
                        # "keywords" — confirm whether it belongs here too.
                        "path": ["header", "content"]
                    }
                }},
                {"$project": {
                    '_id': 1, 'header': 1, 'content': 1, 'uuid': 1, 'keywords': 1,
                    "text_score": {"$meta": "searchScore"}
                }}
            ]
            text_results = list(collection.aggregate(text_pipeline))
            safe_log_info(f"Text search trả về {len(text_results)} kết quả")
            for doc in text_results:
                # Weight text score at 0.4 (vector search carries the other 0.6).
                doc['combined_score'] = doc.get('text_score', 0) * 0.4
            return text_results
        except Exception as e:
            safe_log_warning(f"Text search thất bại: {e}")
            return []

    try:
        # 1. Run both searches concurrently (each leg is I/O-bound).
        start_time = time.time()
        with ThreadPoolExecutor(max_workers=2) as executor:
            future_to_search = {
                executor.submit(perform_vector_search): "vector",
                executor.submit(perform_text_search): "text"
            }
            for future in as_completed(future_to_search):
                try:
                    results = future.result()
                    all_results.extend(results)
                except Exception as e:
                    safe_log_error(f"Lỗi trong quá trình tìm kiếm song song: {e}")

        search_time = time.time() - start_time
        safe_log_info(f"Tìm kiếm song song hoàn tất trong {search_time:.3f}s")

        # 2. Merge and deduplicate by _id; documents found by both legs
        #    accumulate both weighted scores.
        merged_map = {}
        for doc in all_results:
            doc_id = doc['_id']
            if doc_id not in merged_map:
                merged_map[doc_id] = doc
            else:
                merged_map[doc_id]['combined_score'] += doc['combined_score']

        merged_results = list(merged_map.values())

        # 3. Rank by combined score, best first.
        merged_results.sort(key=lambda x: x.get('combined_score', 0), reverse=True)

        final_results = merged_results[:limit]
        safe_log_info(f"Tìm kiếm hybrid trả về: {len(final_results)} tài liệu")

        # Project to the stable output shape callers expect.
        return [{
            '_id': r['_id'],
            'header': r.get('header', ''),
            'content': r.get('content', ''),
            'uuid': r.get('uuid', ''),
            'score': r.get('combined_score', 0)
        } for r in final_results]

    except Exception as e:
        safe_log_error(f"Lỗi nghiêm trọng trong hàm hybrid search: {e}", exc_info=True)

        # Fallback: text-only search when the hybrid path blew up.
        safe_log_warning("Thực hiện fallback: chỉ tìm kiếm bằng Text Search.")
        try:
            fallback_pipeline = [
                {"$search": {
                    "index": atlas_search_index,
                    "text": {
                        "query": search_query,
                        "path": ["header", "content", "keywords"]
                    }
                }},
                {"$project": {
                    '_id': 1, 'header': 1, 'content': 1, 'uuid': 1,
                    'score': {"$meta": "searchScore"}
                }},
                {"$limit": limit}
            ]
            fallback_results = list(collection.aggregate(fallback_pipeline))
            safe_log_info(f"Fallback search trả về {len(fallback_results)} kết quả.")
            return fallback_results
        except Exception as fallback_e:
            safe_log_error(f"Fallback search cũng thất bại: {fallback_e}", exc_info=True)
            return []  # Empty list if even the fallback fails.
209 |
|
210 |
def rerank_documents(query: str, documents: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
211 |
"""
|