Spaces:

deddoggo
/

chatbot_demo

Paused

App Files Files Community

deddoggo committed on Jun 29

Commit

a53f1d8

1 Parent(s): 0c16fc9

update

Browse files

Files changed (2) hide show

rag_pipeline.py +131 -41
retriever.py +111 -76

rag_pipeline.py CHANGED Viewed

@@ -1,58 +1,148 @@
-# llm_handler.py
-# Chịu trách nhiệm cho mọi logic liên quan đến mô hình ngôn ngữ lớn (LLM).
 import torch
 from unsloth import FastLanguageModel
-# --- HÀM TẠO CÂU TRẢ LỜI ---
-def generate_llm_response(
-    query: str,
-    context: str,
-    llm_model,
-    tokenizer,
-    max_new_tokens: int = 512,
-    temperature: float = 0.3,
-    top_p: float = 0.9,
-) -> str:
     """
-    Sinh câu trả lời từ LLM dựa trên câu hỏi và ngữ cảnh đã được truy xuất.
     """
-    print("🧠 Bắt đầu sinh câu trả lời từ LLM...")
-    # Xây dựng prompt
-    prompt = f"""Bạn là một trợ lý AI chuyên tư vấn về luật giao thông đường bộ Việt Nam.
-Dựa vào các thông tin luật được cung cấp dưới đây để trả lời câu hỏi của người dùng một cách chính xác và chi tiết.
-Nếu thông tin không đủ, hãy trả lời rằng bạn không tìm thấy thông tin cụ thể trong tài liệu.
-### Thông tin luật được trích dẫn:
 {context}
-### Câu hỏi của người dùng:
 {query}
-### Trả lời của bạn:"""
-    # Tạo input cho model
-    device = llm_model.device
-    inputs = tokenizer(prompt, return_tensors="pt").to(device)
-    # Cấu hình cho việc sinh văn bản
     generation_config = dict(
-        max_new_tokens=max_new_tokens,
-        temperature=temperature,
-        top_p=top_p,
         do_sample=True,
-        pad_token_id=tokenizer.eos_token_id
     )
-    try:
-        output_ids = llm_model.generate(**inputs, **generation_config)
-        input_length = inputs.input_ids.shape[1]
-        generated_ids = output_ids[0][input_length:]
-        response_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
-        print("✅ Sinh câu trả lời hoàn tất.")
-        return response_text
-    except Exception as e:
-        print(f"❌ Lỗi khi sinh câu trả lời từ LLM: {e}")
-        return "Xin lỗi, đã có lỗi xảy ra trong quá trình tạo câu trả lời."

+# file: rag_pipeline.py
 import torch
+import json
+import faiss
+import numpy as np
+import re
 from unsloth import FastLanguageModel
+from sentence_transformers import SentenceTransformer
+from rank_bm25 import BM25Okapi
+from transformers import TextStreamer
+# Import các hàm từ file khác
+from data_processor import process_law_data_to_chunks
+from retriever import search_relevant_laws, tokenize_vi_for_bm25_setup
def initialize_components(
    data_path,
    *,
    llm_model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    embedding_model_name="bkai-foundation-models/vietnamese-bi-encoder",
    max_seq_length=2048,
):
    """Load and build every component the RAG pipeline needs.

    This is expensive (model downloads, embedding of the whole corpus) and is
    meant to be called once at application start-up.

    Args:
        data_path: Path to the UTF-8 JSON file holding the structured law data.
        llm_model_name: Model identifier passed to
            ``FastLanguageModel.from_pretrained``.
        embedding_model_name: Model identifier passed to
            ``SentenceTransformer``.
        max_seq_length: Maximum sequence length the LLM is loaded with.

    Returns:
        A dict with the keys ``llm_model``, ``tokenizer``, ``embedding_model``,
        ``chunks_data``, ``faiss_index`` and ``bm25_model``.

    Raises:
        ValueError: If processing the JSON data yields no chunks, because
            neither the FAISS index nor the BM25 model can be built from an
            empty corpus.
    """
    print("--- Bắt đầu khởi tạo các thành phần ---")

    # 1. Load the LLM and its tokenizer through Unsloth (4-bit quantised).
    print("1. Tải mô hình LLM (Unsloth)...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=llm_model_name,
        max_seq_length=max_seq_length,
        dtype=None,
        load_in_4bit=True,
    )
    # The pipeline only ever generates with this model, so switch Unsloth to
    # its inference mode once here instead of on every request.
    FastLanguageModel.for_inference(model)
    print("✅ Tải LLM và Tokenizer thành công.")

    # 2. Load the sentence-embedding model, on GPU when one is available.
    print("2. Tải mô hình Embedding...")
    embedding_model = SentenceTransformer(
        embedding_model_name,
        device="cuda" if torch.cuda.is_available() else "cpu"
    )
    print("✅ Tải mô hình Embedding thành công.")

    # 3. Read the raw JSON and flatten it into retrievable chunks.
    print(f"3. Tải và xử lý dữ liệu từ {data_path}...")
    with open(data_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)
    chunks_data = process_law_data_to_chunks(raw_data)
    if not chunks_data:
        # Fail early with a clear message: an empty corpus would otherwise
        # crash later with an opaque error while building the indexes.
        raise ValueError(
            f"No chunks were produced from {data_path!r}; "
            "cannot build the FAISS index or the BM25 model."
        )
    print(f"✅ Xử lý dữ liệu thành công, có {len(chunks_data)} chunks.")

    # Both retrievers index the same text, so extract it only once.
    # A missing or None 'text' becomes an empty string.
    chunk_texts = [chunk.get('text') or '' for chunk in chunks_data]

    # 4. Embed all chunks and index them. The vectors are L2-normalised, so
    #    the inner-product index ranks by cosine similarity.
    print("4. Tạo embeddings và FAISS index...")
    chunk_embeddings_tensor = embedding_model.encode(
        chunk_texts,
        convert_to_tensor=True,
        device=embedding_model.device
    )
    chunk_embeddings_np = chunk_embeddings_tensor.cpu().numpy().astype('float32')
    faiss.normalize_L2(chunk_embeddings_np)
    dimension = chunk_embeddings_np.shape[1]
    faiss_index = faiss.IndexFlatIP(dimension)
    faiss_index.add(chunk_embeddings_np)
    print(f"✅ Tạo FAISS index thành công với {faiss_index.ntotal} vector.")

    # 5. Build the BM25 keyword model over the same chunks, in the same order,
    #    so that BM25 positions and FAISS ids refer to the same chunk.
    print("5. Tạo mô hình BM25...")
    tokenized_corpus_bm25 = [tokenize_vi_for_bm25_setup(text) for text in chunk_texts]
    bm25_model = BM25Okapi(tokenized_corpus_bm25)
    print("✅ Tạo mô hình BM25 thành công.")

    print("--- ✅ Khởi tạo tất cả thành phần hoàn tất ---")
    return {
        "llm_model": model,
        "tokenizer": tokenizer,
        "embedding_model": embedding_model,
        "chunks_data": chunks_data,
        "faiss_index": faiss_index,
        "bm25_model": bm25_model
    }
def _format_law_context(retrieved_results):
    """Render retrieved chunks as the citation block inserted into the prompt."""
    if not retrieved_results:
        return "Không tìm thấy thông tin luật liên quan trong cơ sở dữ liệu."
    context_parts = []
    for i, res in enumerate(retrieved_results):
        # Tolerate a chunk whose 'metadata' is missing or explicitly None.
        metadata = res.get('metadata') or {}
        header = f"Trích dẫn {i+1}: Điều {metadata.get('article', 'N/A')}, Khoản {metadata.get('clause_number', 'N/A')} (Nguồn: {metadata.get('source_document', 'N/A')})"
        text = res.get('text', '*Nội dung không có*')
        context_parts.append(f"{header}\n{text}")
    return "\n\n---\n\n".join(context_parts)


def generate_response(query, components, *, k=5, initial_k_multiplier=18, max_new_tokens=256):
    """Answer a query with retrieval-augmented generation.

    Args:
        query: The user's question.
        components: The dict returned by ``initialize_components``; the keys
            ``llm_model``, ``tokenizer``, ``embedding_model``, ``faiss_index``,
            ``chunks_data`` and ``bm25_model`` are read.
        k: Number of retrieved chunks placed into the prompt.
        initial_k_multiplier: Forwarded to ``search_relevant_laws``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated answer with the prompt removed and surrounding
        whitespace stripped.
    """
    print("--- Bắt đầu quy trình RAG cho query mới ---")
    llm_model = components["llm_model"]
    tokenizer = components["tokenizer"]

    # 1. Retrieve supporting context.
    retrieved_results = search_relevant_laws(
        query_text=query,
        embedding_model=components["embedding_model"],
        faiss_index=components["faiss_index"],
        chunks_data=components["chunks_data"],
        bm25_model=components["bm25_model"],
        k=k,
        initial_k_multiplier=initial_k_multiplier
    )

    # 2. Format the context.
    context = _format_law_context(retrieved_results)

    # 3. Build the prompt and generate the answer.
    prompt = f"""Dưới đây là một số thông tin trích dẫn từ văn bản luật giao thông đường bộ Việt Nam.
Hãy SỬ DỤNG CÁC THÔNG TIN NÀY để trả lời câu hỏi một cách chính xác và đầy đủ.
Nếu câu hỏi đưa ra nhiều đáp án thì chọn 1 đáp án đúng nhất.
### Thông tin luật:
{context}
### Câu hỏi:
{query}
### Trả lời:"""
    print("--- Bắt đầu tạo câu trả lời từ LLM ---")

    # Put the inputs on the device the model actually lives on. Guessing
    # "cuda if available" breaks when the model was loaded elsewhere.
    device = getattr(llm_model, "device", None)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    generation_config = dict(
        max_new_tokens=max_new_tokens,
        temperature=0.5,
        top_p=0.7,
        top_k=50,
        repetition_penalty=1.1,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    output_ids = llm_model.generate(**inputs, **generation_config)

    # generate() returns prompt + continuation; keep only the new tokens.
    input_length = inputs.input_ids.shape[1]
    generated_ids = output_ids[0][input_length:]
    response_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    print("--- Tạo câu trả lời hoàn tất ---")
    return response_text

retriever.py CHANGED Viewed

@@ -1,82 +1,117 @@
-# retrieval_handler.py
-# Chịu trách nhiệm cho mọi logic liên quan đến việc truy xuất thông tin (Retrieval).
-import json
-import re
-import numpy as np
 import faiss
 from collections import defaultdict
-from typing import List, Dict, Any, Optional
-from utils import tokenize_vi_simple # Import từ file utils.py
-# --- HÀM XỬ LÝ DỮ LIỆU ---
-def process_law_data_to_chunks(structured_data: Any) -> List[Dict]:
-    """Làm phẳng dữ liệu luật có cấu trúc thành danh sách các chunks."""
-    flat_list = []
-    articles = [structured_data] if isinstance(structured_data, dict) else structured_data
-    for article_data in articles:
-        if not isinstance(article_data, dict): continue
-        # (Logic xử lý chi tiết của bạn ở đây... đã được rút gọn để dễ đọc)
-        # Giả sử logic này hoạt động đúng như bạn đã thiết kế
-        # và trả về một danh sách các chunk, mỗi chunk là một dict có "text" và "metadata".
-        # Để đảm bảo, tôi sẽ thêm một phiên bản đơn giản hóa ở đây.
-        clauses = article_data.get("clauses", [])
-        for clause in clauses:
-            points = clause.get("points_in_clause", [])
-            if points:
-                for point in points:
-                    text = point.get("point_text_original")
-                    if text:
-                        flat_list.append({"text": text, "metadata": {"article": article_data.get("article"), "clause": clause.get("clause_number"), "point": point.get("point_id")}})
-            else:
-                text = clause.get("clause_text_original")
-                if text:
-                    flat_list.append({"text": text, "metadata": {"article": article_data.get("article"), "clause": clause.get("clause_number")}})
-    return flat_list
-# --- HÀM TÌM KIẾM ---
 def search_relevant_laws(
-    query_text: str,
-    embedding_model,
-    faiss_index,
-    chunks_data: List[Dict],
-    bm25_model,
-    k: int = 5,
-    rrf_k_constant: int = 60
-) -> List[Dict]:
     """
-    Thực hiện Hybrid Search (Semantic + Keyword) với RRF để tìm các chunk liên quan.
     """
-    print(f"🔎 Bắt đầu tìm kiếm cho: '{query_text}'")
-    # 1. Semantic Search (FAISS)
-    query_embedding = embedding_model.encode([query_text], convert_to_tensor=True)
-    query_embedding_np = query_embedding.cpu().numpy().astype('float32')
-    faiss.normalize_L2(query_embedding_np)
-    num_candidates = min(k * 10, faiss_index.ntotal)
-    _, semantic_indices = faiss_index.search(query_embedding_np, num_candidates)
-    # 2. Keyword Search (BM25)
-    tokenized_query = tokenize_vi_simple(query_text)
-    bm25_scores = bm25_model.get_scores(tokenized_query)
-    bm25_results = sorted(enumerate(bm25_scores), key=lambda x: x[1], reverse=True)[:num_candidates]
-    # 3. Reciprocal Rank Fusion (RRF)
     rrf_scores = defaultdict(float)
-    if semantic_indices.size > 0:
-        for rank, doc_idx in enumerate(semantic_indices[0]):
-            if doc_idx != -1: rrf_scores[doc_idx] += 1.0 / (rrf_k_constant + rank)
-    for rank, (doc_idx, score) in enumerate(bm25_results):
-        if score > 0: rrf_scores[doc_idx] += 1.0 / (rrf_k_constant + rank)
-    fused_results = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
-    # 4. Trả về top K kết quả cuối cùng
-    final_results = []
-    for doc_idx, score in fused_results[:k]:
-        result = chunks_data[doc_idx].copy()
-        result['retrieval_score'] = score
-        final_results.append(result)
-    print(f"✅ Tìm kiếm hoàn tất, trả về {len(final_results)} kết quả.")
-    return final_results

+# file: retriever.py
 import faiss
+import numpy as np
+import torch
+import re
 from collections import defaultdict
+from rank_bm25 import BM25Okapi
def tokenize_vi_for_bm25_setup(text):
    """Tokenize Vietnamese text into lowercase word tokens for BM25.

    The text is normalised to Unicode NFC, lowercased, stripped of every
    character that is neither a word character nor whitespace, and split on
    whitespace.

    Args:
        text: The string to tokenize. ``None`` or an empty string is accepted.

    Returns:
        A list of tokens; empty when ``text`` is ``None`` or empty.
    """
    # Local import keeps this function self-contained.
    import unicodedata

    if not text:
        return []
    # Compose first: in decomposed (NFD) text the tone marks are separate
    # combining characters, which the punctuation filter below would delete,
    # so the same word would tokenize differently depending on its encoding.
    text = unicodedata.normalize("NFC", text).lower()
    # Punctuation is deleted rather than replaced by a space, so a number
    # such as "1.000.000" stays a single token.
    text = re.sub(r'[^\w\s]', '', text)
    return text.split()
 def search_relevant_laws(
+        query_text,
+        embedding_model,
+        faiss_index,
+        chunks_data,
+        bm25_model,
+        k=5,
+        initial_k_multiplier=10,
+        rrf_k_constant=60
+    ):
     """
+    Thực hiện tìm kiếm lai (Hybrid Search) kết hợp Semantic Search (FAISS) và Keyword Search (BM25),
+    sau đó kết hợp kết quả bằng Reciprocal Rank Fusion (RRF) và tăng cường bằng metadata.
     """
+    if k <= 0:
+        print("Lỗi: k (số lượng kết quả) phải là số dương.")
+        return []
+    print(f"\n🔎 Đang tìm kiếm (Hybrid) cho truy vấn: '{query_text}'")
+    query_lower = query_text.lower()
+    # Phân tích query
+    fine_keywords = r'tiền|phạt|bao nhiêu đồng|bao nhiêu tiền|mức phạt|xử phạt hành chính'
+    points_keywords = r'điểm|trừ điểm|mấy điểm|trừ bao nhiêu điểm|bằng lái|gplx'
+    query_mentions_fine = bool(re.search(fine_keywords, query_lower))
+    query_mentions_points = bool(re.search(points_keywords, query_lower))
+    needs_specific_metadata_filter = query_mentions_fine or query_mentions_points
+    print(f"   Phân tích query: Đề cập tiền phạt? {query_mentions_fine}, Đề cập điểm trừ? {query_mentions_points}")
+    num_vectors_in_index = faiss_index.ntotal
+    if num_vectors_in_index == 0:
+        print("Lỗi: FAISS index rỗng.")
+        return []
+    num_candidates_each_retriever = min(k * initial_k_multiplier, num_vectors_in_index)
+    # === 1. Semantic Search (FAISS) ===
+    try:
+        query_embedding_tensor = embedding_model.encode([query_text], convert_to_tensor=True, device=embedding_model.device)
+        query_embedding_np = query_embedding_tensor.cpu().numpy().astype('float32')
+        faiss.normalize_L2(query_embedding_np)
+        semantic_scores_raw, semantic_indices_raw = faiss_index.search(query_embedding_np, num_candidates_each_retriever)
+    except Exception as e:
+        print(f"Lỗi khi tìm kiếm ngữ nghĩa (FAISS): {e}")
+        semantic_indices_raw = np.array([[]], dtype=int)
+    # === 2. Keyword Search (BM25) ===
+    try:
+        tokenized_query_bm25 = tokenize_vi_for_bm25_setup(query_text)
+        all_bm25_scores = bm25_model.get_scores(tokenized_query_bm25)
+        bm25_results_with_indices = [{'index': i, 'score': score} for i, score in enumerate(all_bm25_scores) if score > 0]
+        bm25_results_with_indices.sort(key=lambda x: x['score'], reverse=True)
+        top_bm25_results = bm25_results_with_indices[:num_candidates_each_retriever]
+    except Exception as e:
+        print(f"Lỗi khi tìm kiếm từ khóa (BM25): {e}")
+        top_bm25_results = []
+    # === 3. Result Fusion (RRF) ===
     rrf_scores = defaultdict(float)
+    all_retrieved_indices_set = set()
+    if semantic_indices_raw.size > 0:
+        for rank, doc_idx in enumerate(semantic_indices_raw[0]):
+            if 0 <= doc_idx < num_vectors_in_index:
+                rrf_scores[doc_idx] += 1.0 / (rrf_k_constant + rank)
+                all_retrieved_indices_set.add(doc_idx)
+    for rank, item in enumerate(top_bm25_results):
+        doc_idx = item['index']
+        rrf_scores[doc_idx] += 1.0 / (rrf_k_constant + rank)
+        all_retrieved_indices_set.add(doc_idx)
+    fused_initial_results = [{'index': doc_idx, 'fused_score': rrf_scores[doc_idx]} for doc_idx in all_retrieved_indices_set]
+    fused_initial_results.sort(key=lambda x: x['fused_score'], reverse=True)
+    # === 4. Lọc và Tái xếp hạng cuối cùng ===
+    final_processed_results = []
+    num_to_process_metadata = min(len(fused_initial_results), num_candidates_each_retriever * 2)
+    for rank_idx, res_item in enumerate(fused_initial_results[:num_to_process_metadata]):
+        try:
+            result_index = res_item['index']
+            base_score_from_fusion = res_item['fused_score']
+            original_chunk = chunks_data[result_index]
+            original_metadata = original_chunk.get('metadata', {})
+            # Thêm logic xử lý metadata boosting ở đây nếu cần...
+            # Hiện tại, chỉ trả về kết quả đã fusion.
+            # Bạn có thể copy lại toàn bộ logic boosting từ script gốc vào đây.
+            final_score_calculated = base_score_from_fusion # (Thêm boosting vào đây)
+            final_processed_results.append({
+                "rank_after_fusion": rank_idx + 1,
+                "index": int(result_index),
+                "final_score": final_score_calculated,
+                "text": original_chunk.get('text', '*Không có text*'),
+                "metadata": original_metadata
+            })
+        except Exception as e:
+            print(f"Lỗi khi xử lý ứng viên tại chỉ số {res_item.get('index')}: {e}")
+    final_processed_results.sort(key=lambda x: x["final_score"], reverse=True)
+    return final_processed_results[:k]