import os
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Imports from helpers
from helpers import (
    list_docx_files,  # List the .docx files in a folder
    get_splits,  # Split a .docx file into document chunks
    get_json_splits_only,  # Split a JSON (FAQ) file into document chunks
    get_web_documents,  # Load documents from the web
)


def get_vectorstore():
    ### Process all documents and load them into the vector database
    folder_path = "syllabus_nct_word_format/"
    docx_files = list_docx_files(folder_path)
    
    all_splits = []  # Accumulates the splits from every source
    print("Feeding relevant websites' contents")
    with open('syllabus_nct_word_format/urls.txt', 'r') as f:
        base_urls = [line.strip() for line in f if line.strip()]
    # Example URL lists kept for reference:
    # base_urls = ['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
    # base_urls = ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']

    website_contents = get_web_documents(base_urls=base_urls)
    all_splits += website_contents
    
    print('Feeding .docx files')
    for i, file_path in enumerate(tqdm(docx_files, desc="Processing", unit="file")):
        output_json_path = f"output_{i}.json"
        splits = get_splits(file_path, output_json_path)
        all_splits += splits
        
    print('Feeding .json files')
    # Process the FAQ files
    for FAQ_path in ("syllabus_nct_word_format/FAQ.json",
                     "syllabus_nct_word_format/FAQ2.json"):
        all_splits += get_json_splits_only(FAQ_path)
    
    # Build the vectorstore; Google GenAI embeddings are kept below as a commented alternative
    # embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    print('Get embedding model sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    print('Set vectorstore FAISS')
    vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
    return vectorstore
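

# Minimal usage sketch: build the index once, persist it with FAISS's
# save_local, and run a sample similarity search. The "faiss_index" path
# and the query string are illustrative choices, not part of the pipeline.
if __name__ == "__main__":
    vectorstore = get_vectorstore()
    # Persist so later runs can call FAISS.load_local instead of re-embedding everything
    vectorstore.save_local("faiss_index")
    # Sanity check: print the 3 chunks most similar to a sample query
    for doc in vectorstore.similarity_search("chương trình đào tạo", k=3):
        print(doc.metadata, doc.page_content[:100])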