quoc-khanh committed on
Commit
d644065
·
verified ·
1 Parent(s): 3d419e7

Update file_loader.py

Browse files
Files changed (1) hide show
  1. file_loader.py +10 -2
file_loader.py CHANGED
@@ -60,7 +60,7 @@ def get_vectorstore():
60
  # print('Vectorstore ready!')
61
  # return vectorstore
62
 
63
- folder_path = "syllabus_nct_word_format/" #'/content/chatbot4nct_test2/syllabus_nct_word_format'
64
  docx_files = list_docx_files(folder_path)
65
  all_splits = [] # Khởi tạo danh sách lưu kết quả
66
  for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
@@ -69,7 +69,15 @@ def get_vectorstore():
69
  splits = get_splits(file_path, output_json_path)
70
  splits_with_metadata = update_documents_metadata(splits, metadata)
71
  all_splits += splits_with_metadata
72
- if i == 1: break
 
 
 
 
 
 
 
 
73
 
74
  print('Get embedding model /paraphrase-multilingual-MiniLM-L12-v2')
75
  embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") #"VoVanPhuc/sup-SimCSE-VietNamese-phobert-base")
 
60
  # print('Vectorstore ready!')
61
  # return vectorstore
62
 
63
+ folder_path = "syllabus_nct_pdf_format/" #'/content/chatbot4nct_test2/syllabus_nct_word_format'
64
  docx_files = list_docx_files(folder_path)
65
  all_splits = [] # Khởi tạo danh sách lưu kết quả
66
  for i, file_path in enumerate(tqdm(docx_files, desc="Đang xử lý", unit="file")):
 
69
  splits = get_splits(file_path, output_json_path)
70
  splits_with_metadata = update_documents_metadata(splits, metadata)
71
  all_splits += splits_with_metadata
72
+ # if i == 1: break
73
+
74
+ FAQ_path = "syllabus_nct_word_format/FAQ.json"
75
+ FAQ_splits = get_json_splits_only(FAQ_path)
76
+ all_splits += FAQ_splits
77
+
78
+ FAQ_path = "syllabus_nct_word_format/FAQ2.json"
79
+ FAQ_splits = get_json_splits_only(FAQ_path)
80
+ all_splits += FAQ_splits
81
 
82
  print('Get embedding model /paraphrase-multilingual-MiniLM-L12-v2')
83
  embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") #"VoVanPhuc/sup-SimCSE-VietNamese-phobert-base")