import datetime
import json
import os
import shutil
import tempfile
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from docx import Document
from tqdm import tqdm
from unidecode import unidecode

import google.generativeai as genai
from langchain_community.document_loaders import JSONLoader, UnstructuredWordDocumentLoader, WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_text_splitters import RecursiveCharacterTextSplitter, RecursiveJsonSplitter
# from file_loader import get_vectorstore
if "GOOGLE_API_KEY" not in os.environ:
os.environ["GOOGLE_API_KEY"] = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
key = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
async def get_urls_splits(url='https://nct.neu.edu.vn/', char='https://nct.neu.edu.vn/'):
    """Collect all links on the page whose href starts with the given prefix."""
    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')
    urls = []
    for link in soup.find_all('a', href=True):  # only keep anchor tags that carry an 'href'
        href = link.get('href')
        if href.startswith(char):
            urls.append(href)
    return urls
# Draft kept for reference: load each page with WebBaseLoader
# docs = []
# for page_url in url:
#     loader = WebBaseLoader(web_paths=[page_url])
#     async for doc in loader.alazy_load():
#         docs.append(doc)
#     assert len(docs) == 1
#     # doc = docs[0]
# return docs
# Example usage (the function is async, so await it inside an async context):
# nct_urls = await get_urls_splits('https://nct.neu.edu.vn/')
# print(nct_urls)
def log_message(messages, filename="chat_log.txt"):
    """Append the message history to a log file, one JSON object per line."""
    with open(filename, "a", encoding="utf-8") as f:
        log_entry = {
            "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "conversation": messages
        }
        f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
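
# Example usage (hypothetical messages; appends one JSON line to chat_log.txt):
# log_message([{"role": "user", "content": "Xin chao"},
#              {"role": "assistant", "content": "Hello!"}])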
def remove_tables_from_docx(file_path):
    """Create a copy of the DOCX file with all tables stripped out."""
    doc = Document(file_path)
    new_doc = Document()
    for para in doc.paragraphs:
        new_doc.add_paragraph(para.text)
    # Reserve a temporary path; close the handle before saving so the write is safe on all platforms
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as temp_file:
        temp_path = temp_file.name
    new_doc.save(temp_path)
    return temp_path  # path to the new copy; the original file is untouched
def load_text_data(file_path):
    """Load the text content from a DOCX file (with tables removed)."""
    cleaned_file = remove_tables_from_docx(file_path)
    return UnstructuredWordDocumentLoader(cleaned_file).load()
def extract_tables_from_docx(file_path):
    """Extract every table from a DOCX file along with the paragraphs around it."""
    doc = Document(file_path)
    tables = []
    all_paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]  # all non-empty paragraphs
    table_index = 0
    para_index = 0
    table_positions = []
    # Locate each table's position among the document's paragraphs
    for element in doc.element.body:
        if element.tag.endswith("tbl"):
            table_positions.append((table_index, para_index))
            table_index += 1
        elif element.tag.endswith("p"):
            para_index += 1
    for idx, (table_idx, para_idx) in enumerate(table_positions):
        data = []
        for row in doc.tables[table_idx].rows:
            data.append([cell.text.strip() for cell in row.cells])
        if len(data) > 1:  # only keep tables that have data rows beyond the header
            # Grab up to 5 paragraphs before and after the table as related context
            related_start = max(0, para_idx - 5)
            related_end = min(len(all_paragraphs), para_idx + 5)
            related_text = all_paragraphs[related_start:related_end]
            tables.append({"table": idx + 1, "content": data, "related": related_text})
    return tables
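
# Example usage (hypothetical path; each entry holds the table grid plus nearby paragraphs):
# tables = extract_tables_from_docx("data/quy_che_dao_tao.docx")
# print(tables[0]["content"][0])  # header row of the first extracted table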
def convert_to_json(tables):
    structured_data = {}
    for table in tables:
        headers = [unidecode(h) for h in table["content"][0]]  # strip Vietnamese diacritics from headers
        rows = [[unidecode(cell) for cell in row] for row in table["content"][1:]]  # strip diacritics from data cells
        json_table = [dict(zip(headers, row)) for row in rows if len(row) == len(headers)]
        related_text = [unidecode(text) for text in table["related"]]  # strip diacritics from related paragraphs
        structured_data[table["table"]] = {
            "content": json_table,
            "related": related_text
        }
    return json.dumps(structured_data, indent=4, ensure_ascii=False)
def save_json_to_file(json_data, output_path):
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(json.loads(json_data), f, ensure_ascii=False, indent=4)
# def load_json_with_langchain(json_path):
#     loader = JSONLoader(file_path=json_path, jq_schema='.. | .content?', text_content=False)
#     data = loader.load()
#     # # Sanity-check the loaded data
#     # # print("Sample Data:", data[:2])  # print the first 2 entries
#     return data
def load_json_manually(json_path):
with open(json_path, 'r', encoding='utf-8') as f:
data = json.load(f)
return data
def load_table_data(file_path, output_json_path):
tables = extract_tables_from_docx(file_path)
json_output = convert_to_json(tables)
save_json_to_file(json_output, output_json_path)
table_data = load_json_manually(output_json_path)
return table_data
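
# Example usage (hypothetical paths): extract the tables, persist them as JSON, and reload them
# table_data = load_table_data("data/quy_che_dao_tao.docx", "data/tables.json")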
def get_splits(file_path, output_json_path):
    table_data = load_table_data(file_path, output_json_path)
    text_data = load_text_data(file_path)
    # Split the content into chunks
    json_splitter = RecursiveJsonSplitter(max_chunk_size=1000)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
    table_splits = json_splitter.create_documents(texts=[table_data])
    text_splits = text_splitter.split_documents(text_data)
    all_splits = table_splits + text_splits
    return all_splits
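
# A minimal sketch of feeding the splits into a FAISS index (the embedding model name
# "models/embedding-001" and the paths are assumptions; adjust to the app's actual setup):
# embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# vectorstore = FAISS.from_documents(get_splits("data/file.docx", "data/tables.json"), embeddings)
# retriever = vectorstore.as_retriever(search_kwargs={"k": 4})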
def get_json_splits_only(file_path):
    table_data = load_json_manually(file_path)
    def remove_accents(obj):  # strip Vietnamese diacritics recursively
        if isinstance(obj, str):
            return unidecode(obj)
        elif isinstance(obj, list):
            return [remove_accents(item) for item in obj]
        elif isinstance(obj, dict):
            return {remove_accents(k): remove_accents(v) for k, v in obj.items()}
        return obj
    cleaned_data = remove_accents(table_data)
    # RecursiveJsonSplitter expects a dict, so wrap top-level lists
    wrapped_data = {"data": cleaned_data} if isinstance(cleaned_data, list) else cleaned_data
    json_splitter = RecursiveJsonSplitter(max_chunk_size=512)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
    table_splits = json_splitter.create_documents(texts=[wrapped_data])
    table_splits = text_splitter.split_documents(table_splits)
    return table_splits
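
# Example usage (hypothetical path to a JSON file produced by save_json_to_file):
# json_splits = get_json_splits_only("data/tables.json")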
def list_docx_files(folder_path):
return [str(file) for file in Path(folder_path).rglob("*.docx")]
def prompt_order(queries):
    text = 'IMPORTANT: Here are the user\'s questions in order; use them and the context above to determine the best answer:\n'
    for i, q in enumerate(queries, start=1):
        text += f'Question {i}: {str(q)}\n'
    return text
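
# Example usage (hypothetical queries):
# print(prompt_order(["Hoc phi bao nhieu?", "Khi nao mo dang ky?"]))
# -> "IMPORTANT: ...\nQuestion 1: Hoc phi bao nhieu?\nQuestion 2: Khi nao mo dang ky?\n"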