Commit 7f9bd61 · Parent: e6cba72 · fix

This commit replaces the file-based processed-hash log with a MongoDB "processed_documents" collection and makes the deduplication helpers and the ingestion pipeline async.

Files changed:
- config.py +1 -1
- db/mongoDB.py +3 -0
- services/document_service.py +2 -2
- utils/utils.py +28 -15
config.py CHANGED

@@ -17,7 +17,7 @@ CORE_DATA_FOLDER = os.path.join(BASE_DIR, "data", "core")
 # PENDING_UPLOADS_FOLDER = os.path.join(BASE_DIR, "data", "pending_uploads")
 # PROCESSED_FILES_FOLDER = os.path.join(BASE_DIR, "data", "processed_files")
 # FAILED_FILES_FOLDER = os.path.join(BASE_DIR, "data", "failed_files")
-PROCESSED_HASH_LOG = os.path.join(BASE_DIR, "data", "processed_hashes.log")
+# PROCESSED_HASH_LOG = os.path.join(BASE_DIR, "data", "processed_hashes.log")
 PENDING_UPLOADS_FOLDER = '/tmp/pending_uploads'
 LEGAL_DIC_FOLDER = os.path.join(BASE_DIR, "data", "dictionary")

db/mongoDB.py CHANGED

@@ -20,6 +20,7 @@ class MongoDatabase:
     users: Optional[AsyncIOMotorCollection] = None
     token_blacklist: Optional[AsyncIOMotorCollection] = None
     conversations: Optional[AsyncIOMotorCollection] = None
+    processed_documents: Optional[AsyncIOMotorCollection] = None
 
 # Create a single instance to import and use across the whole application
 mongo_db = MongoDatabase()
@@ -54,6 +55,8 @@ async def connect_to_mongo():
     mongo_db.users = mongo_db.db["users"]
     mongo_db.token_blacklist = mongo_db.db["token_blacklist"]
     mongo_db.conversations = mongo_db.db["conversations"]
+    mongo_db.processed_documents = mongo_db.db["processed_documents"]
+
 
     # 4. Create the TTL index safely
     # Get the list of existing indexes
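The commit wires the new collection into connect_to_mongo but adds no index for it. A possible follow-up, shown here only as a sketch (nothing below is in the commit; the existence check mirrors the "create the TTL index safely" step already in this file), would be a unique index on file_hash so duplicates are also rejected at the database level:

# Hypothetical extension of connect_to_mongo() — not part of this commit.
# A unique index speeds up the file_hash lookup in check_if_hash_exists and
# turns a concurrent duplicate insert into a DuplicateKeyError instead of a
# silent second record.
existing_indexes = await mongo_db.processed_documents.index_information()
if "file_hash_1" not in existing_indexes:  # "file_hash_1" is Mongo's default index name
    await mongo_db.processed_documents.create_index("file_hash", unique=True)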
services/document_service.py CHANGED

@@ -38,7 +38,7 @@ def convert_to_text_content(source_path: str) -> str:
     logger.info(f"✅ Successfully extracted content from {source_file.name}.")
     return content
 
-def full_process_and_ingest_pipeline(filepath: str, file_hash: str, embedding_model):
+async def full_process_and_ingest_pipeline(filepath: str, file_hash: str, embedding_model):
     filename = os.path.basename(filepath)
     logger.info(f"BACKGROUND TASK: Starting full pipeline for: {filename} (Hash: {file_hash[:10]}...)")
     weaviate_client = None
@@ -66,7 +66,7 @@ def full_process_and_ingest_pipeline(filepath: str, file_hash: str, embedding_mo
 
     ingest_chunks_with_native_batching(weaviate_client, collection_name, processed_chunks, embeddings_model)
 
-    utils.log_processed_hash(file_hash)
+    await utils.log_processed_hash(file_hash, filename)
     logger.info(f"✅ Successfully ingested '{filename}'.")
     # shutil.move(filepath, os.path.join(config.PROCESSED_FILES_FOLDER, filename))
     logger.info(f"Moved '{filename}' to processed folder.")
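Making full_process_and_ingest_pipeline a coroutine implies the caller now schedules it as an async background task. A minimal sketch of what the calling endpoint might look like, assuming FastAPI (the route, its path, and the embedding-model wiring are assumptions, not code from this repo; FastAPI's BackgroundTasks awaits coroutine functions after the response is sent):

# Hypothetical upload endpoint — a sketch under the assumptions above.
import os
from fastapi import APIRouter, BackgroundTasks, UploadFile

import config
from services.document_service import full_process_and_ingest_pipeline
from utils import utils

router = APIRouter()
embedding_model = None  # placeholder: the real app presumably loads a model at startup

@router.post("/documents/upload")
async def upload_document(file: UploadFile, background_tasks: BackgroundTasks):
    os.makedirs(config.PENDING_UPLOADS_FOLDER, exist_ok=True)
    dest = os.path.join(config.PENDING_UPLOADS_FOLDER, file.filename)
    with open(dest, "wb") as out:
        out.write(await file.read())  # stage the upload under /tmp

    file_hash = utils.calculate_file_hash(dest)
    if await utils.check_if_hash_exists(file_hash):  # Mongo-backed dedup check
        return {"status": "duplicate", "file_hash": file_hash}

    # BackgroundTasks awaits async callables once the response has been sent.
    background_tasks.add_task(
        full_process_and_ingest_pipeline, dest, file_hash, embedding_model
    )
    return {"status": "processing", "file_hash": file_hash}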
utils/utils.py CHANGED

@@ -7,7 +7,7 @@ from typing import List, Optional
 from schemas.chat import Message
 from redis.asyncio import Redis
 import bcrypt
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from jose import jwt
 from config import SECRET_KEY, ALGORITHM, ACCESS_TOKEN_EXPIRE_MINUTES
 from typing import List, Dict, Optional
@@ -455,20 +455,33 @@ def calculate_file_hash(filepath: str) -> str:
             sha256_hash.update(byte_block)
     return sha256_hash.hexdigest()
 
-def check_if_hash_exists(file_hash: str) -> bool:
-    if not os.path.exists(config.PROCESSED_HASH_LOG):
-        return False
-    try:
-        with open(config.PROCESSED_HASH_LOG, "r") as f:
-            processed_hashes = {line.strip() for line in f}
-        return file_hash in processed_hashes
-    except IOError as e:
-        logger.error(f"Could not read hash log file: {e}")
-        return False
-
-def log_processed_hash(file_hash: str):
+# def check_if_hash_exists(file_hash: str) -> bool:
+#     if not os.path.exists(config.PROCESSED_HASH_LOG):
+#         return False
+#     try:
+#         with open(config.PROCESSED_HASH_LOG, "r") as f:
+#             processed_hashes = {line.strip() for line in f}
+#         return file_hash in processed_hashes
+#     except IOError as e:
+#         logger.error(f"Could not read hash log file: {e}")
+#         return False
+
+async def check_if_hash_exists(file_hash: str) -> bool:
+    # Count documents with the matching hash
+    count = await mongo_db.processed_documents.count_documents({"file_hash": file_hash})
+    return count > 0
+
+async def log_processed_hash(file_hash: str, filename: str):
     try:
-        with open(config.PROCESSED_HASH_LOG, "a") as f:
-            f.write(file_hash + "\n")
+        document_record = {
+            "file_hash": file_hash,                      # hash of the file
+            "original_filename": filename,               # original file name
+            "processed_at": datetime.now(timezone.utc),  # processing time
+            "status": "SUCCESS",
+            # Add other fields if needed, e.g.:
+            # "source_url": "https://url_cua_file_tren_s3_hoac_cloudinary",
+            # "user_uploader": user_email
+        }
+        await mongo_db.processed_documents.insert_one(document_record)
     except IOError as e:
         logger.error(f"Could not write to hash log file: {e}")
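Taken together, the two helpers now form a Mongo round-trip: check_if_hash_exists gates reprocessing, log_processed_hash records a successful ingest. A small usage sketch follows (demo_dedup, the sample path, and the import paths are assumptions). One caveat: motor's insert_one raises pymongo.errors subclasses rather than IOError, so the except clause carried over from the file-based version will not catch a failed insert; callers may want to catch PyMongoError themselves, as below.

# Hypothetical end-to-end dedup demo — a sketch, not part of this commit.
import asyncio
import os
from pymongo.errors import PyMongoError  # what a failed insert_one actually raises

from db.mongoDB import connect_to_mongo
from utils import utils

async def demo_dedup(filepath: str):
    await connect_to_mongo()  # assumes the Mongo connection settings are configured
    file_hash = utils.calculate_file_hash(filepath)

    if await utils.check_if_hash_exists(file_hash):
        print(f"Skipping {os.path.basename(filepath)}: hash already recorded")
        return

    # ... the ingestion pipeline would run here ...

    try:
        await utils.log_processed_hash(file_hash, os.path.basename(filepath))
    except PyMongoError as e:  # not caught by the except IOError inside the helper
        print(f"Failed to record processed hash: {e}")

asyncio.run(demo_dedup("/tmp/pending_uploads/sample.docx"))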