Spaces:

entidi2608
/

juribot-backend

Runtime error

App Files Files Community

entidi2608 committed on Jun 25

Commit

d3e9dc7

1 Parent(s): 253c2e1

update: check file uploaded

Browse files

Files changed (8) hide show

Dockerfile +18 -24
config.py +2 -2
db/mongoDB.py +1 -1
dependencies.py +4 -20
main.py +3 -3
routers/documents.py +1 -111
services/document_service.py +2 -67
utils/utils.py +1 -22

Dockerfile CHANGED Viewed

@@ -1,41 +1,33 @@
 # =================================================================
 # STAGE 1: BUILDER - Stage để cài đặt các dependencies nặng
 # =================================================================
-# Sử dụng image đầy đủ để có các công cụ build cần thiết
 FROM python:3.10 as builder
 # Cập nhật và cài đặt các gói hệ thống cho việc build
-# Chỉ cài những gì thực sự cần để `pip install` hoạt động
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
-# Thiết lập thư mục làm việc
 WORKDIR /app
-# Tạo một môi trường ảo (virtual environment) để quản lý dependencies
-# Đây là một thực hành tốt, giúp cô lập thư viện
 RUN python -m venv /opt/venv
-# Kích hoạt venv cho các lệnh RUN tiếp theo
 ENV PATH="/opt/venv/bin:$PATH"
 # Sao chép file requirements trước để tận dụng Docker layer caching
 COPY requirements.txt .
-# Cài đặt tất cả các thư viện Python trong một lệnh RUN duy nhất
-# Điều này giúp tối ưu hóa số lượng layer của Docker
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt
 # =================================================================
 # STAGE 2: FINAL - Stage cuối cùng, nhỏ gọn để chạy ứng dụng
 # =================================================================
-# Bắt đầu từ một image slim siêu nhẹ
 FROM python:3.10-slim
 # Cài đặt chỉ các dependencies hệ thống cần thiết cho RUNTIME
-# Không cần `build-essential`, `git`, `curl` ở đây nữa
 RUN apt-get update && apt-get install -y --no-install-recommends \
     poppler-utils \
     libgl1-mesa-glx \
@@ -43,37 +35,39 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
-# Thiết lập thư mục làm việc
 WORKDIR /app
-# Sao chép môi trường ảo đã được cài đặt sẵn từ stage builder
 COPY --from=builder /opt/venv /opt/venv
-# Kích hoạt virtual environment cho container
 ENV PATH="/opt/venv/bin:$PATH"
 # Thiết lập các biến môi trường quan trọng
-# Thư mục cache sẽ nằm bên trong container
-ENV HF_HOME=/app/cache
 ENV HF_HUB_DISABLE_SYMLINKS_WARNING=1
-# Đảm bảo log Python hiển thị ngay lập tức, rất quan trọng cho việc debug trên Render
 ENV PYTHONUNBUFFERED=1
 # Sao chép toàn bộ mã nguồn của ứng dụng
 COPY . .
 # Tải trước (pre-download/bake) các model vào trong image
-# Điều này giúp giảm đáng kể thời gian khởi động (cold start) trên Render.
-# Các model sẽ được lưu vào thư mục cache đã định nghĩa bởi HF_HOME.
-# **QUAN TRỌNG**: Đảm bảo tên model ở đây khớp chính xác với tên trong file config.py của bạn.
 RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')"
 RUN python -c "from langchain_community.cross_encoders import HuggingFaceCrossEncoder; HuggingFaceCrossEncoder(model_name='cross-encoder/ms-marco-MiniLM-L-6-v2')"
-# Mở cổng mà ứng dụng sẽ lắng nghe bên trong container
-# Port này phải khớp với port trong lệnh CMD
 EXPOSE 7860
-# Lệnh chạy ứng dụng cho PRODUCTION sử dụng Gunicorn
-# Gunicorn ổn định và hiệu quả hơn Uvicorn --reload
-# Nó sẽ tự động sử dụng biến $PORT do Render cung cấp
 CMD ["gunicorn", "-w", "2", "-k", "uvicorn.workers.UvicornWorker", "main:app", "--bind", "0.0.0.0:7860", "--timeout", "120"]

 # =================================================================
 # STAGE 1: BUILDER - Stage để cài đặt các dependencies nặng
 # =================================================================
 FROM python:3.10 as builder
 # Cập nhật và cài đặt các gói hệ thống cho việc build
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 WORKDIR /app
+# Tạo và kích hoạt venv
 RUN python -m venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 # Sao chép file requirements trước để tận dụng Docker layer caching
 COPY requirements.txt .
+# Cài đặt thư viện Python
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt
 # =================================================================
 # STAGE 2: FINAL - Stage cuối cùng, nhỏ gọn để chạy ứng dụng
 # =================================================================
 FROM python:3.10-slim
 # Cài đặt chỉ các dependencies hệ thống cần thiết cho RUNTIME
 RUN apt-get update && apt-get install -y --no-install-recommends \
     poppler-utils \
     libgl1-mesa-glx \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 WORKDIR /app
+# Sao chép môi trường ảo từ stage builder
 COPY --from=builder /opt/venv /opt/venv
+# Kích hoạt virtual environment
 ENV PATH="/opt/venv/bin:$PATH"
+# --- PHẦN SỬA ĐỔI QUAN TRỌNG ---
 # Thiết lập các biến môi trường quan trọng
+# 1. **SỬA LẠI ĐÂY**: Trỏ thư mục cache vào /tmp, nơi ứng dụng có quyền ghi
+#    Điều này sẽ sửa lỗi "Permission denied"
+ENV HF_HOME=/tmp/huggingface_cache
+ENV SENTENCE_TRANSFORMERS_HOME=/tmp/huggingface_cache
+# 2. Tạo thư mục cache và cấp quyền (thực hành tốt)
+RUN mkdir -p /tmp/huggingface_cache && chmod 777 /tmp/huggingface_cache
+# 3. Các biến môi trường khác giữ nguyên
 ENV HF_HUB_DISABLE_SYMLINKS_WARNING=1
 ENV PYTHONUNBUFFERED=1
+# --- KẾT THÚC PHẦN SỬA ĐỔI ---
 # Sao chép toàn bộ mã nguồn của ứng dụng
 COPY . .
 # Tải trước (pre-download/bake) các model vào trong image
+# Bây giờ các model sẽ được lưu vào /tmp/huggingface_cache bên trong image
 RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')"
 RUN python -c "from langchain_community.cross_encoders import HuggingFaceCrossEncoder; HuggingFaceCrossEncoder(model_name='cross-encoder/ms-marco-MiniLM-L-6-v2')"
+# Mở cổng
 EXPOSE 7860
+# Lệnh chạy ứng dụng
 CMD ["gunicorn", "-w", "2", "-k", "uvicorn.workers.UvicornWorker", "main:app", "--bind", "0.0.0.0:7860", "--timeout", "120"]

config.py CHANGED Viewed

@@ -18,7 +18,7 @@ CORE_DATA_FOLDER = os.path.join(BASE_DIR, "data", "core")
 # PROCESSED_FILES_FOLDER = os.path.join(BASE_DIR, "data", "processed_files")
 # FAILED_FILES_FOLDER = os.path.join(BASE_DIR, "data", "failed_files")
 # PROCESSED_HASH_LOG = os.path.join(BASE_DIR, "data", "processed_hashes.log")
-PENDING_UPLOADS_FOLDER = '/tmp/pending_uploads'
 LEGAL_DIC_FOLDER = os.path.join(BASE_DIR, "data", "dictionary")
 # Cấu hình cho DB
@@ -53,7 +53,7 @@ FRONTEND_URL = os.environ.get("FRONTEND_URL")
 APP_ENVIRONMENT = os.environ.get("APP_ENVIRONMENT")
-CHECKPOINT_FILE = "processed_files.log"
 MONGODB_CLOUD_URI= os.environ.get("MONGODB_CLOUD_URI")
 DB_NAME= os.environ.get("DB_NAME")

 # PROCESSED_FILES_FOLDER = os.path.join(BASE_DIR, "data", "processed_files")
 # FAILED_FILES_FOLDER = os.path.join(BASE_DIR, "data", "failed_files")
 # PROCESSED_HASH_LOG = os.path.join(BASE_DIR, "data", "processed_hashes.log")
+# PENDING_UPLOADS_FOLDER = '/tmp/pending_uploads'
 LEGAL_DIC_FOLDER = os.path.join(BASE_DIR, "data", "dictionary")
 # Cấu hình cho DB
 APP_ENVIRONMENT = os.environ.get("APP_ENVIRONMENT")
+# CHECKPOINT_FILE = "processed_files.log"
 MONGODB_CLOUD_URI= os.environ.get("MONGODB_CLOUD_URI")
 DB_NAME= os.environ.get("DB_NAME")

db/mongoDB.py CHANGED Viewed

@@ -31,7 +31,7 @@ async def connect_to_mongo():
     Hàm này sẽ được gọi từ lifespan của FastAPI.
     """
     if mongo_db.client:
-        logger.info("MongoDB connection already established.")
         return
     logger.info(f"🔸 Connecting to MongoDB Atlas...")

     Hàm này sẽ được gọi từ lifespan của FastAPI.
     """
     if mongo_db.client:
+        logger.info("✅ MongoDB connection already established.")
         return
     logger.info(f"🔸 Connecting to MongoDB Atlas...")

dependencies.py CHANGED Viewed

@@ -38,22 +38,6 @@ def get_app_state(request: Request):
         raise RuntimeError("Application state ('app_state') not found. Initialization failed?")
     return request.app.state.app_state
-# def initialize_redis_client():
-#     redis_url = os.environ.get("REDIS_URL")
-#     if not redis_url:
-#         logger.error("🔸[Redis] REDIS_URL environment variable not set.")
-#         raise ValueError("REDIS_URL is not configured.")
-#     try:
-#         logger.info(f"🔸[Redis] Attempting to connect to Redis at {redis_url}...")
-#         client = redis.Redis.from_url(redis_url, socket_connect_timeout=5, socket_timeout=5)
-#         logger.info("🔸[Redis] Connected successfully and pinged.")
-#         return client
-#     except redis.exceptions.ConnectionError as e:
-#         logger.error(f"🔸[Redis] Connection failed for URL '{redis_url}': {e}")
-#         raise ConnectionError(f"Failed to connect to Redis: {e}")
-#     except Exception as e:
-#         logger.error(f"🔸[Redis] Error initializing Redis from URL '{redis_url}': {e}")
-#         raise RuntimeError(f"Error initializing Redis: {e}")
 async def initialize_api_components(app_state: AppState):
     """Khởi tạo các thành phần cần thiết cho API """
@@ -184,7 +168,7 @@ async def get_current_user(
         logger.error(f"GET_CURRENT_USER: *** KHÔNG TÌM THẤY TOKEN (Nguồn: {source_of_token}) - RAISING 401 ***")
         raise credentials_exception
-    logger.info(f"GET_CURRENT_USER: Token để verify (nguồn: {source_of_token}): {token_to_verify[:20]}...")
     # 1. Kiểm tra token trong blacklist
     try:
@@ -221,7 +205,7 @@ async def get_current_user(
         email = payload.get("sub")
         exp = payload.get("exp")
-        logger.info(f"GET_CURRENT_USER: JWT decode thành công - email: {email}, exp: {exp}")
         if not isinstance(email, str) or not email:
             logger.error("GET_CURRENT_USER: *** EMAIL KHÔNG HỢP LỆ TRONG TOKEN ***")
@@ -265,7 +249,7 @@ async def get_current_user(
     # 3. Lấy thông tin người dùng từ database
     user_data: Optional[dict] = None # Khởi tạo để tránh UnboundLocalError
     try:
-        logger.info(f"GET_CURRENT_USER: Đang tìm user trong DB: {email.lower()}") # email đã được validate là str
         user_data = await mongo_db.users.find_one({"email": email.lower()}, {"password": 0, "_id": 0})
         # print(user_data) # Bỏ print trong production
@@ -273,7 +257,7 @@ async def get_current_user(
             logger.error(f"GET_CURRENT_USER: *** KHÔNG TÌM THẤY USER TRONG DB ({email.lower()}) - RAISING 401 ***")
             raise credentials_exception
-        logger.info(f"GET_CURRENT_USER: Tìm thấy user - data: {user_data}")
     except HTTPException:
         raise

         raise RuntimeError("Application state ('app_state') not found. Initialization failed?")
     return request.app.state.app_state
 async def initialize_api_components(app_state: AppState):
     """Khởi tạo các thành phần cần thiết cho API """
         logger.error(f"GET_CURRENT_USER: *** KHÔNG TÌM THẤY TOKEN (Nguồn: {source_of_token}) - RAISING 401 ***")
         raise credentials_exception
+    # logger.info(f"GET_CURRENT_USER: Token để verify (nguồn: {source_of_token}): {token_to_verify[:20]}...")
     # 1. Kiểm tra token trong blacklist
     try:
         email = payload.get("sub")
         exp = payload.get("exp")
+        # logger.info(f"GET_CURRENT_USER: JWT decode thành công - email: {email}, exp: {exp}")
         if not isinstance(email, str) or not email:
             logger.error("GET_CURRENT_USER: *** EMAIL KHÔNG HỢP LỆ TRONG TOKEN ***")
     # 3. Lấy thông tin người dùng từ database
     user_data: Optional[dict] = None # Khởi tạo để tránh UnboundLocalError
     try:
+        # logger.info(f"GET_CURRENT_USER: Đang tìm user trong DB: {email.lower()}") # email đã được validate là str
         user_data = await mongo_db.users.find_one({"email": email.lower()}, {"password": 0, "_id": 0})
         # print(user_data) # Bỏ print trong production
             logger.error(f"GET_CURRENT_USER: *** KHÔNG TÌM THẤY USER TRONG DB ({email.lower()}) - RAISING 401 ***")
             raise credentials_exception
+        # logger.info(f"GET_CURRENT_USER: Tìm thấy user - data: {user_data}")
     except HTTPException:
         raise

main.py CHANGED Viewed

@@ -22,7 +22,7 @@ logger = logging.getLogger(__name__)
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    logger.info("✅ [Lifespan] STARTING UP...")
     current_app_state_instance = AppState()
     initialization_successful = False
@@ -71,7 +71,7 @@ app = FastAPI(
 app.add_middleware(
     SessionMiddleware,
-    secret_key=os.environ.get("SESSION_SECRET_KEY", "a_very_secret_key_for_development")
 )
@@ -93,7 +93,7 @@ app.include_router(health_router, prefix="/api", tags=["Status"]) # Hoặc chỉ
 # Run with Uvicorn
 if __name__ == "__main__":
-    logger.info("=> Chạy FastAPI server với Uvicorn...")
     is_dev_mode = config.APP_ENVIRONMENT.lower() == "development"
     uvicorn.run(
         "main:app", # Đảm bảo "main" là tên file python của bạn

 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    logger.info("🚀 [Lifespan] STARTING UP...")
     current_app_state_instance = AppState()
     initialization_successful = False
 app.add_middleware(
     SessionMiddleware,
+    secret_key=os.environ.get("SESSION_SECRET_KEY")
 )
 # Run with Uvicorn
 if __name__ == "__main__":
+    logger.info("🚀 Chạy FastAPI server với Uvicorn...")
     is_dev_mode = config.APP_ENVIRONMENT.lower() == "development"
     uvicorn.run(
         "main:app", # Đảm bảo "main" là tên file python của bạn

routers/documents.py CHANGED Viewed

@@ -1,113 +1,3 @@
-# from fastapi import APIRouter, UploadFile, File, BackgroundTasks, HTTPException, Depends, Request
-# import os
-# import time
-# import shutil
-# from schemas.user import UserOut
-# from dependencies import get_current_user
-# import logging
-# from typing import List
-# import config
-# from utils.utils import calculate_file_hash, check_if_hash_exists
-# from services.document_service import full_process_and_ingest_pipeline
-# from dependencies import get_app_state
-# logger = logging.getLogger(__name__)
-# router = APIRouter()
-# ALLOWED_EXTENSIONS = {".pdf", ".docx", ".doc"}
-# @router.post("/upload", status_code=202)
-# async def upload_and_ingest_documents(
-#     fastapi_request: Request,
-#     background_tasks: BackgroundTasks,
-#     current_user: UserOut = Depends(get_current_user),
-#     files: List[UploadFile] = File(..., description="Một hoặc nhiều file tài liệu cần upload.")
-# ):
-#     """
-#     Endpoint duy nhất để upload một hoặc nhiều tài liệu.
-#     - **files**: Danh sách các file tài liệu cần upload.
-#     - API sẽ xử lý từng file trong nền và trả về ngay một báo cáo tổng hợp.
-#     - File trùng lặp (dựa trên nội dung) hoặc có định dạng không hỗ trợ sẽ bị bỏ qua.
-#     """
-#     # Dòng này giờ sẽ chạy thành công vì bạn có quyền ghi vào /tmp
-#     os.makedirs(config.PENDING_UPLOADS_FOLDER , exist_ok=True)
-#     app_state = get_app_state(request=fastapi_request)
-#     embedding_model = app_state.embeddings
-#     if not files:
-#         raise HTTPException(status_code=400, detail="No files were uploaded.")
-#     accepted_files = []
-#     skipped_files = []
-#     for file in files:
-#         temp_file_path = None
-#         try:
-#             # Kiểm tra định dạng file
-#             file_extension = os.path.splitext(file.filename)[1].lower()
-#             if file_extension not in ALLOWED_EXTENSIONS:
-#                 skipped_files.append({"filename": file.filename, "reason": "Unsupported file type"})
-#                 continue
-#             # 1. Lưu file tạm để tính hash
-#             # Thêm timestamp để tránh xung đột tên file nếu upload nhiều file cùng tên trong 1 request
-#             temp_filename = f"temp_{int(time.time()*1000)}_{file.filename}"
-#             temp_file_path = os.path.join(config.PENDING_UPLOADS_FOLDER, temp_filename)
-#             with open(temp_file_path, "wb") as buffer:
-#                 shutil.copyfileobj(file.file, buffer)
-#             # 2. Tính toán hash
-#             file_hash = calculate_file_hash(temp_file_path)
-#             # 3. Kiểm tra trùng lặp
-#             if await check_if_hash_exists(file_hash):
-#                 skipped_files.append({"filename": file.filename, "reason": "Duplicate file content"})
-#                 os.remove(temp_file_path)
-#                 continue
-#             # 4. File hợp lệ, chuẩn bị để xử lý
-#             final_filename = file.filename
-#             final_file_path = os.path.join(config.PENDING_UPLOADS_FOLDER, final_filename)
-#             # Xử lý nếu tên file đã tồn tại để tránh ghi đè
-#             if os.path.exists(final_file_path):
-#                  base, ext = os.path.splitext(final_filename)
-#                  final_filename = f"{base}_{file_hash[:8]}{ext}"
-#                  final_file_path = os.path.join(config.PENDING_UPLOADS_FOLDER, final_filename)
-#             os.rename(temp_file_path, final_file_path)
-#             temp_file_path = None # Đánh dấu là đã di chuyển
-#             # 5. Thêm tác vụ nền cho file này
-#             background_tasks.add_task(full_process_and_ingest_pipeline, final_file_path, file_hash,embedding_model)
-#             accepted_files.append({"filename": final_filename, "hash": file_hash})
-#         except Exception as e:
-#             logger.error(f"Error processing {file.filename} in upload batch: {e}", exc_info=True)
-#             skipped_files.append({"filename": file.filename, "reason": f"Server error: {str(e)}"})
-#             if temp_file_path and os.path.exists(temp_file_path):
-#                 os.remove(temp_file_path)
-#     # Nếu không có file nào được chấp nhận sau khi lọc
-#     if not accepted_files:
-#         raise HTTPException(
-#             status_code=400,
-#             detail={"message": "No valid new files were accepted for processing.", "skipped_files": skipped_files}
-#         )
-#     # Trả về kết quả tổng hợp
-#     return {
-#         "message": f"Request completed. Accepted {len(accepted_files)} files for background processing.",
-#         "accepted_files": accepted_files,
-#         "skipped_files": skipped_files
-#     }
-# routers/documents.py
 import os
 import hashlib
 from typing import List
@@ -116,7 +6,7 @@ from io import BytesIO
 from fastapi import APIRouter, UploadFile, File, HTTPException, Depends, BackgroundTasks
 from fastapi.concurrency import run_in_threadpool
-from utils.utils import  check_if_hash_exists
 from services.document_service import full_process_and_ingest_pipeline,convert_to_text_content

 import os
 import hashlib
 from typing import List
 from fastapi import APIRouter, UploadFile, File, HTTPException, Depends, BackgroundTasks
 from fastapi.concurrency import run_in_threadpool
+from services.document_service import  check_if_hash_exists
 from services.document_service import full_process_and_ingest_pipeline,convert_to_text_content

services/document_service.py CHANGED Viewed

@@ -18,71 +18,6 @@ logger = logging.getLogger(__name__)
 from rag_components import create_weaviate_schema_if_not_exists, ingest_chunks_with_native_batching
 from utils.process_data import hierarchical_split_law_document,extract_document_metadata,clean_document_text,infer_field, infer_entity_type, filter_and_serialize_complex_metadata
-# def convert_to_text_content(source_path: str) -> str:
-#     source_file = Path(source_path)
-#     file_extension = source_file.suffix.lower()
-#     logger.info(f"Extracting content from: {source_file.name}")
-#     content = ""
-#     if file_extension == ".pdf":
-#         parser = LlamaParse( api_key=config.LLAMA_CLOUD_API_KEY,
-#                     result_type="text",
-#                     verbose=True, # Giữ verbose để theo dõi
-#                     language="vi")
-#         documents = parser.load_data([str(source_file)])
-#         if documents: content = documents[0].text
-#     elif file_extension == ".docx":
-#         doc = docx.Document(source_path)
-#         content = '\n'.join([para.text for para in doc.paragraphs])
-#     elif file_extension == ".doc":
-#         content = pypandoc.convert_file(source_path, 'plain', format='doc')
-#     else:
-#         raise ValueError(f"Unsupported file format: {file_extension}")
-#     if not content.strip():
-#         raise ValueError("Extracted content is empty.")
-#     logger.info(f"✅ Successfully extracted content from {source_file.name}.")
-#     return content
-# async def full_process_and_ingest_pipeline(filepath: str, file_hash: str, embedding_model):
-#     filename = os.path.basename(filepath)
-#     logger.info(f"BACKGROUND TASK: Starting full pipeline for: {filename} (Hash: {file_hash[:10]}...)")
-#     weaviate_client = None
-#     try:
-#         raw_content = convert_to_text_content(filepath)
-#         doc_metadata = extract_document_metadata(raw_content, filename)
-#         doc_metadata["source"] = filename
-#         cleaned_content = clean_document_text(raw_content)
-#         doc_metadata["field"] = infer_field(cleaned_content, doc_metadata.get("ten_van_ban"))
-#         doc_metadata["entity_type"] = infer_entity_type(cleaned_content, doc_metadata.get("field", ""))
-#         doc_to_split = Document(page_content=cleaned_content, metadata=doc_metadata)
-#         chunks_from_file = hierarchical_split_law_document(doc_to_split)
-#         if not chunks_from_file:
-#             raise ValueError("File did not yield any chunks after processing.")
-#         processed_chunks = filter_and_serialize_complex_metadata(chunks_from_file)
-#         weaviate_client = connect_to_weaviate()
-#         embeddings_model = embedding_model
-#         collection_name = config.WEAVIATE_COLLECTION_NAME
-#         create_weaviate_schema_if_not_exists(weaviate_client, collection_name)
-#         ingest_chunks_with_native_batching(weaviate_client, collection_name, processed_chunks, embeddings_model)
-#         await utils.log_processed_hash(file_hash, filename)
-#         logger.info(f"✅ Successfully ingested '{filename}'.")
-#         # shutil.move(filepath, os.path.join(config.PROCESSED_FILES_FOLDER, filename))
-#         logger.info(f"Moved '{filename}' to processed folder.")
-#     except Exception as e:
-#         logger.error(f"❌ FAILED pipeline for '{filename}': {e}", exc_info=True)
-#         # shutil.move(filepath, os.path.join(config.FAILED_FILES_FOLDER, filename))
-#         logger.info(f"Moved '{filename}' to failed folder.")
-#     finally:
-#         if weaviate_client and weaviate_client.is_connected():
-#             weaviate_client.close()
 # --- SỬA LẠI HÀM NÀY ĐỂ NHẬN STREAM ---
 def convert_to_text_content(source_stream: BytesIO, original_filename: str) -> str:
     """Trích xuất nội dung text từ một stream trong bộ nhớ."""
@@ -147,7 +82,7 @@ async def full_process_and_ingest_pipeline(raw_content: str, filename: str, file
         processed_chunks = filter_and_serialize_complex_metadata(chunks_from_file)
         # Giai đoạn 2: Ingest vào Weaviate (I/O-bound và CPU-bound)
-        weaviate_client = connect_to_weaviate()
         await run_in_threadpool(create_weaviate_schema_if_not_exists, weaviate_client, config.WEAVIATE_COLLECTION_NAME)
@@ -191,4 +126,4 @@ async def log_failed_process(file_hash: str, filename: str, error_message: str):
 # Hàm kiểm tra trùng lặp
 async def check_if_hash_exists(file_hash: str) -> bool:
     count = await mongo_db.processed_documents.count_documents({"file_hash": file_hash, "status": "SUCCESS"})
-    return count > 0

 from rag_components import create_weaviate_schema_if_not_exists, ingest_chunks_with_native_batching
 from utils.process_data import hierarchical_split_law_document,extract_document_metadata,clean_document_text,infer_field, infer_entity_type, filter_and_serialize_complex_metadata
 # --- SỬA LẠI HÀM NÀY ĐỂ NHẬN STREAM ---
 def convert_to_text_content(source_stream: BytesIO, original_filename: str) -> str:
     """Trích xuất nội dung text từ một stream trong bộ nhớ."""
         processed_chunks = filter_and_serialize_complex_metadata(chunks_from_file)
         # Giai đoạn 2: Ingest vào Weaviate (I/O-bound và CPU-bound)
+        weaviate_client = connect_to_weaviate(run_diagnostics=False)
         await run_in_threadpool(create_weaviate_schema_if_not_exists, weaviate_client, config.WEAVIATE_COLLECTION_NAME)
 # Hàm kiểm tra trùng lặp
 async def check_if_hash_exists(file_hash: str) -> bool:
     count = await mongo_db.processed_documents.count_documents({"file_hash": file_hash, "status": "SUCCESS"})
+    return count > 0

utils/utils.py CHANGED Viewed

@@ -7,7 +7,7 @@ from typing import List, Optional
 from schemas.chat import  Message
 from redis.asyncio import Redis
 import bcrypt
-from datetime import datetime, timedelta, timezone
 from jose import jwt
 from config import SECRET_KEY, ALGORITHM, ACCESS_TOKEN_EXPIRE_MINUTES
 from typing import List, Dict,  Optional
@@ -444,7 +444,6 @@ async def get_langchain_chat_history(app_state, chat_id: str) -> RedisChatMessag
 # api/utils.py
 import hashlib
-import config
 logger = logging.getLogger(__name__)
@@ -465,23 +464,3 @@ def calculate_file_hash(filepath: str) -> str:
 #     except IOError as e:
 #         logger.error(f"Could not read hash log file: {e}")
 #         return False
-async def check_if_hash_exists(file_hash: str) -> bool:
-    # Đếm số document có hash tương ứng
-    count = await mongo_db.processed_documents.count_documents({"file_hash": file_hash})
-    return count > 0
-async def log_processed_hash(file_hash: str, filename: str):
-    try:
-        document_record = {
-            "file_hash": file_hash,          # Hash của file
-            "original_filename": filename,   # Tên file gốc
-            "processed_at": datetime.now(timezone.utc), # Thời gian xử lý
-            "status": "SUCCESS",
-            # Thêm các thông tin khác nếu cần, ví dụ:
-            # "source_url": "https://url_cua_file_tren_s3_hoac_cloudinary",
-            # "user_uploader": user_email
-        }
-        await mongo_db.processed_documents.insert_one(document_record)
-    except IOError as e:
-        logger.error(f"Could not write to hash log file: {e}")

 from schemas.chat import  Message
 from redis.asyncio import Redis
 import bcrypt
+from datetime import datetime, timedelta
 from jose import jwt
 from config import SECRET_KEY, ALGORITHM, ACCESS_TOKEN_EXPIRE_MINUTES
 from typing import List, Dict,  Optional
 # api/utils.py
 import hashlib
 logger = logging.getLogger(__name__)
 #     except IOError as e:
 #         logger.error(f"Could not read hash log file: {e}")
 #         return False