Spaces:

entidi2608
/

juribot-backend

Runtime error

App Files Files Community

juribot-backend / routers /documents.py

entidi2608

update: check file uploaded

d3e9dc7 2 months ago

raw

history blame contribute delete

4.27 kB

	import os
	import hashlib
	from typing import List
	from io import BytesIO

	from fastapi import APIRouter, UploadFile, File, HTTPException, Depends, BackgroundTasks
	from fastapi.concurrency import run_in_threadpool

	from services.document_service import check_if_hash_exists
	from services.document_service import full_process_and_ingest_pipeline,convert_to_text_content


	from config import ALLOWED_EXTENSIONS
	from dependencies import get_current_user
	from schemas.user import UserOut
	import logging
	logger = logging.getLogger(__name__)

	router = APIRouter()


	def calculate_hash_from_content(content: bytes) -> str:
	"""Tính toán SHA256 hash từ nội dung bytes của file."""
	return hashlib.sha256(content).hexdigest()


	@router.post("/upload", status_code=202)
	async def upload_and_ingest_documents(
	background_tasks: BackgroundTasks,
	current_user: UserOut = Depends(get_current_user),
	files: List[UploadFile] = File(..., description="Một hoặc nhiều file tài liệu cần upload.")
	):
	"""
	Endpoint đã được tối ưu để upload và xử lý tài liệu.
	- Xử lý file trong bộ nhớ để tránh lỗi permission và tăng tốc.
	- Trích xuất text ngay trong request để báo lỗi sớm (fail-fast).
	- Đẩy các tác vụ xử lý nặng (NLP, embedding, ingest) vào nền.
	"""
	if not files:
	raise HTTPException(status_code=400, detail="No files were uploaded.")

	accepted_files = []
	skipped_files = []

	for file in files:
	try:
	# 1. KIỂM TRA ĐỊNH DẠNG
	file_extension = os.path.splitext(file.filename)[1].lower()
	if file_extension not in ALLOWED_EXTENSIONS:
	skipped_files.append({"filename": file.filename, "reason": "Unsupported file type"})
	continue

	# 2. ĐỌC NỘI DUNG FILE VÀO BỘ NHỚ
	file_content = await file.read()
	if not file_content:
	skipped_files.append({"filename": file.filename, "reason": "Empty file"})
	continue

	# 3. TÍNH HASH & KIỂM TRA TRÙNG LẶP
	file_hash = calculate_hash_from_content(file_content)
	if await check_if_hash_exists(file_hash):
	skipped_files.append({"filename": file.filename, "reason": "Duplicate file (content already exists)"})
	continue

	# 4. TRÍCH XUẤT TEXT (FAIL-FAST) - Chạy trong thread pool để không block
	# Hàm convert_to_text_content cần được sửa để nhận BytesIO
	try:
	raw_content = await run_in_threadpool(convert_to_text_content, BytesIO(file_content), file.filename)
	if not raw_content or not raw_content.strip():
	raise ValueError("Extracted content is empty.")
	except Exception as extraction_error:
	logger.error(f"Failed to extract text from {file.filename}: {extraction_error}")
	skipped_files.append({"filename": file.filename, "reason": f"Failed to extract content: {extraction_error}"})
	continue

	# 5. THÊM TÁC VỤ NỀN VỚI DỮ LIỆU TEXT
	# Chỉ truyền dữ liệu cơ bản, không truyền đối tượng phức tạp
	background_tasks.add_task(
	full_process_and_ingest_pipeline,
	raw_content=raw_content,
	filename=file.filename,
	file_hash=file_hash
	)

	accepted_files.append({"filename": file.filename, "hash": file_hash})

	except Exception as e:
	logger.error(f"Error preparing {file.filename} for background processing: {e}", exc_info=True)
	skipped_files.append({"filename": file.filename, "reason": f"Unexpected server error: {str(e)}"})

	if not accepted_files:
	raise HTTPException(
	status_code=400,
	detail={"message": "No valid new files were accepted for processing.", "skipped_files": skipped_files}
	)

	return {
	"message": f"Request received. Accepted {len(accepted_files)} files for background processing.",
	"accepted_files": accepted_files,
	"skipped_files": skipped_files
	}