Spaces:

entidi2608
/

juribot-backend

Runtime error

App Files Files Community

entidi2608 committed on Jun 26

Commit

4f7e10b

1 Parent(s): b45d362

update: upload docs

Browse files

Files changed (2) hide show

config.py +1 -1
services/document_service.py +94 -14

config.py CHANGED Viewed

@@ -44,7 +44,7 @@ SECRET_KEY = os.environ.get("SECRET_KEY")
 ALGORITHM = os.environ.get("ALGORITHM", "HS256")
 ACCESS_TOKEN_EXPIRE_MINUTES = os.environ.get("ACCESS_TOKEN_EXPIRE_MINUTES", 60)
-LLAMA_CLOUD_API_KEY=os.environ.get("LLAMA_CLOUD_API_KEY")
 GOOGLE_CLIENT_ID = os.environ.get("GOOGLE_CLIENT_ID")
 GOOGLE_CLIENT_SECRET = os.environ.get("GOOGLE_CLIENT_SECRET")

 ALGORITHM = os.environ.get("ALGORITHM", "HS256")
 ACCESS_TOKEN_EXPIRE_MINUTES = os.environ.get("ACCESS_TOKEN_EXPIRE_MINUTES", 60)
# LlamaParse API keys. Fall back to the previous single-key variable so a
# deployment that still defines only LLAMA_CLOUD_API_KEY keeps working after
# the rename instead of yielding None here.
LLAMA_CLOUD_API_KEYS = os.environ.get("LLAMA_CLOUD_API_KEYS") or os.environ.get("LLAMA_CLOUD_API_KEY")
 GOOGLE_CLIENT_ID = os.environ.get("GOOGLE_CLIENT_ID")
 GOOGLE_CLIENT_SECRET = os.environ.get("GOOGLE_CLIENT_SECRET")

services/document_service.py CHANGED Viewed

@@ -18,38 +18,118 @@ logger = logging.getLogger(__name__)
 from rag_components import create_weaviate_schema_if_not_exists, ingest_chunks_with_native_batching
 from utils.process_data import hierarchical_split_law_document,extract_document_metadata,clean_document_text,infer_field, infer_entity_type, filter_and_serialize_complex_metadata
 # --- SỬA LẠI HÀM NÀY ĐỂ NHẬN STREAM ---
 def convert_to_text_content(source_stream: BytesIO, original_filename: str) -> str:
     """Extract the text content from an in-memory stream."""
     file_extension = Path(original_filename).suffix.lower()
     logger.info(f"Extracting content from: {original_filename}")
     content = ""
     if file_extension == ".pdf":
-        # LlamaParse may need the data written to a temporary file, so we have to handle that
-        # Option 1: write the stream out to a temporary file under /tmp
         temp_pdf_path = f"/tmp/{original_filename}"
-        with open(temp_pdf_path, "wb") as f:
-            f.write(source_stream.getvalue())
-        parser = LlamaParse(api_key=config.LLAMA_CLOUD_API_KEY, result_type="text", verbose=True, language="vi")
-        documents = parser.load_data([temp_pdf_path])
-        os.remove(temp_pdf_path) # Clean up immediately
-        if documents: content = documents[0].text
     elif file_extension == ".docx":
         doc = docx.Document(source_stream)
         content = '\n'.join([para.text for para in doc.paragraphs])
     elif file_extension == ".doc":
         # pypandoc needs a file on disk
         temp_doc_path = f"/tmp/{original_filename}"
-        with open(temp_doc_path, "wb") as f:
-            f.write(source_stream.getvalue())
-        content = pypandoc.convert_file(temp_doc_path, 'plain', format='doc')
-        os.remove(temp_doc_path) # Clean up immediately
     else:
-        raise ValueError(f"Unsupported file format: {file_extension}")
-    logger.info(f"✅ Successfully extracted content from {original_filename}.")
     return content

 from rag_components import create_weaviate_schema_if_not_exists, ingest_chunks_with_native_batching
 from utils.process_data import hierarchical_split_law_document,extract_document_metadata,clean_document_text,infer_field, infer_entity_type, filter_and_serialize_complex_metadata
+class ApiKeyManager:
+    """Quản lý một danh sách các API key."""
+    def __init__(self, api_key_string: str):
+        self.keys = [key.strip() for key in api_key_string.split(',') if key.strip()]
+        if not self.keys:
+            raise ValueError("Chuỗi API key không hợp lệ hoặc rỗng.")
+        self.current_key_index = 0
+        logger.info(f"Đã khởi tạo Key Manager với {len(self.keys)} key.")
+    def get_key(self) -> str | None:
+        """Trả về key hiện tại."""
+        if self.current_key_index < len(self.keys):
+            return self.keys[self.current_key_index]
+        return None
+    def get_next_key(self) -> str | None:
+        """Chuyển sang key tiếp theo và trả về nó."""
+        self.current_key_index += 1
+        logger.warning(f"Chuyển sang sử dụng API key tiếp theo (index: {self.current_key_index}).")
+        return self.get_key()
+    def reset(self):
+        """Reset lại index để bắt đầu từ key đầu tiên cho lần xử lý mới."""
+        self.current_key_index = 0
+        logger.info("Key Manager đã được reset.")
+llama_key_manager = ApiKeyManager(config.LLAMA_CLOUD_API_KEYS)
 # --- SỬA LẠI HÀM NÀY ĐỂ NHẬN STREAM ---
 def _write_stream_to_temp_file(source_stream: BytesIO, suffix: str) -> str:
     """Copy the whole stream into a uniquely named temp file; return its path.

     The name is generated by ``tempfile`` instead of being derived from the
     uploaded filename, so a crafted name such as ``../../x.pdf`` cannot point
     outside the temp directory and two uploads sharing a name cannot
     overwrite each other. ``suffix`` keeps the original extension.
     """
     import tempfile  # local import so the block does not depend on file-level imports

     temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
     try:
         with temp_file:
             temp_file.write(source_stream.getvalue())
     except BaseException:
         # Do not leave a half-written file behind if the write fails.
         _remove_temp_file(temp_file.name)
         raise
     return temp_file.name


 def _remove_temp_file(temp_path: str) -> None:
     """Delete ``temp_path`` if it still exists and log the cleanup."""
     try:
         os.remove(temp_path)
     except FileNotFoundError:
         return
     logger.debug(f"Đã dọn dẹp file tạm: {temp_path}")


 def _convert_pdf_with_key_rotation(temp_pdf_path: str, original_filename: str) -> str:
     """Parse the PDF with LlamaParse, moving to the next API key on any failure."""
     # Always start from the first key for a new document.
     llama_key_manager.reset()
     while (current_key := llama_key_manager.get_key()) is not None:
         try:
             logger.info(f"Đang thử chuyển đổi PDF '{original_filename}' bằng key index: {llama_key_manager.current_key_index}...")
             parser = LlamaParse(
                 api_key=current_key,
                 result_type="text",
                 verbose=True,  # kept for debugging
                 language="vi",
             )
             documents = parser.load_data([temp_pdf_path])
             # An empty result is treated like a failure so the next key is tried.
             if not (documents and documents[0].text.strip()):
                 raise ValueError("LlamaParse trả về nội dung rỗng.")
             logger.info(f"✅ Chuyển đổi PDF thành công bằng key index: {llama_key_manager.current_key_index}.")
             # NOTE(review): only the first returned document is used - confirm
             # LlamaParse returns a single document per input file here.
             return documents[0].text
         except Exception as e:
             logger.error(f"❌ Lỗi với key index {llama_key_manager.current_key_index} cho file '{original_filename}': {e}")
             if llama_key_manager.get_next_key() is None:
                 logger.critical("Đã thử hết tất cả các API key nhưng đều thất bại cho file PDF.")
                 raise RuntimeError(f"Không thể chuyển đổi file '{original_filename}' sau khi đã thử tất cả các API key.") from e
     # Defensive: only reachable if the manager had no key to offer at all.
     raise ValueError(f"Không thể trích xuất nội dung từ PDF '{original_filename}' sau khi thử các key.")


 def convert_to_text_content(source_stream: BytesIO, original_filename: str) -> str:
     """Extract the text content from an in-memory stream.

     Args:
         source_stream: The uploaded file's bytes.
         original_filename: Name of the upload; only its extension selects the
             extraction path (``.pdf``, ``.docx`` or ``.doc``).

     Returns:
         The extracted text, guaranteed to contain non-whitespace characters.

     Raises:
         ValueError: If the extension is unsupported or the extracted text is
             empty.
         RuntimeError: If every LlamaParse API key failed for a PDF.
     """
     file_extension = Path(original_filename).suffix.lower()
     logger.info(f"Extracting content from: {original_filename}")
     content = ""
     source_stream.seek(0)

     if file_extension == ".pdf":
         # LlamaParse is given a file path, so the stream is written to disk
         # once and that path is reused for every key attempt.
         temp_pdf_path = _write_stream_to_temp_file(source_stream, file_extension)
         try:
             content = _convert_pdf_with_key_rotation(temp_pdf_path, original_filename)
         finally:
             # Always clean up, whether the conversion succeeded or not.
             _remove_temp_file(temp_pdf_path)
     elif file_extension == ".docx":
         # python-docx can read directly from the stream.
         doc = docx.Document(source_stream)
         content = '\n'.join(para.text for para in doc.paragraphs)
     elif file_extension == ".doc":
         # pypandoc needs a file on disk.
         # NOTE(review): pandoc's documented input formats do not appear to
         # include legacy binary .doc - confirm format='doc' works as deployed.
         temp_doc_path = _write_stream_to_temp_file(source_stream, file_extension)
         try:
             content = pypandoc.convert_file(temp_doc_path, 'plain', format='doc')
         finally:
             _remove_temp_file(temp_doc_path)
     else:
         raise ValueError(f"Định dạng file không được hỗ trợ: {file_extension}")

     if not content.strip():
         raise ValueError(f"Nội dung trích xuất từ '{original_filename}' bị rỗng.")
     logger.info(f"✅ Trích xuất nội dung thành công từ stream của file: {original_filename}.")
     return content