"""FastAPI service exposing a single endpoint that splits text into semantic chunks."""

import logging
from functools import lru_cache
from typing import Optional

from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel

# NOTE: ``ServiceContext`` and ``LlamaCPP`` are no longer used by this module
# (the semantic splitter only needs an embedding model). The imports are kept
# so that any code importing these names from here keeps working.
from llama_index.core import Document, ServiceContext  # noqa: F401
from llama_index.llms.llama_cpp import LlamaCPP  # noqa: F401
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SemanticSplitterNodeParser

logger = logging.getLogger(__name__)

# Open-source embedding model loaded through Hugging Face.
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"

app = FastAPI()


class ChunkRequest(BaseModel):
    """Schema of the JSON request body sent to ``POST /chunk``."""

    # Text to split into chunks.
    text: str
    # The fields below are not interpreted by the service; they are echoed
    # back unchanged in the response.
    source_id: Optional[str] = None
    titre: Optional[str] = None
    source: Optional[str] = None
    type: Optional[str] = None


@lru_cache(maxsize=1)
def _get_splitter() -> SemanticSplitterNodeParser:
    """Build the semantic splitter once and reuse it for every request.

    Loading the embedding model is expensive, so it must not happen per
    request. The splitter is created lazily on first use rather than at
    import time, so importing this module stays cheap.
    """
    embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)
    # The splitter is configured with an embedding model directly; it does
    # not accept a ``service_context`` keyword.
    return SemanticSplitterNodeParser.from_defaults(embed_model=embed_model)


@app.post("/chunk")
async def chunk_text(data: ChunkRequest):
    """Split ``data.text`` into semantic chunks.

    Args:
        data: Request body; ``text`` is chunked, the remaining fields are
            returned as received.

    Returns:
        On success, a dict with ``chunks`` (chunk texts), ``metadatas``
        (per-chunk metadata) and the echoed ``source_id``, ``titre``,
        ``source`` and ``type`` fields. On failure, a JSON response with
        status 500 and body ``{"error": <message>}``.
    """
    try:
        if data.text.strip():
            splitter = _get_splitter()
            nodes = splitter.get_nodes_from_documents([Document(text=data.text)])
        else:
            # Nothing to embed: skip the model entirely for blank input.
            nodes = []
    except Exception as exc:
        # Top-level boundary: log the traceback and keep the original
        # ``{"error": ...}`` body, but signal the failure with a 500 status
        # instead of a misleading 200.
        logger.exception("Semantic chunking failed")
        return JSONResponse(status_code=500, content={"error": str(exc)})

    return {
        "chunks": [node.text for node in nodes],
        "metadatas": [node.metadata for node in nodes],
        "source_id": data.source_id,
        "titre": data.titre,
        "source": data.source,
        "type": data.type,
    }