chatbot-fr / sauve /step3_llamaindex_evol.py
import os
import pickle
import faiss
import numpy as np
from pathlib import Path
from tqdm import tqdm
import textwrap
from llama_index.readers.file import PDFReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, StorageContext
# 📁 Parameters
DOCS_DIR = "data"
VECTOR_DIR = "vectordb"
INDEX_FILE = os.path.join(VECTOR_DIR, "index.faiss")
CHUNKS_FILE = os.path.join(VECTOR_DIR, "chunks.pkl")
EMBEDDING_MODEL = "dangvantuan/french-document-embedding"
os.makedirs(VECTOR_DIR, exist_ok=True)
# 📥 Manual PDF loading
print("📥 Reading PDF files...")
reader = PDFReader()
documents = []
for pdf_path in Path(DOCS_DIR).glob("*.pdf"):
    print(f" - 📄 {pdf_path.name}")
    docs = reader.load_data(pdf_path)  # ✅ FIX: pass the path instead of a file object
    documents.extend(docs)
print(f"✅ {len(documents)} PDF documents loaded.")
# ✂️ Token-size chunking (more stable than sentence-based splitting)
print("✂️ Chunking with SentenceSplitter (512 tokens, overlap 64)...")
parser = SentenceSplitter(chunk_size=512, chunk_overlap=64)
nodes = parser.get_nodes_from_documents(documents)
print(f"✅ {len(nodes)} chunks generated.")
# 🔍 Preview of the first 5 chunks
print("\n🧩 Preview of the first 5 chunks:\n")
for i, node in enumerate(nodes[:5]):
    preview = textwrap.shorten(node.get_content().replace("\n", " "), width=200)
    print(f"Chunk {i+1:>2}: {preview}")
# 🔢 Embedding + FAISS
print("\n🔢 Génération des embeddings et indexation FAISS...")
embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL, trust_remote_code=True)
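# Probe the embedding dimension with a dummy query, then build an exact (brute-force) L2 index of that size.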
embedding_dim = np.array(embed_model.get_query_embedding("test")).shape[0]
faiss_index = faiss.IndexFlatL2(embedding_dim)
vector_store = FaissVectorStore(faiss_index=faiss_index)
# 🧠 Building the vector index (the FAISS store must be attached via a StorageContext so the embeddings are written into faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(nodes, embed_model=embed_model, storage_context=storage_context)
# 💾 Saving
print("💾 Saving the index and the chunks...")
faiss.write_index(faiss_index, INDEX_FILE)
chunks = [node.get_content() for node in nodes]
with open(CHUNKS_FILE, "wb") as f:
    pickle.dump(chunks, f)
print(f"\n✅ {len(chunks)} chunks saved to {CHUNKS_FILE}")