import os
import pickle
import faiss
import numpy as np
from pathlib import Path
import textwrap
from llama_index.readers.file import PDFReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import StorageContext, VectorStoreIndex
# 📁 Parameters
DOCS_DIR = "data"
VECTOR_DIR = "vectordb"
INDEX_FILE = os.path.join(VECTOR_DIR, "index.faiss")
CHUNKS_FILE = os.path.join(VECTOR_DIR, "chunks.pkl")
EMBEDDING_MODEL = "dangvantuan/french-document-embedding"
os.makedirs(VECTOR_DIR, exist_ok=True)
# 📥 Load the PDF files manually
print("📥 Reading PDF files...")
reader = PDFReader()
documents = []
for pdf_path in Path(DOCS_DIR).glob("*.pdf"):
    print(f"  - 📄 {pdf_path.name}")
    docs = reader.load_data(pdf_path)  # ✅ FIX: pass a Path rather than a file object
    documents.extend(docs)
print(f"✅ {len(documents)} PDF documents loaded.")
# ✂️ Chunking by token count (more stable than splitting on sentences alone)
print("✂️ Chunking with SentenceSplitter (512 tokens, 64-token overlap)...")
parser = SentenceSplitter(chunk_size=512, chunk_overlap=64)
nodes = parser.get_nodes_from_documents(documents)
print(f"✅ {len(nodes)} chunks generated.")
# 🔍 Preview of the first 5 chunks
print("\n🧩 Preview of the first 5 chunks:\n")
for i, node in enumerate(nodes[:5]):
    preview = textwrap.shorten(node.get_content().replace("\n", " "), width=200)
    print(f"Chunk {i+1:>2}: {preview}")
# 🔢 Embeddings + FAISS
print("\n🔢 Generating embeddings and building the FAISS index...")
embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL, trust_remote_code=True)
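# Probe the embedding dimension with a short test string (IndexFlatL2 needs it up front)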
embedding_dim = np.array(embed_model.get_query_embedding("test")).shape[0]
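# IndexFlatL2 performs exact (brute-force) L2 search and needs no training step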
faiss_index = faiss.IndexFlatL2(embedding_dim)
vector_store = FaissVectorStore(faiss_index=faiss_index)
# 🧠 Build the vector index; the FAISS store is attached through a StorageContext
# so the embeddings are actually written into faiss_index
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(nodes, embed_model=embed_model, storage_context=storage_context)
# 💾 Save to disk
print("💾 Saving the index and the chunks...")
faiss.write_index(faiss_index, INDEX_FILE)
chunks = [node.get_content() for node in nodes]
with open(CHUNKS_FILE, "wb") as f:
    pickle.dump(chunks, f)
print(f"\n✅ {len(chunks)} chunks saved to {CHUNKS_FILE}")