chatbot-fr / sauve /step3_embed.py
rkonan's picture
reorganisation
236b637
raw
history blame contribute delete
784 Bytes
from sentence_transformers import SentenceTransformer
import numpy as np
# Name of the pretrained model handed to SentenceTransformer below.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Built once at import time; embed_chunks() reuses this single instance.
embedder = SentenceTransformer(EMBED_MODEL_NAME)
def embed_chunks(chunks):
    """Encode text chunks into embedding vectors with the module-level model.

    Args:
        chunks: The texts to embed, normally a list of strings. It is
            forwarded unchanged to ``embedder.encode``.

    Returns:
        A NumPy array of embeddings (``convert_to_numpy=True``). For a
        non-empty list it has one row per chunk; for an empty list it is an
        array of shape ``(0, dim)``, where ``dim`` is the model's embedding
        size.
    """
    # For an empty batch, encode() would otherwise hand back an array with no
    # second axis, so callers reading ``embeddings.shape[1]`` would crash.
    # A plain string is deliberately excluded: "" is still a valid single
    # input and keeps going through encode() as before.
    if not isinstance(chunks, str) and len(chunks) == 0:
        dim = embedder.get_sentence_embedding_dimension()
        return np.empty((0, dim), dtype=np.float32)

    return embedder.encode(
        chunks,
        convert_to_numpy=True,
        show_progress_bar=True,
    )
if __name__ == "__main__":
    # Demo run: read the PDF, chunk it, embed the chunks, print a summary.
    from step1_read_pdf import read_pdf
    from step2_chunk import chunk_text

    # Read the document.
    pdf_path = "data/DST_Rapport_final_Reco_plant.pdf"
    text = read_pdf(pdf_path)

    # Split the text into overlapping chunks.
    chunks = chunk_text(text, chunk_size=300, overlap=50)

    # Embed every chunk.
    embeddings = embed_chunks(chunks)

    # Report how many vectors were produced and show the start of the first.
    summary = f"\n✅ Embeddings générés : {embeddings.shape[0]} vectors de {embeddings.shape[1]} dimensions"
    print(summary)
    preview = f"Exemple (1er vecteur) :\n{embeddings[0][:5]}..."
    print(preview)