File size: 784 Bytes
2e748b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from sentence_transformers import SentenceTransformer
import numpy as np 

# Name of the model handed to SentenceTransformer below.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Single shared model instance, built once when this module is imported and
# reused by every call that needs to encode text.
embedder = SentenceTransformer(EMBED_MODEL_NAME)

def embed_chunks(chunks):
    """Encode text chunks into embedding vectors with the module-level model.

    Args:
        chunks: An iterable of strings to embed. A single ``str`` is passed
            to the model unchanged, so whatever ``encode`` returns for a
            lone sentence is returned as-is.

    Returns:
        A NumPy array of embeddings as produced by ``embedder.encode`` with
        ``convert_to_numpy=True``. For an empty iterable, an empty array of
        shape ``(0, dim)`` is returned, where ``dim`` is the dimension the
        model reports (0 if the model reports none).
    """
    if not isinstance(chunks, str):
        # Materialise one-shot iterables (generators, map objects) so the
        # input can be checked for emptiness and then handed to the model.
        chunks = list(chunks)
        if not chunks:
            # Build the empty result explicitly instead of relying on what
            # encode() does with an empty batch, so callers can always read
            # shape[0] and shape[1].
            dim = embedder.get_sentence_embedding_dimension() or 0
            # NOTE(review): float32 is assumed to match the model's usual
            # output dtype -- confirm against the model in use.
            return np.empty((0, dim), dtype=np.float32)

    return embedder.encode(
        chunks,
        convert_to_numpy=True,
        show_progress_bar=True,
    )

if __name__ == "__main__":
    # Demo pipeline: read the PDF, split it into chunks, embed the chunks,
    # then print a short summary of the result.
    from step2_chunk import chunk_text
    from step1_read_pdf import read_pdf

    pdf_path = "data/DST_Rapport_final_Reco_plant.pdf"
    chunking_options = {"chunk_size": 300, "overlap": 50}

    # Read the document and split its text into overlapping chunks.
    document_text = read_pdf(pdf_path)
    text_chunks = chunk_text(document_text, **chunking_options)

    # Compute one embedding per chunk.
    embeddings = embed_chunks(text_chunks)

    # Report how many vectors were produced and preview the first one.
    print(f"\n✅ Embeddings générés : {embeddings.shape[0]} vectors de {embeddings.shape[1]} dimensions")
    print(f"Exemple (1er vecteur) :\n{embeddings[0][:5]}...")