Spaces:
Paused
Paused
File size: 784 Bytes
2e748b3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
from sentence_transformers import SentenceTransformer
import numpy as np
# Model name handed to SentenceTransformer below.
EMBED_MODEL_NAME="sentence-transformers/all-MiniLM-L6-v2"
# Created once at module level (i.e. when this module is imported or run) and
# reused by embed_chunks() for every encode call.
embedder=SentenceTransformer(EMBED_MODEL_NAME)
def embed_chunks(chunks) -> np.ndarray:
    """Encode text chunks into embedding vectors with the module-level model.

    Args:
        chunks: The text chunks to embed, normally a list of strings. The
            value is forwarded unchanged to ``embedder.encode``.

    Returns:
        The NumPy array returned by ``embedder.encode`` (one row per chunk
        for a non-empty list of strings). For an empty list or tuple, an
        empty array of shape ``(0, dim)`` is returned instead, so callers
        can always read ``shape[1]``.
    """
    # encode() assembles its NumPy result from the per-chunk vectors, so an
    # empty input would come back 1-D with shape (0,) and break callers that
    # index shape[1]. Short-circuit with a correctly shaped empty matrix.
    if isinstance(chunks, (list, tuple)) and not chunks:
        # The dimension may be reported as None by some models; fall back to 0.
        dim = embedder.get_sentence_embedding_dimension() or 0
        # float32 assumed to match the model's usual output dtype - TODO confirm.
        return np.empty((0, dim), dtype=np.float32)

    return embedder.encode(
        chunks,
        convert_to_numpy=True,
        show_progress_bar=True,
    )
if __name__ == "__main__":
    # Demo run of the pipeline: read the PDF, split it into chunks, embed
    # the chunks, then print a short summary of the result.
    # The sibling step modules are imported only when run as a script.
    from step1_read_pdf import read_pdf
    from step2_chunk import chunk_text

    # Read the document
    pdf_path = "data/DST_Rapport_final_Reco_plant.pdf"
    document_text = read_pdf(pdf_path)

    # Split the text into chunks
    chunking_options = {"chunk_size": 300, "overlap": 50}
    chunks = chunk_text(document_text, **chunking_options)

    # Embedding
    embeddings = embed_chunks(chunks)

    # Report the number of vectors, their dimensionality, and a sample.
    print(f"\n✅ Embeddings générés : {embeddings.shape[0]} vectors de {embeddings.shape[1]} dimensions")
    print(f"Exemple (1er vecteur) :\n{embeddings[0][:5]}...")
|