# modules/vectorizer.py
# Splits raw log text into chunks and builds a FAISS vector store
# (HuggingFace sentence-transformer embeddings) for retrieval.
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
def prepare_documents(text: str, chunk_size=1000, chunk_overlap=200):
    """Split one long text into overlapping chunks ready for embedding.

    The raw string is wrapped in a single ``Document`` and then divided by a
    ``CharacterTextSplitter`` using the given ``chunk_size`` and
    ``chunk_overlap``. Returns the list of chunk ``Document`` objects.
    """
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    source = Document(page_content=text)
    return splitter.split_documents([source])
def create_vectorstore(documents, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Build a FAISS index over *documents*.

    Each document is embedded with the HuggingFace model named by
    ``model_name`` (a sentence-transformers checkpoint by default), and the
    resulting vectors are stored in an in-memory FAISS store for similarity
    retrieval.
    """
    embedder = HuggingFaceEmbeddings(model_name=model_name)
    store = FAISS.from_documents(documents, embedder)
    return store