from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document


def prepare_documents(text: str, chunk_size: int = 1000, chunk_overlap: int = 200):
    """Split combined log text into overlapping chunks ready for embedding.

    Wraps the raw text in a single LangChain ``Document`` and delegates the
    actual chunking to ``CharacterTextSplitter``, so downstream embedding
    stays within model input limits while overlap preserves context across
    chunk boundaries.

    Args:
        text: The combined log text to split.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared between consecutive
            chunks.

    Returns:
        A list of ``Document`` chunks produced by the splitter.
    """
    docs = [Document(page_content=text)]
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(docs)
def create_vectorstore(documents, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """Embed document chunks and index them in a FAISS vector store.

    Uses a Hugging Face sentence-transformer model to embed each chunk,
    then builds an in-memory FAISS index over the embeddings for fast
    similarity search.

    Args:
        documents: Iterable of LangChain ``Document`` chunks to index
            (e.g. the output of ``prepare_documents``).
        model_name: Hugging Face model identifier used for embeddings.

    Returns:
        A ``FAISS`` vector store containing the embedded documents.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_documents(documents, embeddings)