from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document


def prepare_documents(
    text: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    separator: str = "\n\n",
) -> list[Document]:
    """Split raw log text into overlapping chunks ready for embedding.

    Wraps the text in a single ``Document`` and runs it through
    LangChain's ``CharacterTextSplitter`` so downstream embedding calls
    operate on bounded-size pieces.

    Args:
        text: The combined log text to split.
        chunk_size: Target maximum characters per chunk.
        chunk_overlap: Characters of overlap between adjacent chunks,
            preserving context across chunk boundaries.
        separator: Boundary string the splitter cuts on. Defaults to
            ``"\\n\\n"`` (the splitter's own default); pass ``"\\n"`` for
            logs that are line-delimited rather than paragraph-delimited.

    Returns:
        A list of ``Document`` chunks (empty if ``text`` is empty).
    """
    # Wrap the raw string in a Document so split_documents() can carry
    # metadata through to the chunks.
    docs = [Document(page_content=text)]
    splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)


def create_vectorstore(
    documents: list[Document],
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
) -> FAISS:
    """Embed document chunks and index them in a FAISS vector store.

    Args:
        documents: Chunked documents, e.g. from :func:`prepare_documents`.
        model_name: Hugging Face sentence-transformer model used to
            compute the embeddings.

    Returns:
        A ``FAISS`` vector store supporting fast similarity search.
    """
    # NOTE(review): model weights are downloaded on first use — this call
    # requires network access (or a local HF cache) the first time.
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_documents(documents, embeddings)