from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

def prepare_documents(text: str, chunk_size=1000, chunk_overlap=200):
    """

    Splits the combined log text into smaller chunks using LangChain's splitter,

    so they can be processed and embedded efficiently.

    """
    docs = [Document(page_content=text)]  # Wrap raw text in a Document
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(docs)

def create_vectorstore(documents, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """

    Uses Hugging Face Transformers to embed the document chunks,

    and stores them in a FAISS vector database for fast retrieval.

    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_documents(documents, embeddings)
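
if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): the sample log text and the
    # query below are hypothetical and not part of the original module.
    sample_logs = (
        "2024-01-01 12:00:01 ERROR Failed to connect to database\n"
        "2024-01-01 12:00:05 INFO Retrying connection\n"
    )
    chunks = prepare_documents(sample_logs, chunk_size=500, chunk_overlap=50)
    vectorstore = create_vectorstore(chunks)
    # Retrieve the chunks most similar to a query string.
    results = vectorstore.similarity_search("database connection error", k=2)
    for doc in results:
        print(doc.page_content)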