# File size: 1,447 Bytes
# Revision: 99afe26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.document_loaders import (
    PyPDFLoader,
    DirectoryLoader,
    UnstructuredFileLoader,
)
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import (
    OpenAIEmbeddings,
    HuggingFaceBgeEmbeddings,
    HuggingFaceEmbeddings,
    HuggingFaceInstructEmbeddings,
)


# --- Build configuration ----------------------------------------------------
# Directory the Chroma collection is persisted to.
persist_directory = "stores/test_512"
# Folder containing the source documents. Spelled with a forward slash on
# purpose: the earlier "data\czech" relied on "\c" being an unrecognised
# escape sequence (a SyntaxWarning on Python 3.12+, planned to become an
# error) and only named a sub-directory on Windows. "data/czech" resolves to
# the same folder on Windows and also works on POSIX systems.
data = "data/czech"
# Chunk size and overlap handed to the text splitter further down.
chunk = 512
overlap = 128
# Alternative model, kept for reference:
# embedding_model = "Seznam/simcse-dist-mpnet-czeng-cs-en"
embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"

# Arguments for the embedding wrapper: which model to load, the device it
# runs on, and the encode-time options (embeddings are left un-normalised).
model_name = embedding_model
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}
# Embedding function for the configured model; the three settings defined
# above are forwarded unchanged.
embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Disabled alternative source: load a semicolon-delimited CSV instead of a
# directory of files.
#
# loader = CSVLoader(
#     file_path="data/emails.csv",
#     encoding="utf-8",
#     csv_args={
#         "delimiter": ";",
#     },
# )

# Load every document found under the configured data directory, showing a
# progress bar while doing so.
loader = DirectoryLoader(data, show_progress=True)
documents = loader.load()

# Cut the loaded documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk, chunk_overlap=overlap)
texts = text_splitter.split_documents(documents)

# Embed the chunks and store them in a Chroma collection that uses cosine
# distance ("hnsw:space") and lives in persist_directory.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
    collection_metadata={"hnsw:space": "cosine"},
)

print("\n Vector Store Created.......\n\n")