Spaces:
Runtime error
Runtime error
File size: 1,447 Bytes
99afe26 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import (
PyPDFLoader,
DirectoryLoader,
UnstructuredFileLoader,
)
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import (
OpenAIEmbeddings,
HuggingFaceBgeEmbeddings,
HuggingFaceEmbeddings,
HuggingFaceInstructEmbeddings,
)
# --- Configuration ---------------------------------------------------------
# Directory handed to Chroma as its persistence location.
persist_directory = "stores/test_512"
# Directory containing the source documents. Written with a forward slash so
# it resolves on every OS: the former "data\czech" relied on the invalid
# escape sequence "\c" and, on POSIX systems, named a single entry called
# "data\czech" instead of the "czech" sub-directory of "data".
data = "data/czech"
# Chunk size and chunk overlap given to the text splitter.
chunk = 512
overlap = 128
# Embedding checkpoint; the commented line is an alternative model.
# embedding_model = "Seznam/simcse-dist-mpnet-czeng-cs-en"
embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
model_name = embedding_model
# Keyword arguments forwarded to the embedding wrapper: run on the CPU and
# leave the produced vectors un-normalized.
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}
# Embedding wrapper built from the model name and keyword arguments above.
embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Disabled alternative source, kept for reference as real comments (it used
# to live in a bare string literal, which is an evaluated expression rather
# than a comment): read a semicolon-delimited CSV file instead of a directory.
# loader = CSVLoader(
#     file_path="data/emails.csv",
#     encoding="utf-8",
#     csv_args={
#         "delimiter": ";",
#     },
# )

# Load the documents found under `data`, with progress display enabled.
loader = DirectoryLoader(data, show_progress=True)
documents = loader.load()
# Cut the loaded documents into overlapping pieces, using the `chunk` and
# `overlap` settings as the splitter's chunk size and chunk overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=overlap, chunk_size=chunk)
texts = text_splitter.split_documents(documents)
# Embed the split documents and build the Chroma collection, passing
# `persist_directory` as its storage location. The collection metadata sets
# "hnsw:space" to "cosine".
vectordb = Chroma.from_documents(
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory=persist_directory,
    embedding=embedding,
    documents=texts,
)
# Final status line for whoever runs the script.
print("\n Vector Store Created.......\n\n")
|