File size: 1,205 Bytes
b9c7274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from pathlib import Path
from typing import List

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings

import config

# Module-wide embedding model, constructed once at import time and passed to
# the Chroma store in process_documents().
# NOTE(review): presumably requires OpenAI credentials to be configured before
# this module is imported — confirm.
embeddings_model = OpenAIEmbeddings()


def process_documents(doc_storage_path: str) -> "Chroma":
    """Index every ``*.txt`` file of a directory into the persistent Chroma store.

    Each file is loaded with ``TextLoader``, split into chunks according to
    ``config.CHUNK_SIZE`` / ``config.CHUNK_OVERLAP``, and the chunks are added
    to the Chroma store opened at ``config.STORE_FILE``. Progress (file path
    and number of chunks per file) is printed to stdout.

    NOTE(review): no de-duplication is performed here; running this again
    against the same store presumably appends the same chunks a second
    time — confirm whether callers rely on a fresh store.

    Args:
        doc_storage_path: Directory searched (non-recursively) for ``*.txt``
            files.

    Returns:
        The Chroma vector store that the chunks were added to.
    """
    print("doc preprocessing...")
    doc_directory = Path(doc_storage_path)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=config.CHUNK_SIZE, chunk_overlap=config.CHUNK_OVERLAP
    )
    doc_search = Chroma(
        persist_directory=config.STORE_FILE, embedding_function=embeddings_model
    )
    # Sorted so files are indexed in a reproducible order across runs.
    for file_path in sorted(doc_directory.glob("*.txt")):
        print(str(file_path))
        documents = TextLoader(str(file_path)).load()
        chunks: List[Document] = text_splitter.split_documents(documents)
        if chunks:
            # add_documents() writes into *this* store. The earlier
            # ``doc_search.from_documents(...)`` was a classmethod constructor:
            # it built a separate, non-persistent store and threw it away, so
            # the returned store never received the chunks.
            doc_search.add_documents(chunks)
        print(len(chunks))
    print("doc preprocessing end.")
    return doc_search


def format_docs(docs):
    """Concatenate the text of several documents into a single string.

    Args:
        docs: Iterable of objects that expose a ``page_content`` string
            attribute.

    Returns:
        The ``page_content`` values in iteration order, separated from each
        other by one blank line. An empty iterable yields an empty string.
    """
    page_texts = (doc.page_content for doc in docs)
    return "\n\n".join(page_texts)