Spaces:
Runtime error
Runtime error
"""Build a persistent Chroma vector store from the PDFs under ``data/``.

Pipeline: load the PDFs, split them into overlapping text chunks, embed each
chunk with the BGE small English model on CPU, and write the vectors to a
Chroma collection (cosine distance) persisted in ``persist_directory``.
"""
# importing dependencies
import time

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Chunk sizes are measured in characters. BAAI/bge-small-en accepts at most
# 512 tokens per input and silently truncates anything longer, so the chunks
# must stay well below that limit; otherwise most of each chunk would never
# contribute to its embedding and could not be found by similarity search.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

# loading data
loader = PyPDFDirectoryLoader('data/')
documents = loader.load()
print(len(documents))
if not documents:
    # Fail fast with a clear message instead of loading the model and then
    # failing (or writing an empty store) deep inside the vector-store call.
    raise SystemExit("No PDF pages were loaded from 'data/'; nothing to index.")

# splitting
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
text_chunks = splitter.split_documents(documents)
print(len(text_chunks))
if not text_chunks:
    # E.g. PDFs that yielded no extractable text.
    raise SystemExit("The loaded PDFs produced no text chunks; nothing to index.")

# loading HuggingFaceBGE embeddings
model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
# Normalised vectors pair with the cosine space configured on the collection.
encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)
print('Embeddings loaded!')

# creating NCERT Textbooks vector database.
# perf_counter() is monotonic, so the measured duration cannot be skewed by
# wall-clock adjustments the way time.time() can.
t1 = time.perf_counter()
persist_directory = 'dbname'
vectordb = Chroma.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory=persist_directory,
)
# NOTE(review): with chromadb < 0.4 an explicit `vectordb.persist()` may be
# needed to flush the store to disk -- confirm against the installed version.
t2 = time.perf_counter()
print('Time taken for building db : ', (t2 - t1))