# Spaces: Sleeping
"""Build a FAISS vector index from a directory of text files.

Loads the documents found under ``data2/text/range/0-5000``, splits them into
overlapping character chunks, embeds each chunk with a sentence-transformers
model, and saves the resulting index to the local ``faiss_index`` directory.
"""

import os

from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS


def _select_device():
    """Return "cuda" when a CUDA GPU is usable, otherwise "cpu"."""
    try:
        # Imported lazily so a missing/broken torch install degrades to CPU
        # instead of failing at import time.
        import torch
    except ImportError:
        return "cpu"
    return "cuda" if torch.cuda.is_available() else "cpu"


# Set your Hugging Face token
# NOTE(review): the token is read here but not referenced anywhere in the
# visible part of this script; kept in case other code relies on it.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# Load documents
loader = DirectoryLoader('data2/text/range/0-5000', loader_cls=TextLoader)
documents = loader.load()
print('len of documents are', len(documents))

# Split documents into chunks
# NOTE(review): per its model card, all-mpnet-base-v2 truncates input beyond
# 384 word pieces, so most of a 5000-character chunk is likely not reflected
# in its embedding -- consider a smaller chunk_size. Value left unchanged.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=250)
all_splits = text_splitter.split_documents(documents)
print("Length of all_splits:", len(all_splits))

# An index cannot be built from zero chunks; stop with a clear message rather
# than an opaque error raised from inside the vector store.
if not all_splits:
    raise SystemExit(
        "No text chunks were produced from 'data2/text/range/0-5000'; "
        "nothing to index."
    )

# Generate embeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
# Previously hard-coded to "cuda", which fails on hosts without a GPU.
model_kwargs = {"device": _select_device()}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Store embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)
vectorstore.save_local('faiss_index')
print("Embeddings stored successfully!")