|
import os |
|
from src.utils.pdf_splitter import DataExtractor |
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
from langchain_community.vectorstores import FAISS |
|
|
|
class VectorDatabase:
    """Build a FAISS vector store from documents and save it to disk.

    The index is saved under ``vector_embedding/<db_name>`` (a relative path,
    so it resolves against the current working directory) and documents are
    embedded with ``sentence-transformers/all-MiniLM-L6-v2`` on the CPU.
    """

    def __init__(self, db_name):
        """Configure the storage location and the embedding model.

        Args:
            db_name: Name of the sub-directory of ``vector_embedding`` in
                which the index is saved.
        """
        self.db_name = db_name
        self.persist_directory = os.path.join("vector_embedding", self.db_name)

        # Set by create_db(); stays None until an index has been built.
        self.vectDB = None

        # NOTE(review): "padding", "max_length" and "truncation" look like
        # tokenizer options rather than encode() options - confirm that the
        # installed sentence-transformers version accepts (and honours) them.
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"},
            encode_kwargs={
                "padding": "max_length",
                "max_length": 512,
                "truncation": True,
                "normalize_embeddings": True,
            },
        )

    def create_db(self, pdf_data):
        """Embed ``pdf_data`` into a FAISS index and save it locally.

        The resulting store is kept on ``self.vectDB`` and written to
        ``self.persist_directory``.

        Args:
            pdf_data: Iterable of documents in the form accepted by
                ``FAISS.from_documents`` (presumably LangChain ``Document``
                objects - verify against ``DataExtractor``).

        Raises:
            ValueError: If ``pdf_data`` is ``None`` or contains no documents.
        """
        # Materialise once: the input may be a one-shot iterator, and an empty
        # check on a generator would otherwise always pass.
        documents = [] if pdf_data is None else list(pdf_data)

        # Fail early with a clear message instead of letting the index
        # construction crash on an empty embedding list.
        if not documents:
            raise ValueError(
                "pdf_data is empty; cannot build a FAISS index from zero documents."
            )

        self.vectDB = FAISS.from_documents(
            documents=documents,
            embedding=self.embeddings,
        )
        self.vectDB.save_local(self.persist_directory)
|
|
|
|
|
def main():
    """Run the extraction pipeline and persist the resulting vector store.

    Feeds ``./data/mental_health`` to ``DataExtractor``, passes the extracted
    text through ``clean_and_split_text``, and hands the result to
    ``VectorDatabase.create_db`` under the name ``mental_health_vector_db``.
    """
    extractor = DataExtractor('./data/mental_health')
    raw_text = extractor.extract_text()
    chunks = extractor.clean_and_split_text(raw_text)

    # The store is only constructed once extraction has succeeded.
    store = VectorDatabase(db_name="mental_health_vector_db")
    store.create_db(chunks)

    print("Vector embeddings have been generated and loaded successfully.")
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()