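# Assumed dependencies (a sketch; exact package names can vary across
# langchain releases):
#   pip install langchain langchain-community faiss-cpu sentence-transformers pdfminer.six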
import os
import logging

from langchain_community.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def create_faiss_index():
    documents = []
    docs_dir = "docs"  # Directory where PDF files are stored

    # Check if the 'docs' directory exists
    if not os.path.exists(docs_dir):
        logger.error(f"The directory '{docs_dir}' does not exist.")
        return

    # Walk through the 'docs' directory and load PDF files
    for root, dirs, files in os.walk(docs_dir):
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(root, file)
                logger.info(f"Loading document: {file_path}")
                try:
                    loader = PDFMinerLoader(file_path)
                    loaded_docs = loader.load()
                    if loaded_docs:
                        logger.info(f"Loaded {len(loaded_docs)} documents from {file_path}")
                        documents.extend(loaded_docs)
                    else:
                        logger.warning(f"No documents loaded from {file_path}")
                except Exception as e:
                    logger.error(f"Error loading {file_path}: {e}")

    # Check if any documents were loaded
    if not documents:
        logger.error("No documents were loaded. Check the 'docs' directory and file paths.")
        return
    logger.info(f"Loaded {len(documents)} documents.")

    # Split documents into text chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    logger.info(f"Created {len(texts)} text chunks.")

    # Check if text chunks were created
    if not texts:
        logger.error("No text chunks created. Check the text splitting process.")
        return
    try:
        # Initialize embeddings using a HuggingFace sentence-transformers model
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        logger.info("Embeddings initialized successfully.")
    except Exception as e:
        logger.error(f"Failed to initialize embeddings: {e}")
        return

    try:
        # Create a FAISS index and save it. FAISS.from_documents builds and
        # sizes the underlying index from the embedding model itself, so a
        # faiss.IndexFlatL2 does not need to be constructed manually (and
        # HuggingFaceEmbeddings has no `embedding_size` attribute).
        vector_store = FAISS.from_documents(texts, embeddings)
        vector_store.save_local("faiss_index")
        logger.info(f"Created FAISS index with {len(texts)} vectors.")
    except Exception as e:
        logger.error(f"Failed to create FAISS index: {e}")
if __name__ == "__main__":
    create_faiss_index()
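
# Example usage (a sketch, not part of the original script): reload the saved
# index and run a similarity search. Recent langchain_community releases
# require allow_dangerous_deserialization=True when loading a local index,
# because the docstore is unpickled from disk; only enable this for indexes
# you built yourself.
#
#     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#     store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
#     for doc in store.similarity_search("example query", k=4):
#         print(doc.page_content[:200])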