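"""Build a FAISS vector index from a directory of PDF documents.

The script loads PDFs with PyPDFLoader, splits them into overlapping chunks,
embeds the chunks with a HuggingFace sentence-transformer model, and saves the
resulting FAISS index to disk.
"""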
import os
import logging
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
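# Note: these import paths assume an older LangChain release. On LangChain >= 0.1
# the same classes live in the langchain_community package (e.g.
# langchain_community.vectorstores.FAISS); adjust the imports if needed.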
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def load_documents(docs_dir):
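    """Recursively load all PDF files under docs_dir into LangChain Documents."""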
    documents = []
    for root, dirs, files in os.walk(docs_dir):
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(root, file)
                logger.info(f"Loading document: {file_path}")
                try:
                    loader = PyPDFLoader(file_path)
                    loaded_docs = loader.load()
                    if loaded_docs:
                        documents.extend(loaded_docs)
                        logger.info(f"Loaded {len(loaded_docs)} pages from {file_path}.")
                    else:
                        logger.warning(f"No content extracted from {file_path}. Possibly encrypted or empty.")
                except Exception as e:
                    logger.error(f"Error loading {file_path}: {e}")
    return documents

def split_text(documents):
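    """Split the loaded documents into overlapping chunks suitable for embedding."""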
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    if not texts:
        logger.error("No text chunks were created. Check the text splitting process.")
        return None
    logger.info(f"Created {len(texts)} text chunks.")
    for i, text in enumerate(texts[:5]):  # Sample the first 5 chunks
        # Each chunk is a Document; log the first 100 characters of its text
        logger.debug(f"Sample chunk {i}: {text.page_content[:100]}...")
    return texts

def create_embeddings():
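    """Create the HuggingFace sentence-transformer embedding model and sanity-check it."""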
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    try:
        # Embed a short sample string to confirm the model works
        sample_embedding = embeddings.embed_query("sample text")
        logger.debug(f"Sample embedding: {sample_embedding[:5]}... (truncated for brevity)")
    except Exception as e:
        logger.error(f"Error generating sample embedding: {e}")
        return None
    return embeddings

def create_faiss_index(texts, embeddings):
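    """Embed the text chunks and build a FAISS vector store from them."""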
    try:
        db = FAISS.from_documents(texts, embeddings)
        logger.info(f"Created FAISS index with {len(texts)} vectors")
        # Directly check the size of the underlying FAISS index
        if db.index.ntotal > 0:
            logger.info(f"FAISS index contains {db.index.ntotal} vectors.")
        else:
            logger.error("FAISS index contains 0 vectors after creation. Check the data and embeddings.")
    except Exception as e:
        logger.error(f"Failed to create FAISS index: {e}")
        return None
    return db

def save_faiss_index(db, index_path):
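    """Persist the FAISS vector store to disk at index_path."""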
    try:
        db.save_local(index_path)
        logger.info(f"FAISS index saved to {index_path}")
    except Exception as e:
        logger.error(f"Failed to save FAISS index to {index_path}: {e}")

def main():
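    """Run the full pipeline: load PDFs, chunk, embed, index, and save."""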
    docs_dir = "docs"  # Adjust to your document directory
    index_path = "faiss_index"
    logger.info("Starting document processing...")
    # Load documents
    documents = load_documents(docs_dir)
    if not documents:
        logger.error("No documents were loaded. Exiting.")
        return
    # Split text into chunks
    texts = split_text(documents)
    if texts is None:
        logger.error("Text splitting failed. Exiting.")
        return
    # Create embeddings
    embeddings = create_embeddings()
    if embeddings is None:
        logger.error("Embeddings creation failed. Exiting.")
        return
    # Create FAISS index
    db = create_faiss_index(texts, embeddings)
    if db is None:
        logger.error("FAISS index creation failed. Exiting.")
        return
    # Save FAISS index
    save_faiss_index(db, index_path)

if __name__ == "__main__":
    main()
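
# A minimal sketch of how the saved index could be queried later (assumes the same
# embedding model; newer LangChain releases may also require
# allow_dangerous_deserialization=True when loading a pickle-backed local index):
#
#   embeddings = create_embeddings()
#   db = FAISS.load_local("faiss_index", embeddings)
#   for doc in db.similarity_search("your question here", k=4):
#       print(doc.metadata.get("source"), doc.page_content[:200])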