adarsh-maurya committed on
Commit
a1f5731
·
verified ·
1 Parent(s): d883f65

Update Ingest.py

Browse files
Files changed (1) hide show
  1. Ingest.py +31 -50
Ingest.py CHANGED
@@ -5,7 +5,6 @@ from langchain_community.document_loaders import DirectoryLoader
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_community.vectorstores import FAISS
8
- from faiss import IndexFlatL2 # Assuming using L2 distance for simplicity
9
 
10
  # Initialize Ray
11
  ray.init()
@@ -15,69 +14,51 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
15
 
16
  # Directory where the FAISS index is saved
17
  index_directory = 'ipc_embed_db'
18
- index_filename = 'index.faiss'
19
- index_path = os.path.join(index_directory, index_filename)
20
 
21
- # Function to create a new FAISS index if it doesn't exist
22
- def create_faiss_index(texts, embedding_function):
23
- # Create the FAISS index with L2 distance
24
- logging.info("Creating a new FAISS index...")
25
- index = IndexFlatL2(768) # Dimensionality of the embeddings
26
- docstore = {i: text for i, text in enumerate(texts)}
27
- index_to_docstore_id = {i: i for i in range(len(texts))}
28
- faiss_db = FAISS(embedding_function, index, docstore, index_to_docstore_id)
29
 
30
- # Adding documents to the FAISS index
31
- logging.info("Adding documents to FAISS index...")
32
- for text in texts:
33
- embedding = embedding_function(text)
34
- faiss_db.add_documents([embedding])
 
 
 
 
 
 
 
 
35
 
36
- # Save the FAISS index to disk
37
- logging.info("Saving FAISS index to disk...")
 
 
38
  faiss_db.save_local(index_directory)
39
- logging.info("FAISS index saved successfully.")
40
  return faiss_db
41
 
42
- # Function to load an existing FAISS index
43
- def load_faiss_index(embedding_function):
44
- if os.path.exists(index_path):
45
  logging.info("Loading existing FAISS index...")
46
- faiss_db = FAISS.load_local(index_directory, embedding_function)
47
  logging.info("FAISS index loaded successfully.")
48
  return faiss_db
49
  else:
50
- logging.info("FAISS index not found, creating a new one...")
51
- return create_faiss_index(texts, embedding_function)
52
 
53
- # Load documents with logging
54
- logging.info("Loading documents...")
55
- loader = DirectoryLoader('data', glob="./*.txt")
56
- documents = loader.load()
57
-
58
- # Extract text from documents and split into manageable chunks
59
- logging.info("Extracting and splitting texts from documents...")
60
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
61
- texts = []
62
- for document in documents:
63
- if hasattr(document, 'get_text'):
64
- text_content = document.get_text() # Adjust according to actual method
65
- else:
66
- text_content = "" # Default to empty string if no text method is available
67
- texts.extend(text_splitter.split_text(text_content))
68
 
69
- # Define embedding function
70
- def embedding_function(text):
71
- embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
72
- return embeddings_model.embed_query(text)
73
-
74
- # Load or create the FAISS index dynamically
75
- faiss_db = load_faiss_index(embedding_function)
76
-
77
- # If you need to perform a search or interact with the FAISS index:
78
  # db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
79
 
80
  # Shutdown Ray after the process
81
  ray.shutdown()
82
-
83
  logging.info("Process completed successfully.")
 
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_community.vectorstores import FAISS
 
8
 
9
  # Initialize Ray
10
  ray.init()
 
14
 
15
  # Directory where the FAISS index is saved
16
  index_directory = 'ipc_embed_db'
17
+ index_path_faiss = os.path.join(index_directory, 'index.faiss')
18
+ index_path_pkl = os.path.join(index_directory, 'index.pkl')
19
 
20
+ # Ensure the index directory exists
21
+ os.makedirs(index_directory, exist_ok=True)
 
 
 
 
 
 
22
 
23
+ # Load documents
24
+ logging.info("Loading documents...")
25
+ loader = DirectoryLoader('data', glob="./*.txt")
26
+ documents = loader.load()
27
+
28
+ # Split documents into manageable chunks
29
+ logging.info("Splitting documents into chunks...")
30
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
31
+ texts = text_splitter.split_documents(documents)
32
+
33
+ # Load embedding model once
34
+ logging.info("Loading embedding model...")
35
+ embeddings = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
36
 
37
+ # Function to create and save FAISS index
38
+ def create_faiss_index():
39
+ logging.info("Creating new FAISS index from documents...")
40
+ faiss_db = FAISS.from_documents(texts, embeddings)
41
  faiss_db.save_local(index_directory)
42
+ logging.info("FAISS index created and saved.")
43
  return faiss_db
44
 
45
+ # Function to load or create FAISS index
46
+ def load_or_create_faiss_index():
47
+ if os.path.exists(index_path_faiss) and os.path.exists(index_path_pkl):
48
  logging.info("Loading existing FAISS index...")
49
+ faiss_db = FAISS.load_local(index_directory, embeddings, allow_dangerous_deserialization=True)
50
  logging.info("FAISS index loaded successfully.")
51
  return faiss_db
52
  else:
53
+ logging.info("FAISS index not found. Creating a new one...")
54
+ return create_faiss_index()
55
 
56
+ # Load or create the index
57
+ faiss_db = load_or_create_faiss_index()
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
+ # Optional: If you want to use the retriever later
 
 
 
 
 
 
 
 
60
  # db_retriever = faiss_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
61
 
62
  # Shutdown Ray after the process
63
  ray.shutdown()
 
64
  logging.info("Process completed successfully.")