adarsh-maurya committed on
Commit a965de4 · verified · 1 Parent(s): a31ab94

Update Ingest.py

Files changed (1)
  1. Ingest.py +25 -13
Ingest.py CHANGED
@@ -4,7 +4,7 @@ from langchain_community.document_loaders import DirectoryLoader
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
-from faiss import IndexFlatL2 # Assuming using L2 distance for simplicity
+from faiss import IndexFlatL2
 
 # Initialize Ray
 ray.init()
@@ -17,24 +17,29 @@ logging.info("Loading documents...")
 loader = DirectoryLoader('data', glob="./*.txt")
 documents = loader.load()
 
-# Extract text from documents and split into manageable texts with logging
+# Extract text from documents and split into manageable chunks with logging
 logging.info("Extracting and splitting texts from documents...")
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
 texts = []
 for document in documents:
-    if hasattr(document, 'get_text'):
-        text_content = document.get_text() # Adjust according to actual method
-    else:
-        text_content = "" # Default to empty string if no text method is available
+    try:
+        if hasattr(document, 'get_text'):
+            text_content = document.get_text() # Adjust according to actual method
+        else:
+            text_content = "" # Default to empty string if no text method is available
 
-    texts.extend(text_splitter.split_text(text_content))
+        texts.extend(text_splitter.split_text(text_content))
+    except Exception as e:
+        logging.error(f"Error processing document {document}: {e}")
 
-# Define embedding function
+# Initialize embedding model once outside the loop
+embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
+
+# Define embedding function (optimized to use pre-initialized model)
 def embedding_function(text):
-    embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")
     return embeddings_model.embed_query(text)
 
-# Create FAISS index for embeddings
+# Create FAISS index for embeddings (adjust dimension as needed)
 index = IndexFlatL2(768) # Dimension of embeddings, adjust as needed
 
 # Assuming docstore as a simple dictionary to store document texts
@@ -47,12 +52,19 @@ faiss_db = FAISS(embedding_function, index, docstore, index_to_docstore_id)
 # Process and store embeddings
 logging.info("Storing embeddings in FAISS...")
 for i, text in enumerate(texts):
-    embedding = embedding_function(text)
-    faiss_db.add_documents([embedding])
+    try:
+        embedding = embedding_function(text)
+        faiss_db.add_documents([embedding])
+    except Exception as e:
+        logging.error(f"Error embedding document {i}: {e}")
 
 # Exporting the vector embeddings database with logging
 logging.info("Exporting the vector embeddings database...")
-faiss_db.save_local("ipc_embed_db")
+try:
+    faiss_db.save_local("ipc_embed_db")
+    logging.info("Export completed successfully.")
+except Exception as e:
+    logging.error(f"Error exporting FAISS database: {e}")
 
 # Log a message to indicate the completion of the process
 logging.info("Process completed successfully.")