DebabrataHalder committed on
Commit
4b2c780
·
verified ·
1 Parent(s): b86dd1f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -16
app.py CHANGED
@@ -1,15 +1,18 @@
1
  import os
2
- import time
3
  import logging
4
  from dotenv import load_dotenv
5
  import streamlit as st
6
  from PyPDF2 import PdfReader
7
  from langchain.text_splitter import CharacterTextSplitter
8
  from langchain_cohere import CohereEmbeddings
9
- from langchain.vectorstores import FAISS
10
  from langchain.memory import ConversationBufferMemory
11
  from langchain.chains import ConversationalRetrievalChain
12
  from langchain_groq import ChatGroq
 
 
 
 
13
 
14
  # Load environment variables
15
  load_dotenv()
@@ -40,23 +43,26 @@ def get_text_chunks(text):
40
  chunks = text_splitter.split_text(text)
41
  return chunks
42
 
43
- # Function to create a FAISS vectorstore with throttling
44
  def get_vectorstore(text_chunks):
45
  cohere_api_key = os.getenv("COHERE_API_KEY")
46
  embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)
47
- vectorstore = FAISS()
48
-
49
- batch_size = 10 # Number of chunks to process per batch
50
- for i in range(0, len(text_chunks), batch_size):
51
- batch = text_chunks[i:i + batch_size]
52
- try:
53
- vectors = embeddings.embed_documents(batch)
54
- vectorstore.add_texts(texts=batch, embeddings=vectors)
55
- logging.info(f"Processed batch {i // batch_size + 1}")
56
- except Exception as e:
57
- logging.error(f"Error processing batch {i // batch_size + 1}: {e}")
58
- time.sleep(1.5) # Sleep to avoid exceeding rate limit
59
-
 
 
 
60
  return vectorstore
61
 
62
  # Function to set up the conversational retrieval chain
 
1
  import os
 
2
  import logging
3
  from dotenv import load_dotenv
4
  import streamlit as st
5
  from PyPDF2 import PdfReader
6
  from langchain.text_splitter import CharacterTextSplitter
7
  from langchain_cohere import CohereEmbeddings
8
+ from langchain_community.vectorstores import FAISS
9
  from langchain.memory import ConversationBufferMemory
10
  from langchain.chains import ConversationalRetrievalChain
11
  from langchain_groq import ChatGroq
12
+ from langchain_core.docstore import InMemoryDocstore
13
+ import faiss
14
+ from uuid import uuid4
15
+ from langchain_core.documents import Document
16
 
17
  # Load environment variables
18
  load_dotenv()
 
43
  chunks = text_splitter.split_text(text)
44
  return chunks
45
 
46
# Function to create a FAISS vectorstore from pre-split text chunks.
def get_vectorstore(text_chunks):
    """Build a FAISS vectorstore over *text_chunks* using Cohere embeddings.

    Args:
        text_chunks: list of pre-split text strings (output of the text splitter).

    Returns:
        A LangChain ``FAISS`` vectorstore containing one ``Document`` per
        chunk; empty (but fully initialized) when *text_chunks* is empty.
    """
    cohere_api_key = os.getenv("COHERE_API_KEY")
    embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)

    # Probe the embedding dimension once so the FAISS index width matches
    # whatever the configured model actually returns.
    embedding_size = len(embeddings.embed_query("sample text"))
    index = faiss.IndexFlatL2(embedding_size)
    vectorstore = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )

    # Guard: skip the add step entirely for empty input — calling
    # add_documents with an empty batch would still issue a useless
    # embeddings API request (and may error on some backends).
    if text_chunks:
        documents = [Document(page_content=chunk) for chunk in text_chunks]
        # Stable unique IDs so repeated uploads never collide in the docstore.
        ids = [str(uuid4()) for _ in documents]
        vectorstore.add_documents(documents=documents, ids=ids)

    return vectorstore
67
 
68
  # Function to set up the conversational retrieval chain