AkashDataScience committed on
Commit
b2fe6e1
·
1 Parent(s): 82eb5b8

Modular code

Browse files
Files changed (1) hide show
  1. app.py +53 -13
app.py CHANGED
@@ -27,14 +27,18 @@ chain = prompt | gemini
27
 
28
  index_name = "langchain-test-index"
29
 
30
- def store_embeddings(pdf_path, chunk_size, chunk_overlap):
31
  raw_documents = []
32
  for path in pdf_path:
33
  raw_documents.extend(PyPDFLoader(path).load())
 
34
 
 
35
  text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
36
  documents = text_splitter.split_documents(raw_documents)
 
37
 
 
38
  pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
39
 
40
  index = pc.Index(host="https://langchain-test-index-la2n80y.svc.aped-4627-b74a.pinecone.io")
@@ -42,13 +46,53 @@ def store_embeddings(pdf_path, chunk_size, chunk_overlap):
42
  if index.describe_index_stats()['total_vector_count'] > 0:
43
  index.delete(delete_all=True)
44
 
 
45
  chroma_db = Chroma.from_documents(documents, embeddings, persist_directory="./chroma_db")
 
 
46
  faiss_db = FAISS.from_documents(documents, embeddings)
47
  faiss_db.save_local("./faiss_db")
 
 
48
  lance_db = LanceDB.from_documents(documents, embeddings, uri="./lance_db")
 
 
49
  pinecone_db = PineconeVectorStore.from_documents(documents, index_name=index_name,
50
  embedding=embeddings)
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  return "All embeddings are stored in vector database"
53
 
54
  title = "PDF Chat"
@@ -57,21 +101,17 @@ examples = [[["data/amazon-10-k-2024.pdf"], 1000, 100],
57
  [["data/goog-10-k-2023.pdf"], 1000, 100]]
58
 
59
  def inference(query):
60
- chroma_db = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)
61
- chroma_docs = chroma_db.similarity_search(query)
62
- chroma_answer = chain.invoke({"context":chroma_docs, "question": query}, return_only_outputs=True)
63
 
64
- faiss_db = FAISS.load_local("./faiss_db", embeddings, allow_dangerous_deserialization=True)
65
- faiss_docs = faiss_db.similarity_search(query)
66
- faiss_answer = chain.invoke({"context":faiss_docs, "question": query}, return_only_outputs=True)
67
 
68
- lance_db = LanceDB(embedding=embeddings, uri="./lance_db")
69
- lance_docs = lance_db.similarity_search(query)
70
- lance_answer = chain.invoke({"context":lance_docs, "question": query}, return_only_outputs=True)
71
 
72
- pinecone_db = PineconeVectorStore(index_name=index_name, embedding=embeddings)
73
- pinecone_docs = pinecone_db.similarity_search(query)
74
- pinecoce_answer = chain.invoke({"context":pinecone_docs, "question": query}, return_only_outputs=True)
75
 
76
  return chroma_answer, faiss_answer, lance_answer, pinecoce_answer
77
 
 
27
 
28
  index_name = "langchain-test-index"
29
 
30
def extract_text_from_pdf(pdf_path):
    """Load every PDF listed in ``pdf_path`` into one flat list of documents.

    ``pdf_path`` is iterated, so it must be a collection of paths. Each path
    is handed to ``PyPDFLoader`` and whatever ``load()`` returns is appended
    in order.
    """
    return [doc for path in pdf_path for doc in PyPDFLoader(path).load()]
35
 
36
def chunk_text(raw_documents, chunk_size=1000, chunk_overlap=100):
    """Split loaded documents into overlapping character chunks.

    Args:
        raw_documents: Documents to split (as returned by
            ``extract_text_from_pdf``).
        chunk_size: Maximum chunk size passed to ``CharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The list of chunked documents.
    """
    # The chunk settings are explicit parameters: reading them as free
    # names raised NameError because nothing visible in this module
    # defines them. Defaults match the values used in ``examples``.
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(raw_documents)


def delete_pinecone():
    """Delete all vectors from the Pinecone index if it currently holds any."""
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

    index = pc.Index(host="https://langchain-test-index-la2n80y.svc.aped-4627-b74a.pinecone.io")

    # Only issue the delete when the index reports stored vectors.
    if index.describe_index_stats()['total_vector_count'] > 0:
        index.delete(delete_all=True)


def store_chroma_db(documents):
    """Embed ``documents`` into a Chroma store persisted at ``./chroma_db``."""
    Chroma.from_documents(documents, embeddings, persist_directory="./chroma_db")


def store_faiss_db(documents):
    """Embed ``documents`` into a FAISS index and save it to ``./faiss_db``."""
    faiss_db = FAISS.from_documents(documents, embeddings)
    faiss_db.save_local("./faiss_db")


def store_lance_db(documents):
    """Embed ``documents`` into a LanceDB store located at ``./lance_db``."""
    LanceDB.from_documents(documents, embeddings, uri="./lance_db")


def store_pinecone_db(documents):
    """Embed ``documents`` into the Pinecone index named by ``index_name``."""
    PineconeVectorStore.from_documents(documents, index_name=index_name,
                                       embedding=embeddings)


def load_chroma_db():
    """Open the Chroma store persisted at ``./chroma_db``."""
    return Chroma(persist_directory="./chroma_db", embedding_function=embeddings)


def load_faiss_db():
    """Load the FAISS index previously saved to ``./faiss_db``."""
    # NOTE(review): allow_dangerous_deserialization opts in to loading a
    # serialized index from disk; keep ./faiss_db limited to files written
    # by store_faiss_db.
    return FAISS.load_local("./faiss_db", embeddings, allow_dangerous_deserialization=True)


def load_lance_db():
    """Open the LanceDB store located at ``./lance_db``."""
    return LanceDB(embedding=embeddings, uri="./lance_db")


def connect_pinecone_db():
    """Connect to the Pinecone index named by ``index_name``."""
    return PineconeVectorStore(index_name=index_name, embedding=embeddings)


def invoke_chain(db, query):
    """Answer ``query`` using documents retrieved from ``db``.

    Runs ``db.similarity_search(query)`` and passes the hits as ``context``,
    together with the question, to the module-level ``chain``.
    """
    docs = db.similarity_search(query)
    return chain.invoke({"context":docs, "question": query}, return_only_outputs=True)


def store_embeddings(pdf_path, chunk_size, chunk_overlap):
    """Load, chunk and index the given PDFs in all four vector stores.

    Args:
        pdf_path: Collection of PDF paths to ingest.
        chunk_size: Chunk size forwarded to ``chunk_text``.
        chunk_overlap: Chunk overlap forwarded to ``chunk_text``.

    Returns:
        A fixed status message once every store has been written.
    """
    raw_documents = extract_text_from_pdf(pdf_path)

    # Forward the caller's chunk settings instead of relying on globals.
    documents = chunk_text(raw_documents, chunk_size, chunk_overlap)

    # Clear the Pinecone index before writing the new vectors to it.
    delete_pinecone()

    store_chroma_db(documents)
    # Previously store_chroma_db was called a second time here, so Chroma
    # received duplicate inserts and ./faiss_db was never written.
    store_faiss_db(documents)
    store_lance_db(documents)
    store_pinecone_db(documents)

    return "All embeddings are stored in vector database"
97
 
98
  title = "PDF Chat"
 
101
  [["data/goog-10-k-2023.pdf"], 1000, 100]]
102
 
103
  def inference(query):
104
+ chroma_db = load_chroma_db()
105
+ chroma_answer = invoke_chain(chroma_db, query)
 
106
 
107
+ faiss_db = load_faiss_db()
108
+ faiss_answer = invoke_chain(faiss_db, query)
 
109
 
110
+ lance_db = load_lance_db()
111
+ lance_answer = invoke_chain(lance_db, query)
 
112
 
113
+ pinecone_db = connect_pinecone_db()
114
+ pinecoce_answer = invoke_chain(pinecone_db, query)
 
115
 
116
  return chroma_answer, faiss_answer, lance_answer, pinecoce_answer
117