jarif committed on
Commit
e3c633c
·
verified ·
1 Parent(s): 5d7e9c6

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +124 -124
  2. ingest.py +99 -99
app.py CHANGED
@@ -1,124 +1,124 @@
1
- import streamlit as st
2
- import os
3
- import faiss
4
- import logging
5
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
6
- from langchain_community.embeddings import HuggingFaceEmbeddings
7
- from langchain_community.vectorstores import FAISS
8
- from langchain_community.llms import HuggingFacePipeline
9
- from langchain.chains import RetrievalQA
10
- from ingest import create_faiss_index_from_pdfs
11
-
12
- # Set up logging
13
- logging.basicConfig(level=logging.INFO)
14
- logger = logging.getLogger(__name__)
15
-
16
- checkpoint = "LaMini-T5-738M"
17
-
18
- @st.cache_resource
19
- def load_llm():
20
- tokenizer = AutoTokenizer.from_pretrained(checkpoint)
21
- model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
22
- pipe = pipeline(
23
- 'text2text-generation',
24
- model=model,
25
- tokenizer=tokenizer,
26
- max_length=256,
27
- do_sample=True,
28
- temperature=0.3,
29
- top_p=0.95
30
- )
31
- return HuggingFacePipeline(pipeline=pipe)
32
-
33
- def validate_index_file(index_path):
34
- try:
35
- with open(index_path, 'rb') as f:
36
- data = f.read(100)
37
- logger.info(f"Successfully read {len(data)} bytes from the index file")
38
- return True
39
- except Exception as e:
40
- logger.error(f"Error validating index file: {e}")
41
- return False
42
-
43
- def load_faiss_index():
44
- index_path = "faiss_index/index.faiss"
45
- if not os.path.exists(index_path):
46
- st.warning("Index file not found. Creating a new one...")
47
- create_faiss_index_from_pdfs()
48
-
49
- if not os.path.exists(index_path):
50
- st.error("Failed to create the FAISS index. Please check the 'docs' directory and try again.")
51
- raise RuntimeError("FAISS index creation failed.")
52
-
53
- try:
54
- index = faiss.read_index(index_path)
55
- if index is None:
56
- raise ValueError("Failed to read FAISS index.")
57
-
58
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
59
- db = FAISS.load_local("faiss_index", embeddings)
60
- if db.index is None or db.index_to_docstore_id is None:
61
- raise ValueError("FAISS index or docstore_id mapping is None.")
62
-
63
- return db.as_retriever()
64
- except Exception as e:
65
- st.error(f"Failed to load FAISS index: {e}")
66
- logger.exception("Exception in load_faiss_index")
67
- raise
68
-
69
- def process_answer(instruction):
70
- try:
71
- retriever = load_faiss_index()
72
- llm = load_llm()
73
- qa = RetrievalQA.from_chain_type(
74
- llm=llm,
75
- chain_type="stuff",
76
- retriever=retriever,
77
- return_source_documents=True
78
- )
79
- generated_text = qa.invoke(instruction)
80
- answer = generated_text['result']
81
- return answer, generated_text
82
- except Exception as e:
83
- st.error(f"An error occurred while processing the answer: {e}")
84
- logger.exception("Exception in process_answer")
85
- return "An error occurred while processing your request.", {}
86
-
87
- def diagnose_faiss_index():
88
- index_path = "faiss_index/index.faiss"
89
- if os.path.exists(index_path):
90
- st.write(f"Index file size: {os.path.getsize(index_path)} bytes")
91
- st.write(f"Index file permissions: {oct(os.stat(index_path).st_mode)[-3:]}")
92
- st.write(f"Index file owner: {os.stat(index_path).st_uid}")
93
- st.write(f"Current process user ID: {os.getuid()}")
94
- validate_index_file(index_path)
95
- else:
96
- st.warning("Index file does not exist.")
97
-
98
- def main():
99
- st.title("Search Your PDF 📚📝")
100
-
101
- with st.expander("About the App"):
102
- st.markdown(
103
- """
104
- This is a Generative AI powered Question and Answering app that responds to questions about your PDF File.
105
- """
106
- )
107
-
108
- diagnose_faiss_index()
109
-
110
- question = st.text_area("Enter your Question")
111
-
112
- if st.button("Ask"):
113
- st.info("Your Question: " + question)
114
- st.info("Your Answer")
115
- try:
116
- answer, metadata = process_answer(question)
117
- st.write(answer)
118
- st.write(metadata)
119
- except Exception as e:
120
- st.error(f"An unexpected error occurred: {e}")
121
- logger.exception("Unexpected error in main function")
122
-
123
- if __name__ == '__main__':
124
- main()
 
1
+ import streamlit as st
2
+ import os
3
+ import faiss
4
+ import logging
5
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
6
+ from langchain_community.embeddings import HuggingFaceEmbeddings
7
+ from langchain_community.vectorstores import FAISS
8
+ from langchain_community.llms import HuggingFacePipeline
9
+ from langchain.chains import RetrievalQA
10
+ from ingest import create_faiss_index_from_pdfs
11
+
12
# Logging: configure the root logger at INFO and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Checkpoint name handed to ``from_pretrained`` when the LLM is loaded.
checkpoint = "LaMini-T5-738M"
17
+
18
@st.cache_resource
def load_llm():
    """Build the text2text-generation LLM wrapper used by the QA chain.

    Loads the tokenizer and the seq2seq model named by the module-level
    ``checkpoint``, wraps them in a sampling ``pipeline`` and returns that
    pipeline wrapped for LangChain. The function is decorated with
    ``st.cache_resource``.

    :return: ``HuggingFacePipeline`` wrapping the configured pipeline.
    """
    generation_settings = {
        "max_length": 256,
        "do_sample": True,
        "temperature": 0.3,
        "top_p": 0.95,
    }
    generator = pipeline(
        'text2text-generation',
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
        **generation_settings,
    )
    return HuggingFacePipeline(pipeline=generator)
32
+
33
def validate_index_file(index_path):
    """Check that the index file can be opened and read.

    Opens ``index_path`` in binary mode, reads up to the first 100 bytes and
    logs how many bytes were obtained.

    :param index_path: Path of the file to probe.
    :return: ``True`` when the read succeeded; ``False`` when any exception
        was raised (the error is logged).
    """
    try:
        with open(index_path, 'rb') as handle:
            header = handle.read(100)
    except Exception as e:
        logger.error(f"Error validating index file: {e}")
        return False
    else:
        logger.info(f"Successfully read {len(header)} bytes from the index file")
        return True
42
+
43
def load_faiss_index():
    """Return a retriever over the on-disk FAISS index, building it if absent.

    When ``faiss_index/index.faiss`` does not exist, a warning is shown and
    ``create_faiss_index_from_pdfs()`` is called to build it. The raw index
    file is then read once with ``faiss.read_index`` as a sanity check before
    the vector store is loaded from the ``faiss_index`` directory.

    :return: The retriever produced by ``db.as_retriever()``.
    :raises RuntimeError: If the index file is still missing after the build
        attempt.
    :raises Exception: Any error raised while reading or loading the index is
        shown with ``st.error``, logged with its traceback, and re-raised.
    """
    index_path = "faiss_index/index.faiss"

    index_missing = not os.path.exists(index_path)
    if index_missing:
        st.warning("Index file not found. Creating a new one...")
        create_faiss_index_from_pdfs()

    # Re-check: the build step reports failures only through logging.
    if not os.path.exists(index_path):
        st.error("Failed to create the FAISS index. Please check the 'docs' directory and try again.")
        raise RuntimeError("FAISS index creation failed.")

    try:
        if faiss.read_index(index_path) is None:
            raise ValueError("Failed to read FAISS index.")

        embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        # NOTE(review): newer langchain_community releases appear to require
        # allow_dangerous_deserialization=True here -- confirm against the
        # installed version before changing this call.
        store = FAISS.load_local("faiss_index", embedder)
        if store.index is None or store.index_to_docstore_id is None:
            raise ValueError("FAISS index or docstore_id mapping is None.")

        return store.as_retriever()
    except Exception as e:
        st.error(f"Failed to load FAISS index: {e}")
        logger.exception("Exception in load_faiss_index")
        raise
68
+
69
def process_answer(instruction):
    """Answer ``instruction`` with a retrieval-augmented QA chain.

    Builds a ``RetrievalQA`` chain of type ``"stuff"`` from the FAISS
    retriever and the LLM, invokes it on ``instruction`` and returns the
    result together with the full chain output.

    :param instruction: The user's question, passed to ``qa.invoke``.
    :return: ``(answer, chain_output)`` on success, where ``answer`` is
        ``chain_output['result']``. On any exception the error is shown with
        ``st.error``, logged, and a fixed apology string with an empty dict
        is returned instead.
    """
    try:
        retriever = load_faiss_index()
        chain = RetrievalQA.from_chain_type(
            retriever=retriever,
            llm=load_llm(),
            chain_type="stuff",
            return_source_documents=True,
        )
        response = chain.invoke(instruction)
        return response['result'], response
    except Exception as e:
        st.error(f"An error occurred while processing the answer: {e}")
        logger.exception("Exception in process_answer")
        return "An error occurred while processing your request.", {}
86
+
87
def diagnose_faiss_index():
    """Write basic diagnostics about the FAISS index file to the page.

    Shows the size, permission bits and owner uid of
    ``faiss_index/index.faiss`` plus the uid of the current process, then
    probes the file with ``validate_index_file``. A warning is shown when the
    file does not exist or cannot be read.
    """
    index_path = "faiss_index/index.faiss"
    if not os.path.exists(index_path):
        st.warning("Index file does not exist.")
        return

    # Stat once so every reported value comes from the same snapshot.
    info = os.stat(index_path)
    st.write(f"Index file size: {info.st_size} bytes")
    st.write(f"Index file permissions: {oct(info.st_mode)[-3:]}")
    st.write(f"Index file owner: {info.st_uid}")

    # os.getuid exists only on Unix; calling it unguarded would raise
    # AttributeError on other platforms before the page finishes rendering.
    if hasattr(os, "getuid"):
        st.write(f"Current process user ID: {os.getuid()}")
    else:
        st.write("Current process user ID: unavailable on this platform")

    # Surface a failed read instead of discarding the probe's result.
    if not validate_index_file(index_path):
        st.warning("Index file exists but could not be read.")
97
+
98
def main():
    """Render the Streamlit page: title, about box, diagnostics and the Q&A form.

    The question is answered only on a run in which the "Ask" button was
    pressed; the answer and the full chain output are both written to the
    page. Any exception escaping ``process_answer`` is shown and logged.
    """
    st.title("Search Your PDF 📚📝")

    with st.expander("About the App"):
        st.markdown(
            """
            This is a Generative AI powered Question and Answering app that responds to questions about your PDF File.
            """
        )

    diagnose_faiss_index()

    question = st.text_area("Enter your Question")

    # Nothing more to render until the user submits the question.
    if not st.button("Ask"):
        return

    st.info("Your Question: " + question)
    st.info("Your Answer")
    try:
        answer, metadata = process_answer(question)
        st.write(answer)
        st.write(metadata)
    except Exception as e:
        st.error(f"An unexpected error occurred: {e}")
        logger.exception("Unexpected error in main function")


if __name__ == '__main__':
    main()
ingest.py CHANGED
@@ -1,99 +1,99 @@
1
- import os
2
- import logging
3
- from langchain_community.document_loaders import PDFMinerLoader
4
- from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from langchain_community.embeddings import HuggingFaceEmbeddings
6
- from langchain_community.vectorstores import FAISS
7
-
8
- # Set up logging
9
- logging.basicConfig(level=logging.INFO)
10
- logger = logging.getLogger(__name__)
11
-
12
- def create_faiss_index(texts, embeddings):
13
- """
14
- Create a FAISS index from text chunks and embeddings.
15
- :param texts: List of text chunks.
16
- :param embeddings: HuggingFaceEmbeddings object.
17
- :return: FAISS index object.
18
- """
19
- try:
20
- db = FAISS.from_documents(texts, embeddings)
21
- logger.info(f"Created FAISS index with {len(texts)} vectors")
22
- # Check the FAISS index size
23
- if len(db.index) > 0:
24
- logger.info(f"FAISS index contains {len(db.index)} vectors.")
25
- else:
26
- logger.error("FAISS index contains 0 vectors after creation. Check the data and embeddings.")
27
- except Exception as e:
28
- logger.error(f"Failed to create FAISS index: {e}")
29
- return None
30
-
31
- return db
32
-
33
- def save_faiss_index(db, index_path):
34
- """
35
- Save the FAISS index to a specified path.
36
- :param db: FAISS index object.
37
- :param index_path: Path to save the index.
38
- """
39
- try:
40
- db.save_local(index_path)
41
- # Check the file size
42
- index_file_path = os.path.join(index_path, "index.faiss")
43
- file_size = os.path.getsize(index_file_path)
44
- logger.info(f"FAISS index saved to {index_path}")
45
- logger.info(f"Index file size: {file_size} bytes")
46
- if file_size == 0:
47
- logger.error(f"Index file '{index_file_path}' is empty.")
48
- except Exception as e:
49
- logger.error(f"Failed to save FAISS index to {index_path}: {e}")
50
-
51
- def create_faiss_index_from_pdfs():
52
- documents = []
53
- docs_dir = "docs"
54
-
55
- if not os.path.exists(docs_dir):
56
- logger.error(f"The directory '{docs_dir}' does not exist.")
57
- return
58
-
59
- for root, dirs, files in os.walk(docs_dir):
60
- for file in files:
61
- if file.endswith(".pdf"):
62
- file_path = os.path.join(root, file)
63
- logger.info(f"Loading document: {file_path}")
64
- try:
65
- loader = PDFMinerLoader(file_path)
66
- documents.extend(loader.load())
67
- except Exception as e:
68
- logger.error(f"Error loading {file_path}: {e}")
69
-
70
- if not documents:
71
- logger.error("No documents were loaded. Check the 'docs' directory and file paths.")
72
- return
73
-
74
- logger.info(f"Loaded {len(documents)} documents.")
75
-
76
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
77
- texts = text_splitter.split_documents(documents)
78
-
79
- if not texts:
80
- logger.error("No text chunks were created. Check the text splitting process.")
81
- return
82
-
83
- logger.info(f"Created {len(texts)} text chunks.")
84
-
85
- try:
86
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
87
- except Exception as e:
88
- logger.error(f"Failed to initialize embeddings: {e}")
89
- return
90
-
91
- db = create_faiss_index(texts, embeddings)
92
- if db:
93
- index_dir = "faiss_index"
94
- if not os.path.exists(index_dir):
95
- os.makedirs(index_dir)
96
- save_faiss_index(db, index_dir)
97
-
98
- if __name__ == "__main__":
99
- create_faiss_index_from_pdfs()
 
1
+ import os
2
+ import logging
3
+ from langchain_community.document_loaders import PDFMinerLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain_community.embeddings import HuggingFaceEmbeddings
6
+ from langchain_community.vectorstores import FAISS
7
+
8
# Logging: configure the root logger at INFO and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
11
+
12
def create_faiss_index(texts, embeddings):
    """
    Create a FAISS vector store from text chunks and embeddings.

    :param texts: List of text chunks (passed to ``FAISS.from_documents``).
    :param embeddings: HuggingFaceEmbeddings object.
    :return: FAISS vector store, or ``None`` if building it raised.
    """
    try:
        db = FAISS.from_documents(texts, embeddings)
    except Exception as e:
        logger.error(f"Failed to create FAISS index: {e}")
        return None

    logger.info(f"Created FAISS index with {len(texts)} vectors")

    # faiss index objects do not implement __len__; ``ntotal`` is the number
    # of stored vectors. Calling len() here raised TypeError, which the
    # blanket handler turned into a None return even though the build worked.
    vector_count = db.index.ntotal
    if vector_count > 0:
        logger.info(f"FAISS index contains {vector_count} vectors.")
    else:
        logger.error("FAISS index contains 0 vectors after creation. Check the data and embeddings.")

    return db
32
+
33
def save_faiss_index(db, index_path):
    """
    Persist the FAISS store to ``index_path`` and log the resulting file size.

    Any exception raised while saving or inspecting the file is logged and
    not propagated.

    :param db: FAISS index object (must provide ``save_local``).
    :param index_path: Directory to save the index into.
    """
    try:
        db.save_local(index_path)
        # Inspect the written file so an empty index is reported in the log.
        saved_file = os.path.join(index_path, "index.faiss")
        size_in_bytes = os.path.getsize(saved_file)
        logger.info(f"FAISS index saved to {index_path}")
        logger.info(f"Index file size: {size_in_bytes} bytes")
        if not size_in_bytes:
            logger.error(f"Index file '{saved_file}' is empty.")
    except Exception as e:
        logger.error(f"Failed to save FAISS index to {index_path}: {e}")
50
+
51
def _load_pdf_documents(docs_dir):
    """Load every PDF found under ``docs_dir`` (recursively) and return the combined documents."""
    documents = []
    for root, _dirs, files in os.walk(docs_dir):
        for file in files:
            # Compare the extension case-insensitively so files such as
            # "REPORT.PDF" are not silently skipped.
            if not file.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, file)
            logger.info(f"Loading document: {file_path}")
            try:
                documents.extend(PDFMinerLoader(file_path).load())
            except Exception as e:
                # One unreadable PDF must not abort ingestion of the others.
                logger.error(f"Error loading {file_path}: {e}")
    return documents


def create_faiss_index_from_pdfs():
    """
    Build the FAISS index in ``faiss_index`` from the PDFs under ``docs``.

    Steps: load all PDFs, split them into 500-character chunks with a
    50-character overlap, embed the chunks with
    ``sentence-transformers/all-MiniLM-L6-v2``, build the index and save it.
    Each failure is logged and ends the function early; nothing is raised
    for the handled cases and the function always returns ``None``.
    """
    docs_dir = "docs"

    if not os.path.exists(docs_dir):
        logger.error(f"The directory '{docs_dir}' does not exist.")
        return

    documents = _load_pdf_documents(docs_dir)
    if not documents:
        logger.error("No documents were loaded. Check the 'docs' directory and file paths.")
        return

    logger.info(f"Loaded {len(documents)} documents.")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)

    if not texts:
        logger.error("No text chunks were created. Check the text splitting process.")
        return

    logger.info(f"Created {len(texts)} text chunks.")

    try:
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    except Exception as e:
        logger.error(f"Failed to initialize embeddings: {e}")
        return

    db = create_faiss_index(texts, embeddings)
    if db is None:
        # create_faiss_index has already logged the failure.
        return

    index_dir = "faiss_index"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(index_dir, exist_ok=True)
    save_faiss_index(db, index_dir)


if __name__ == "__main__":
    create_faiss_index_from_pdfs()