baderanas committed on
Commit cdf244e · verified · 1 Parent(s): 0b87e1e

Upload 12 files
.gitignore ADDED
@@ -0,0 +1,174 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Ruff stuff:
171
+ .ruff_cache/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
README.md CHANGED
@@ -1,10 +1 @@
1
- ---
2
- title: Rag Medical
3
- emoji: 🔥
4
- colorFrom: purple
5
- colorTo: gray
6
- sdk: static
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # albariqi-rag
app.py ADDED
@@ -0,0 +1,277 @@
1
+ import streamlit as st
2
+ import os
3
+ from chroma_operations.ingestion import ingest
4
+ from rag import ask_question
5
+ from chroma_operations.delete_chroma import remove_from_chroma
6
+ import json
7
+
8
+
9
+ # Get list of processed PDF file names without `.pdf`
10
+ def get_processed_file_names(folder_path="docs/processed"):
11
+ try:
12
+ files = os.listdir(folder_path)
13
+ pdfs = [f[:-4] for f in files if f.endswith(".pdf")]
14
+ return sorted(pdfs)
15
+ except Exception as e:
16
+ st.error(f"Error reading folder: {e}")
17
+ return []
18
+
19
+
20
+ st.set_page_config(page_title="RAG Demo", layout="centered")
21
+ st.title("📄 Retrieval-Augmented Generation (RAG) Demo")
22
+
23
+ # Create tabs for different functionalities
24
+ tab1, tab2, tab3 = st.tabs(["Ask Questions", "Upload Documents", "Manage Files"])
25
+
26
+ with tab1:
27
+ st.markdown("Ask a question based on a specific processed document.")
28
+
29
+ # Add a refresh button
30
+ if st.button("🔄 Refresh Document List"):
31
+ st.success("Document list refreshed!")
32
+ # No need to do anything else - Streamlit will rerun and refresh the list
33
+
34
+ # Fetch available document names
35
+ doc_names = get_processed_file_names()
36
+
37
+ # Select box with search
38
+ file_name = st.selectbox(
39
+ "📁 Select a Document",
40
+ doc_names,
41
+ index=0 if doc_names else None,
42
+ placeholder="Type to search...",
43
+ )
44
+
45
+ # User question
46
+ query_text = st.text_input(
47
+ "🧠 Your Question",
48
+ placeholder="e.g. What are the treatment steps for diabetes?",
49
+ )
50
+
51
+ if st.button("Ask"):
52
+ if not query_text or not file_name:
53
+ st.warning("Please fill in both the question and select a document.")
54
+ else:
55
+ with st.spinner("Processing..."):
56
+ try:
57
+ response = ask_question(query_text, file_name)
58
+
59
+ if response:
60
+ st.success("✅ Answer:")
61
+ st.markdown(f"**{response['answer']}**")
62
+
63
+ with st.expander("📚 Retrieved Chunks"):
64
+ for i, chunk in enumerate(response["chunks"]):
65
+ st.markdown(f"**Chunk {i+1}:** {chunk}")
66
+ else:
67
+ st.error(f"Error in the answer")
68
+
69
+ except Exception as e:
70
+ st.error(f"Failed to connect to the backend: {e}")
71
+
72
+ with tab2:
73
+ st.markdown("Upload new documents to be processed for the RAG system.")
74
+
75
+ # Ensure directories exist
76
+ os.makedirs("docs/unprocessed", exist_ok=True)
77
+ os.makedirs("docs/processed", exist_ok=True)
78
+
79
+ # File uploader
80
+ uploaded_file = st.file_uploader("Upload PDF Document", type=["pdf"])
81
+
82
+ if uploaded_file is not None:
83
+ st.info(f"File '{uploaded_file.name}' ready for upload")
84
+
85
+ # Create columns for buttons
86
+ col1, col2 = st.columns(2)
87
+
88
+ # Upload button
89
+ if col1.button("Upload to System"):
90
+ try:
91
+ # Save the uploaded file to the docs/unprocessed directory
92
+ with open(os.path.join("docs/unprocessed", uploaded_file.name), "wb") as f:
93
+ f.write(uploaded_file.getbuffer())
94
+ st.success(f"File '{uploaded_file.name}' saved to docs/unprocessed/")
95
+ except Exception as e:
96
+ st.error(f"Error saving file: {e}")
97
+
98
+ # Ingest button
99
+ if col2.button("Process Document"):
100
+ try:
101
+ with st.spinner("Processing document... This may take a while."):
102
+ # Call the ingestion script
103
+ result = ingest()
104
+
105
+ if result:
106
+ st.success("Document processed successfully!")
107
+ # Refresh the list of available documents
108
+ doc_names = get_processed_file_names()
109
+ else:
110
+ st.error(f"Error processing document")
111
+ except Exception as e:
112
+ st.error(f"Error running ingestion process: {e}")
113
+
114
+ # Display list of files in unprocessed folder
115
+ st.subheader("Unprocessed Documents")
116
+ try:
117
+ unprocessed_files = os.listdir("docs/unprocessed")
118
+ if unprocessed_files:
119
+ for file in unprocessed_files:
120
+ st.text(f"• {file}")
121
+ else:
122
+ st.info("No unprocessed documents.")
123
+ except Exception as e:
124
+ st.error(f"Error reading unprocessed folder: {e}")
125
+
126
+ # Display list of processed files
127
+ st.subheader("Processed Documents")
128
+ try:
129
+ processed_files = os.listdir("docs/processed")
130
+ processed_files = [f for f in processed_files if f.endswith(".pdf")]
131
+ if processed_files:
132
+ for file in processed_files:
133
+ st.text(f"• {file}")
134
+ else:
135
+ st.info("No processed documents.")
136
+ except Exception as e:
137
+ st.error(f"Error reading processed folder: {e}")
138
+
139
+ with tab3:
140
+ st.markdown(
141
+ "Manage your documents by deleting files from processed or unprocessed folders."
142
+ )
143
+
144
+ col1, col2 = st.columns(2)
145
+
146
+ with col1:
147
+ st.subheader("Delete Unprocessed Documents")
148
+ try:
149
+ unprocessed_files = os.listdir("docs/unprocessed")
150
+ if unprocessed_files:
151
+ file_to_delete_unprocessed = st.selectbox(
152
+ "Select file to delete from unprocessed folder",
153
+ unprocessed_files,
154
+ key="unprocessed_select",
155
+ )
156
+
157
+ if st.button("Delete Unprocessed File", key="delete_unprocessed"):
158
+ try:
159
+ file_path = os.path.join(
160
+ "docs/unprocessed", file_to_delete_unprocessed
161
+ )
162
+ os.remove(file_path)
163
+ st.success(f"Successfully deleted {file_to_delete_unprocessed}")
164
+ # Force refresh the app to show the updated file list
165
+ st.rerun()
166
+ except Exception as e:
167
+ st.error(f"Error deleting file: {e}")
168
+ else:
169
+ st.info("No unprocessed documents to delete.")
170
+ except Exception as e:
171
+ st.error(f"Error accessing unprocessed folder: {e}")
172
+
173
+ with col2:
174
+ st.subheader("Delete Processed Documents")
175
+ try:
176
+ processed_files = [
177
+ f for f in os.listdir("docs/processed") if f.endswith(".pdf")
178
+ ]
179
+ if processed_files:
180
+ file_to_delete_processed = st.selectbox(
181
+ "Select file to delete from processed folder",
182
+ processed_files,
183
+ key="processed_select",
184
+ )
185
+
186
+ if st.button("Delete Processed File", key="delete_processed"):
187
+ try:
188
+ # Delete the PDF file
189
+ pdf_path = os.path.join(
190
+ "docs/processed", file_to_delete_processed
191
+ )
192
+ os.remove(pdf_path)
193
+
194
+ # Also delete the corresponding vector store if it exists
195
+ base_name = file_to_delete_processed[
196
+ :-4
197
+ ] # Remove .pdf extension
198
+ vector_store_path = os.path.join(
199
+ "docs/processed", f"{base_name}.faiss"
200
+ )
201
+ if os.path.exists(vector_store_path):
202
+ os.remove(vector_store_path)
203
+
204
+ # Delete metadata file if it exists
205
+ metadata_path = os.path.join(
206
+ "docs/processed", f"{base_name}_metadata.json"
207
+ )
208
+ if os.path.exists(metadata_path):
209
+ os.remove(metadata_path)
210
+
211
+ # Remove document from Chroma DB
212
+ with st.spinner("Removing document from vector database..."):
213
+ remove_from_chroma(base_name)
214
+
215
+ st.success(
216
+ f"Successfully deleted {file_to_delete_processed} and related files"
217
+ )
218
+ # Force refresh the app to show the updated file list
219
+ st.rerun()
220
+ except Exception as e:
221
+ st.error(f"Error deleting file: {e}")
222
+ else:
223
+ st.info("No processed documents to delete.")
224
+ except Exception as e:
225
+ st.error(f"Error accessing processed folder: {e}")
226
+
227
+ # Add a separator
228
+ st.markdown("---")
229
+
230
+ # Delete all files section
231
+ st.subheader("Bulk Operations")
232
+ col3, col4 = st.columns(2)
233
+
234
+ with col3:
235
+ if st.button(
236
+ "Delete ALL Unprocessed Files", type="primary", use_container_width=True
237
+ ):
238
+ try:
239
+ unprocessed_files = os.listdir("docs/unprocessed")
240
+ if unprocessed_files:
241
+ for file in unprocessed_files:
242
+ os.remove(os.path.join("docs/unprocessed", file))
243
+ st.success(
244
+ f"Successfully deleted all {len(unprocessed_files)} unprocessed files"
245
+ )
246
+ # Force refresh
247
+ st.rerun()
248
+ else:
249
+ st.info("No files to delete.")
250
+ except Exception as e:
251
+ st.error(f"Error during bulk deletion: {e}")
252
+
253
+ with col4:
254
+ if st.button(
255
+ "Delete ALL Processed Files", type="primary", use_container_width=True
256
+ ):
257
+ try:
258
+ processed_files = os.listdir("docs/processed")
259
+ if processed_files:
260
+ for file in processed_files:
261
+ file_path = os.path.join("docs/processed", file)
262
+ os.remove(file_path)
263
+
264
+ # If it's a PDF file, also remove from Chroma
265
+ if file.endswith(".pdf"):
266
+ base_name = file[:-4] # Remove .pdf extension
267
+ remove_from_chroma(base_name)
268
+
269
+ st.success(
270
+ f"Successfully deleted all {len(processed_files)} processed files"
271
+ )
272
+ # Force refresh
273
+ st.rerun()
274
+ else:
275
+ st.info("No files to delete.")
276
+ except Exception as e:
277
+ st.error(f"Error during bulk deletion: {e}")
certification_processing.log ADDED
File without changes
chroma_operations/delete_chroma.py ADDED
@@ -0,0 +1,152 @@
1
+ import os
2
+ import logging
3
+ from typing import Optional
4
+ import chromadb
5
+ from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
6
+ from dotenv import load_dotenv
7
+ import shutil
8
+
9
+ # Load environment variables
10
+ load_dotenv()
11
+
12
+ # Setup logging
13
+ logging.basicConfig(
14
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
15
+ )
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ def remove_from_chroma(
20
+ document_name: str, collection_name: str = "rag_collection"
21
+ ) -> bool:
22
+ """
23
+ Remove a document and its chunks from the Chroma vector database.
24
+
25
+ Args:
26
+ document_name (str): The base name of the document (without .pdf extension)
27
+ collection_name (str): Name of the collection in ChromaDB (default: "rag_collection")
28
+
29
+ Returns:
30
+ bool: True if successful, False otherwise
31
+ """
32
+ try:
33
+ logger.info(f"Attempting to remove document '{document_name}' from Chroma DB")
34
+
35
+ # Check if Chroma DB exists
36
+ chroma_path = "./chroma_db"
37
+ if not os.path.exists(chroma_path):
38
+ logger.warning("Chroma DB directory does not exist")
39
+ return False
40
+
41
+ # Initialize embedding function and Chroma client
42
+ embedding_function = OpenAIEmbeddingFunction(
43
+ api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
44
+ )
45
+
46
+ # Connect to the persistent client
47
+ client = chromadb.PersistentClient(path=chroma_path)
48
+
49
+ # Get the collection
50
+ try:
51
+ collection = client.get_collection(
52
+ name=collection_name, embedding_function=embedding_function
53
+ )
54
+ except Exception as e:
55
+ logger.error(f"Collection '{collection_name}' not found: {e}")
56
+ return False
57
+
58
+ # Delete documents where source_file matches the document_name
59
+ try:
60
+ # First, get the IDs of chunks belonging to this document
61
+ results = collection.get(where={"source_file": document_name})
62
+
63
+ ids_to_delete = results.get("ids", [])
64
+
65
+ if not ids_to_delete:
66
+ logger.warning(
67
+ f"No chunks found for document '{document_name}' in collection"
68
+ )
69
+ return True # Nothing to delete, so consider it successful
70
+
71
+ # Delete chunks by IDs
72
+ collection.delete(ids=ids_to_delete)
73
+
74
+ logger.info(
75
+ f"Successfully deleted {len(ids_to_delete)} chunks for '{document_name}' from ChromaDB"
76
+ )
77
+ return True
78
+
79
+ except Exception as e:
80
+ logger.error(f"Error deleting chunks from collection: {e}")
81
+ return False
82
+
83
+ except Exception as e:
84
+ logger.error(f"Error removing document from Chroma DB: {e}")
85
+ return False
86
+
87
+
88
+ def delete_all_from_chroma(collection_name: str = "rag_collection") -> bool:
89
+ """
90
+ Delete all documents from the specified ChromaDB collection.
91
+
92
+ Args:
93
+ collection_name (str): Name of the collection in ChromaDB (default: "rag_collection")
94
+
95
+ Returns:
96
+ bool: True if successful, False otherwise
97
+ """
98
+ try:
99
+ # Initialize embedding function and Chroma client
100
+ embedding_function = OpenAIEmbeddingFunction(
101
+ api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
102
+ )
103
+
104
+ # Connect to the persistent client
105
+ client = chromadb.PersistentClient(path="./chroma_db")
106
+
107
+ try:
108
+ # Get the collection
109
+ collection = client.get_collection(
110
+ name=collection_name, embedding_function=embedding_function
111
+ )
112
+
113
+ # Delete all documents in the collection (Chroma's delete() needs ids or a filter)
114
+ all_ids = collection.get().get("ids", [])
+ if all_ids:
+ collection.delete(ids=all_ids)
115
+ logger.info(
116
+ f"Successfully deleted all documents from collection '{collection_name}'"
117
+ )
118
+ return True
119
+
120
+ except Exception as e:
121
+ logger.error(f"Error accessing or deleting collection: {e}")
122
+ return False
123
+
124
+ except Exception as e:
125
+ logger.error(f"Error connecting to Chroma DB: {e}")
126
+ return False
127
+
128
+
129
+ def reset_chroma_db() -> bool:
130
+ """
131
+ Reset the entire Chroma database by deleting and recreating the directory.
132
+
133
+ Returns:
134
+ bool: True if successful, False otherwise
135
+ """
136
+ try:
137
+ chroma_path = "./chroma_db"
138
+ if os.path.exists(chroma_path):
139
+ # Delete the entire Chroma directory
140
+ shutil.rmtree(chroma_path)
141
+ logger.info("Successfully deleted entire Chroma DB directory")
142
+
143
+ # Create an empty directory
144
+ os.makedirs(chroma_path, exist_ok=True)
145
+ logger.info("Created fresh Chroma DB directory")
146
+ return True
147
+ else:
148
+ logger.warning("Chroma DB directory does not exist")
149
+ return False
150
+ except Exception as e:
151
+ logger.error(f"Error resetting Chroma DB: {e}")
152
+ return False
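A minimal usage sketch for these helpers, assuming OPENAI_API_KEY is set and "diabetes_guidelines" stands in for a real processed document's base name:

from chroma_operations.delete_chroma import remove_from_chroma, reset_chroma_db

# Remove every chunk ingested from "diabetes_guidelines.pdf" (placeholder name)
if remove_from_chroma("diabetes_guidelines"):
    print("Document removed from ./chroma_db")

# reset_chroma_db() would instead wipe and recreate the whole ./chroma_db directory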
chroma_operations/ingestion.py ADDED
@@ -0,0 +1,211 @@
1
+ import os
2
+ import shutil
3
+ import logging
4
+ from datetime import datetime
5
+ import hashlib
6
+ from typing import List, Optional
7
+ import chromadb
8
+ import openai
16
+ from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
17
+ from dotenv import load_dotenv
18
+
19
+ load_dotenv()
20
+
21
+
22
+ from semantic_chunking import semantic_chunking
23
+ from chroma_operations.pdf_processing import extract_pdf_content
24
+
25
+ # Configure logging
26
+ logging.basicConfig(
27
+ level=logging.INFO,
28
+ format="%(asctime)s - %(levelname)s - %(message)s",
29
+ handlers=[
30
+ logging.FileHandler("certification_processing.log"),
31
+ logging.StreamHandler(),
32
+ ],
33
+ )
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ def generate_chunk_id(file: str, chunk: str, position: int) -> str:
38
+ unique_str = f"{file}_{position}_{chunk}"
39
+ return hashlib.sha256(unique_str.encode()).hexdigest()
40
+
41
+
42
+ def move_processed_file(source_path: str, destination_dir: str) -> bool:
43
+ """Move a processed file to the destination directory."""
44
+ try:
45
+ if not os.path.exists(destination_dir):
46
+ os.makedirs(destination_dir)
47
+ destination_path = os.path.join(destination_dir, os.path.basename(source_path))
48
+ os.rename(source_path, destination_path)
49
+ return True
50
+ except Exception as e:
51
+ logger.error(f"Error moving file {source_path}: {str(e)}")
52
+ return False
53
+
54
+
55
+ def get_chroma_client():
56
+ """Initialize ChromaDB client with OpenAI embeddings."""
57
+ try:
58
+ # Initialize embedding function
59
+ embedding_function = OpenAIEmbeddingFunction(
60
+ api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
61
+ )
62
+
63
+ client = chromadb.PersistentClient(path="./chroma_db")
64
+ logger.info("Successfully connected to ChromaDB with OpenAI embeddings")
65
+ return client, embedding_function
66
+ except Exception as e:
67
+ logger.error(f"Error connecting to ChromaDB: {str(e)}")
68
+ logger.exception("Detailed stack trace:")
69
+ return None, None
70
+
71
+
72
+ def create_chroma_collection(
73
+ client, embedding_function, collection_name="rag_collection"
74
+ ):
75
+ """Create or get a ChromaDB collection with proper embedding function."""
76
+ try:
77
+ collection = client.get_or_create_collection(
78
+ name=collection_name,
79
+ embedding_function=embedding_function,
80
+ metadata={"description": "medical documents"},
81
+ )
82
+ logger.info(f"Initialized collection: {collection_name}")
83
+ return collection
84
+ except Exception as e:
85
+ logger.error(f"Error creating ChromaDB collection: {str(e)}")
86
+ logger.exception("Detailed stack trace:")
87
+ return None
88
+
89
+
90
+ def process_file(file_path: str, collection) -> bool:
91
+ """Process a file and add to ChromaDB collection."""
92
+ try:
93
+ if file_path.endswith(".pdf"):
94
+ contents = extract_pdf_content(file_path)
95
+ else:
96
+ logger.warning(f"Skipping unsupported file type: {file_path}")
97
+ return False
98
+
99
+ if not contents:
100
+ logger.warning(f"No content extracted from file: {file_path}")
101
+ return False
102
+
103
+ chunks: List[str] = []
104
+ for content in contents:
105
+ if not content.strip():
106
+ continue
107
+ if "\t" in content or "[TABLE]" in content:
108
+ chunks.append(content)
109
+ else:
110
+ try:
111
+ chunks.extend(semantic_chunking(content))
112
+ except Exception as e:
113
+ logger.error(
114
+ f"Error during chunking for file {file_path}: {str(e)}"
115
+ )
116
+ continue
117
+
118
+ if not chunks:
119
+ logger.warning(f"No valid chunks created for file: {file_path}")
120
+ return False
121
+
122
+ documents = []
123
+ metadatas = []
124
+ ids = []
125
+
126
+ source_file = os.path.basename(file_path)
127
+ if source_file.endswith(".pdf"):
128
+ source_file = source_file[:-4].strip()
129
+
130
+ for i, chunk in enumerate(chunks):
131
+ if not chunk.strip():
132
+ continue
133
+
134
+ try:
135
+ chunk_id = generate_chunk_id(file_path, chunk, i)
136
+ documents.append(chunk)
137
+ metadatas.append(
138
+ {
139
+ "chunk_id": chunk_id,
140
+ "source_file": source_file,
141
+ "position": i,
142
+ "processed_at": datetime.now().isoformat(),
143
+ }
144
+ )
145
+ ids.append(chunk_id)
146
+
147
+ except Exception as e:
148
+ logger.error(f"Error processing chunk from file {file_path}: {str(e)}")
149
+ continue
150
+
151
+ if documents:
152
+ try:
153
+ # Chroma will automatically generate embeddings using the collection's embedding function
154
+ collection.add(documents=documents, metadatas=metadatas, ids=ids)
155
+ logger.info(
156
+ f"Added {len(documents)} chunks from {file_path} to ChromaDB"
157
+ )
158
+ return True
159
+ except Exception as e:
160
+ logger.error(f"Error adding documents to ChromaDB: {str(e)}")
161
+ return False
162
+ return False
163
+
164
+ except Exception as e:
165
+ logger.error(f"Error processing file {file_path}: {str(e)}")
166
+ return False
167
+
168
+
169
+ def ingest():
170
+ try:
171
+ # Get client and embedding function together
172
+ chroma_client, embedding_function = get_chroma_client()
173
+ if not chroma_client or not embedding_function:
174
+ logger.error("Failed to initialize ChromaDB with embeddings")
175
+ return False
176
+
177
+ collection = create_chroma_collection(chroma_client, embedding_function)
178
+ if not collection:
179
+ logger.error("Failed to create or get ChromaDB collection")
180
+ return False
181
+
182
+ logger.info(f"Collection ready: {collection.name}")
183
+
184
+ unprocessed_dir = "docs/unprocessed"
185
+ processed_dir = "docs/processed"
186
+
187
+ if not os.path.exists(unprocessed_dir):
188
+ logger.error(f"Directory not found: {unprocessed_dir}")
189
+ return False
190
+
191
+ for file in os.listdir(unprocessed_dir):
192
+ file_path = os.path.join(unprocessed_dir, file)
193
+
194
+ if not os.path.isfile(file_path) or not file.lower().endswith(".pdf"):
195
+ continue
196
+
197
+ logger.info(f"Processing file: {file_path}")
198
+
199
+ if process_file(file_path, collection):
200
+ if not move_processed_file(file_path, processed_dir):
201
+ logger.error(f"Failed to move processed file: {file_path}")
202
+ else:
203
+ logger.error(f"Failed to process file: {file_path}")
204
+
205
+ logger.info("Processing completed")
206
+ return True
207
+
208
+ except Exception as e:
209
+ logger.error(f"Fatal error in ingestion: {str(e)}")
210
+ logger.exception("Detailed stack trace:")
211
+ return False
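A minimal sketch of running ingestion outside the Streamlit UI, assuming PDFs are already in docs/unprocessed and OPENAI_API_KEY is set:

from chroma_operations.ingestion import ingest

# Extracts each PDF in docs/unprocessed, chunks it, embeds the chunks with
# text-embedding-3-small, writes them to ./chroma_db, then moves the PDF to docs/processed.
if ingest():
    print("Ingestion finished")
else:
    print("Ingestion failed - see certification_processing.log")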
chroma_operations/pdf_processing.py ADDED
@@ -0,0 +1,52 @@
1
+ import pdfplumber
2
+ import logging
3
+ from typing import List, Union, Tuple
4
+ import os
5
+
6
+
7
+ # Set up logging
8
+ logging.basicConfig(level=logging.INFO)
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ def extract_pdf_content(pdf_path: str) -> List[str]:
13
+ """
14
+ Extract text and tables from PDF in their natural reading order.
15
+ Simplified version without positional processing.
16
+
17
+ Args:
18
+ pdf_path (str): Path to the PDF file
19
+
20
+ Returns:
21
+ List[str]: List of extracted content chunks (text and tables)
22
+ """
23
+ if not os.path.exists(pdf_path):
24
+ logger.error(f"PDF file not found: {pdf_path}")
25
+ return []
26
+
27
+ try:
28
+ with pdfplumber.open(pdf_path) as pdf:
29
+ content = []
30
+
31
+ for page in pdf.pages:
32
+ # First extract tables
33
+ tables = page.extract_tables()
34
+ for table in tables:
35
+ if table:
36
+ # Convert table to string representation
37
+ table_str = "\n".join(
38
+ ["\t".join(str(cell) for cell in row) for row in table]
39
+ )
40
+ content.append(f"[TABLE]\n{table_str}\n[/TABLE]")
41
+
42
+ # Then extract regular text
43
+ text = page.extract_text()
44
+ if text and text.strip():
45
+ content.append(text.strip())
46
+
47
+ logger.info(f"Successfully extracted content from {pdf_path}")
48
+ return content
49
+
50
+ except Exception as e:
51
+ logger.error(f"Error processing {pdf_path}: {str(e)}")
52
+ return []
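A short sketch of extract_pdf_content on its own; the path below is a placeholder:

from chroma_operations.pdf_processing import extract_pdf_content

blocks = extract_pdf_content("docs/unprocessed/example.pdf")  # placeholder path
for block in blocks:
    # Tables come back wrapped in [TABLE] ... [/TABLE]; everything else is plain page text
    kind = "table" if block.startswith("[TABLE]") else "text"
    print(kind, len(block), "chars")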
chroma_operations/retrieve.py ADDED
@@ -0,0 +1,49 @@
1
+ import os
2
+ import logging
3
+ from typing import List, Optional
4
+ import chromadb
5
+ from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
6
+ from dotenv import load_dotenv
7
+
8
+ # Load environment variables
9
+ load_dotenv()
10
+
11
+ # Setup logging
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ def search_similar_chunks(
17
+ query_text: str,
18
+ document_name: str,
19
+ collection_name: str = "rag_collection",
20
+ top_k: int = 5,
21
+ ):
22
+ """Search for top-k chunks similar to query_text within a specific document (source_file)."""
23
+ try:
24
+ # Initialize embedding function and Chroma client
25
+ embedding_function = OpenAIEmbeddingFunction(
26
+ api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
27
+ )
28
+ client = chromadb.PersistentClient(path="./chroma_db")
29
+
30
+ # Load the collection
31
+ collection = client.get_collection(
32
+ name=collection_name, embedding_function=embedding_function
33
+ )
34
+
35
+ # Query similar documents filtered by document_name
36
+ results = collection.query(
37
+ query_texts=[query_text],
38
+ n_results=top_k,
39
+ where={"source_file": document_name},
40
+ )
41
+
42
+ documents = results.get("documents", [[]])[0]
43
+ metadatas = results.get("metadatas", [[]])[0]
44
+
45
+ return documents
46
+
47
+ except Exception as e:
48
+ logger.error(f"Similarity search failed: {str(e)}")
49
+ return []
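A retrieval-only sketch, assuming OPENAI_API_KEY is set and the document was ingested earlier under the placeholder base name used here:

from chroma_operations.retrieve import search_similar_chunks

chunks = search_similar_chunks(
    query_text="first-line treatment for asthma in children",
    document_name="pediatric_asthma",  # placeholder base name (PDF name without .pdf)
    top_k=3,
)
for i, chunk in enumerate(chunks, 1):
    print(f"[{i}] {chunk[:120]}")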
llms.py ADDED
@@ -0,0 +1,114 @@
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from langchain.prompts import PromptTemplate
4
+ from langchain_groq import ChatGroq
5
+ from typing import Literal
6
+
7
+ # Load environment variables
8
+ load_dotenv()
9
+
10
+
11
+ # Initialize LLMs
12
+ def initialize_llms():
13
+ """Initialize and return the LLM instances"""
14
+ groq_api_key = os.getenv("GROQ_API_KEY")
15
+
16
+ return {
17
+ "llm": ChatGroq(
18
+ temperature=0.1, model="llama-3.3-70b-versatile", api_key=groq_api_key
19
+ ),
20
+ "step_back_llm": ChatGroq(
21
+ temperature=0, model="Gemma2-9B-IT", api_key=groq_api_key
22
+ ),
23
+ }
24
+
25
+
26
+ # Query refinement
27
+ def refine_query(query: str, llm: ChatGroq) -> str:
28
+ """Enhance pediatric medicine queries for better retrieval while preserving clinical intent"""
29
+ template = """
30
+ You are a medical language expert. Your task is to improve the following user question by:
31
+
32
+ - Correcting any grammatical or spelling errors
33
+ - Clarifying vague or ambiguous wording
34
+ - Improving sentence structure for readability and precision
35
+ - Maintaining the original meaning and clinical focus
36
+
37
+ Do not add new information. Do not expand abbreviations unless they are unclear. Do not include any commentary or explanation.
38
+
39
+ Original query: {original_query}
40
+
41
+ Improved medical question:
42
+ """
43
+
44
+ prompt = PromptTemplate(input_variables=["original_query"], template=template)
45
+
46
+ chain = prompt | llm
47
+ return chain.invoke({"original_query": query}).content
48
+
49
+
50
+ def query_to_retrieve(query, llm):
51
+ """Convert a query to a format suitable for retrieval"""
52
+ template = """
53
+ You are an expert in pediatric medical information retrieval.
54
+
55
+ Your task is to rewrite the following question into a single, concise sentence containing only the most relevant medical and pediatric concepts. This sentence will be used for semantic search in a vector database.
56
+
57
+ Instructions:
58
+ - Include only the core clinical focus (conditions, symptoms, treatments, procedures).
59
+ - Mention pediatric-specific entities if relevant (e.g., age group, child-specific medication).
60
+ - Remove all conversational language and filler.
61
+ - Preserve the original intent.
62
+ - Output only one clean, search-optimized sentence.
63
+
64
+ Original query: {original_query}
65
+
66
+ Search-ready query:
67
+ """
68
+
69
+ prompt = PromptTemplate(input_variables=["original_query"], template=template)
70
+
71
+ chain = prompt | llm
72
+ return chain.invoke({"original_query": query}).content
73
+
74
+
75
+ def answer_query_with_chunks(
76
+ query: str,
77
+ retrieved_docs,
78
+ llm: ChatGroq,
79
+ ) -> str:
80
+ try:
81
+ # Refine the user question before answering
82
+ query_improved = refine_query(query, llm)
83
+
84
+ if not retrieved_docs:
85
+ return "Sorry, no relevant medical information was found."
86
+
87
+ # Construct context for the LLM
88
+ context = "\n\n".join(retrieved_docs)
89
+
90
+ system_prompt = """
91
+ You are a pediatric medical assistant.
92
+
93
+ Based only on the provided context, answer the user's question concisely and accurately but with the necessary explanation.
94
+ If the answer is not present in the context, say: "The answer is not available in the current documents."
95
+
96
+ Context:
97
+ {context}
98
+
99
+ User question:
100
+ {query}
101
+
102
+ Answer:
103
+ """
104
+
105
+ prompt = PromptTemplate(
106
+ input_variables=["context", "query"],
107
+ template=system_prompt,
108
+ )
109
+
110
+ chain = prompt | llm
111
+ return chain.invoke({"context": context, "query": query_improved}).content
112
+
113
+ except Exception as e:
114
+ return f"An error occurred while answering the query: {str(e)}"
rag.py ADDED
@@ -0,0 +1,36 @@
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from llms import initialize_llms, answer_query_with_chunks, query_to_retrieve
4
+ from chroma_operations.retrieve import search_similar_chunks
5
+
6
+ # Initialize FastAPI
7
+ app = FastAPI()
8
+
9
+
10
+ # Define request model
11
+ class RAGRequest(BaseModel):
12
+ query_text: str
13
+ file_name: str
14
+ collection_name: str = "rag_collection"
15
+
16
+ # Load LLM once at startup
17
+ llms = initialize_llms()
18
+ llm = llms["llm"]
19
+
20
+
21
+ @app.post("/ask")
22
+ def ask_question(query_text, file_name, collection_name="rag_collection"):
23
+ try:
24
+ query_search = query_to_retrieve(query_text, llm)
25
+ retrieved_docs = search_similar_chunks(
26
+ query_search, file_name, collection_name
27
+ )
28
+
29
+ if not retrieved_docs:
30
+ raise HTTPException(status_code=404, detail="No matching documents found.")
31
+
32
+ answer = answer_query_with_chunks(query_text, retrieved_docs, llm)
33
+ return {"answer": answer, "chunks": retrieved_docs}
34
+
35
+ except Exception as e:
36
+ raise HTTPException(status_code=500, detail=str(e))
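Because ask_question declares plain parameters, FastAPI exposes them as query parameters on POST /ask. A client sketch, assuming the API is served locally with uvicorn rag:app --port 8000 and the document name is a placeholder:

import requests

resp = requests.post(
    "http://localhost:8000/ask",
    params={
        "query_text": "What are the treatment steps for diabetes?",
        "file_name": "diabetes_guidelines",  # placeholder base name of a processed PDF
    },
)
resp.raise_for_status()
print(resp.json()["answer"])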
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ sentence-transformers
2
+ pdfplumber
3
+ langchain
4
+ langchain-community
5
+ fastapi
6
+ openai
7
+ streamlit
8
+ requests
9
+ chromadb
10
+ langchain-groq
11
+ python-dotenv
semantic_chunking.py ADDED
@@ -0,0 +1,93 @@
1
+ import numpy as np
2
+ from sentence_transformers import SentenceTransformer
3
+
4
+ embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
5
+
6
+
7
+ def hybrid_split(text: str, max_len: int = 1024) -> list[str]:
8
+ """
9
+ Split text into chunks respecting sentence boundaries when possible,
10
+ keeping each chunk under max_len characters.
11
+
12
+ Args:
13
+ text: The text to split
14
+ max_len: Maximum length for each chunk
15
+
16
+ Returns:
17
+ List of text chunks
18
+ """
19
+ # Normalize text
20
+ text = text.replace("\r", "").replace("\n", " ").strip()
21
+
22
+ # Extract sentences (more robust regex for sentence detection)
23
+ import re
24
+
25
+ sentences = re.split(r"(?<=[.!?])\s+", text)
26
+
27
+ chunks = []
28
+ current_chunk = ""
29
+
30
+ for sentence in sentences:
31
+ if len(sentence) > max_len:
32
+ # First add the current chunk if it exists
33
+ chunks.append(sentence)
34
+
35
+ # Normal case - see if adding the sentence exceeds max_len
36
+ elif len(current_chunk) + len(sentence) + 1 > max_len:
37
+ # Add the current chunk and start a new one
38
+ chunks.append(current_chunk)
39
+ current_chunk = ""
40
+ else:
41
+ # Add to the current chunk
42
+ if current_chunk:
43
+ current_chunk += " " + sentence
44
+ else:
45
+ current_chunk = sentence
46
+
47
+ if current_chunk:
48
+ chunks.append(current_chunk)
49
+
50
+ return chunks
51
+
52
+
53
+ def cosine_similarity(vec1, vec2):
54
+ """Calculate the cosine similarity between two vectors."""
55
+ dot_product = np.dot(vec1, vec2)
56
+ norm_vec1 = np.linalg.norm(vec1)
57
+ norm_vec2 = np.linalg.norm(vec2)
58
+ return dot_product / (norm_vec1 * norm_vec2)
59
+
60
+
61
+ def get_embedding(text):
62
+ """Generate an embedding using SBERT."""
63
+ return embedding_model.encode(text, convert_to_numpy=True)
64
+
65
+
66
+ def semantic_chunking(text, threshold=0.75, max_chunk_size=8191):
67
+ """
68
+ Splits text into semantic chunks based on sentence similarity.
69
+ - threshold: Lower = more splits, Higher = fewer splits
70
+ - max_chunk_size: Maximum size of each chunk in characters
71
+ """
72
+ text = text.replace("\n", " ").replace("\r", " ").strip()
73
+ sentences = hybrid_split(text)
+ if not sentences:
+ return []
74
+ embeddings = [get_embedding(sent) for sent in sentences]
75
+
76
+ chunks = []
77
+ current_chunk = [sentences[0]]
78
+
79
+ for i in range(1, len(sentences)):
80
+ sim = cosine_similarity(embeddings[i - 1], embeddings[i])
81
+ if (
82
+ sim < threshold
83
+ or len(" ".join(current_chunk + [sentences[i]])) > max_chunk_size
84
+ ):
85
+ chunks.append(" ".join(current_chunk))
86
+ current_chunk = [sentences[i]]
87
+ else:
88
+ current_chunk.append(sentences[i])
89
+
90
+ if current_chunk:
91
+ chunks.append(" ".join(current_chunk))
92
+
93
+ return chunks
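A quick sketch of the chunker on a toy passage; a lower threshold splits more aggressively:

from semantic_chunking import semantic_chunking

text = (
    "Asthma is a chronic airway disease. Inhaled corticosteroids are the mainstay "
    "of controller therapy. Unrelatedly, the clinic opens at eight in the morning."
)
for chunk in semantic_chunking(text, threshold=0.75):
    print("-", chunk)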