Spaces: Running
Upload 12 files
- .gitignore +174 -0
- README.md +1 -10
- app.py +277 -0
- certification_processing.log +0 -0
- chroma_operations/delete_chroma.py +152 -0
- chroma_operations/ingestion.py +211 -0
- chroma_operations/pdf_processing.py +52 -0
- chroma_operations/retrieve.py +49 -0
- llms.py +114 -0
- rag.py +36 -0
- requirements.txt +10 -0
- semantic_chunking.py +93 -0
.gitignore
ADDED
@@ -0,0 +1,174 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc
README.md
CHANGED
@@ -1,10 +1 @@
Removed (Hugging Face Space configuration):
---
title: Rag Medical
emoji: 🔥
colorFrom: purple
colorTo: gray
sdk: static
pinned: false
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

Added:
# albariqi-rag
app.py
ADDED
@@ -0,0 +1,277 @@
import streamlit as st
import os
from chroma_operations.ingestion import ingest
from rag import ask_question
from chroma_operations.delete_chroma import remove_from_chroma
import json


# Get list of processed PDF file names without `.pdf`
def get_processed_file_names(folder_path="docs/processed"):
    try:
        files = os.listdir(folder_path)
        pdfs = [f[:-4] for f in files if f.endswith(".pdf")]
        return sorted(pdfs)
    except Exception as e:
        st.error(f"Error reading folder: {e}")
        return []


st.set_page_config(page_title="RAG Demo", layout="centered")
st.title("📄 Retrieval-Augmented Generation (RAG) Demo")

# Create tabs for different functionalities
tab1, tab2, tab3 = st.tabs(["Ask Questions", "Upload Documents", "Manage Files"])

with tab1:
    st.markdown("Ask a question based on a specific processed document.")

    # Add a refresh button
    if st.button("🔄 Refresh Document List"):
        st.success("Document list refreshed!")
        # No need to do anything else - Streamlit will rerun and refresh the list

    # Fetch available document names
    doc_names = get_processed_file_names()

    # Select box with search
    file_name = st.selectbox(
        "📁 Select a Document",
        doc_names,
        index=0 if doc_names else None,
        placeholder="Type to search...",
    )

    # User question
    query_text = st.text_input(
        "🧠 Your Question",
        placeholder="e.g. What are the treatment steps for diabetes?",
    )

    if st.button("Ask"):
        if not query_text or not file_name:
            st.warning("Please fill in both the question and select a document.")
        else:
            with st.spinner("Processing..."):
                try:
                    response = ask_question(query_text, file_name)

                    if response:
                        st.success("✅ Answer:")
                        st.markdown(f"**{response['answer']}**")

                        with st.expander("📚 Retrieved Chunks"):
                            for i, chunk in enumerate(response["chunks"]):
                                st.markdown(f"**Chunk {i+1}:** {chunk}")
                    else:
                        st.error("Error in the answer")

                except Exception as e:
                    st.error(f"Failed to connect to the backend: {e}")

with tab2:
    st.markdown("Upload new documents to be processed for the RAG system.")

    # Ensure directories exist
    os.makedirs("docs/unprocessed", exist_ok=True)
    os.makedirs("docs/processed", exist_ok=True)

    # File uploader
    uploaded_file = st.file_uploader("Upload PDF Document", type=["pdf"])

    if uploaded_file is not None:
        st.info(f"File '{uploaded_file.name}' ready for upload")

        # Create columns for buttons
        col1, col2 = st.columns(2)

        # Upload button
        if col1.button("Upload to System"):
            try:
                # Save the uploaded file to the docs/unprocessed directory
                with open(os.path.join("docs/unprocessed", uploaded_file.name), "wb") as f:
                    f.write(uploaded_file.getbuffer())
                st.success(f"File '{uploaded_file.name}' saved to docs/unprocessed/")
            except Exception as e:
                st.error(f"Error saving file: {e}")

        # Ingest button
        if col2.button("Process Document"):
            try:
                with st.spinner("Processing document... This may take a while."):
                    # Call the ingestion script
                    result = ingest()

                    if result:
                        st.success("Document processed successfully!")
                        # Refresh the list of available documents
                        doc_names = get_processed_file_names()
                    else:
                        st.error("Error processing document")
            except Exception as e:
                st.error(f"Error running ingestion process: {e}")

    # Display list of files in unprocessed folder
    st.subheader("Unprocessed Documents")
    try:
        unprocessed_files = os.listdir("docs/unprocessed")
        if unprocessed_files:
            for file in unprocessed_files:
                st.text(f"• {file}")
        else:
            st.info("No unprocessed documents.")
    except Exception as e:
        st.error(f"Error reading unprocessed folder: {e}")

    # Display list of processed files
    st.subheader("Processed Documents")
    try:
        processed_files = os.listdir("docs/processed")
        processed_files = [f for f in processed_files if f.endswith(".pdf")]
        if processed_files:
            for file in processed_files:
                st.text(f"• {file}")
        else:
            st.info("No processed documents.")
    except Exception as e:
        st.error(f"Error reading processed folder: {e}")

with tab3:
    st.markdown(
        "Manage your documents by deleting files from processed or unprocessed folders."
    )

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Delete Unprocessed Documents")
        try:
            unprocessed_files = os.listdir("docs/unprocessed")
            if unprocessed_files:
                file_to_delete_unprocessed = st.selectbox(
                    "Select file to delete from unprocessed folder",
                    unprocessed_files,
                    key="unprocessed_select",
                )

                if st.button("Delete Unprocessed File", key="delete_unprocessed"):
                    try:
                        file_path = os.path.join(
                            "docs/unprocessed", file_to_delete_unprocessed
                        )
                        os.remove(file_path)
                        st.success(f"Successfully deleted {file_to_delete_unprocessed}")
                        # Force refresh the app to show the updated file list
                        st.rerun()
                    except Exception as e:
                        st.error(f"Error deleting file: {e}")
            else:
                st.info("No unprocessed documents to delete.")
        except Exception as e:
            st.error(f"Error accessing unprocessed folder: {e}")

    with col2:
        st.subheader("Delete Processed Documents")
        try:
            processed_files = [
                f for f in os.listdir("docs/processed") if f.endswith(".pdf")
            ]
            if processed_files:
                file_to_delete_processed = st.selectbox(
                    "Select file to delete from processed folder",
                    processed_files,
                    key="processed_select",
                )

                if st.button("Delete Processed File", key="delete_processed"):
                    try:
                        # Delete the PDF file
                        pdf_path = os.path.join(
                            "docs/processed", file_to_delete_processed
                        )
                        os.remove(pdf_path)

                        # Also delete the corresponding vector store if it exists
                        base_name = file_to_delete_processed[:-4]  # Remove .pdf extension
                        vector_store_path = os.path.join(
                            "docs/processed", f"{base_name}.faiss"
                        )
                        if os.path.exists(vector_store_path):
                            os.remove(vector_store_path)

                        # Delete metadata file if it exists
                        metadata_path = os.path.join(
                            "docs/processed", f"{base_name}_metadata.json"
                        )
                        if os.path.exists(metadata_path):
                            os.remove(metadata_path)

                        # Remove document from Chroma DB
                        with st.spinner("Removing document from vector database..."):
                            remove_from_chroma(base_name)

                        st.success(
                            f"Successfully deleted {file_to_delete_processed} and related files"
                        )
                        # Force refresh the app to show the updated file list
                        st.rerun()
                    except Exception as e:
                        st.error(f"Error deleting file: {e}")
            else:
                st.info("No processed documents to delete.")
        except Exception as e:
            st.error(f"Error accessing processed folder: {e}")

    # Add a separator
    st.markdown("---")

    # Delete all files section
    st.subheader("Bulk Operations")
    col3, col4 = st.columns(2)

    with col3:
        if st.button(
            "Delete ALL Unprocessed Files", type="primary", use_container_width=True
        ):
            try:
                unprocessed_files = os.listdir("docs/unprocessed")
                if unprocessed_files:
                    for file in unprocessed_files:
                        os.remove(os.path.join("docs/unprocessed", file))
                    st.success(
                        f"Successfully deleted all {len(unprocessed_files)} unprocessed files"
                    )
                    # Force refresh
                    st.rerun()
                else:
                    st.info("No files to delete.")
            except Exception as e:
                st.error(f"Error during bulk deletion: {e}")

    with col4:
        if st.button(
            "Delete ALL Processed Files", type="primary", use_container_width=True
        ):
            try:
                processed_files = os.listdir("docs/processed")
                if processed_files:
                    for file in processed_files:
                        file_path = os.path.join("docs/processed", file)
                        os.remove(file_path)

                        # If it's a PDF file, also remove from Chroma
                        if file.endswith(".pdf"):
                            base_name = file[:-4]  # Remove .pdf extension
                            remove_from_chroma(base_name)

                    st.success(
                        f"Successfully deleted all {len(processed_files)} processed files"
                    )
                    # Force refresh
                    st.rerun()
                else:
                    st.info("No files to delete.")
            except Exception as e:
                st.error(f"Error during bulk deletion: {e}")
certification_processing.log
ADDED
File without changes
chroma_operations/delete_chroma.py
ADDED
@@ -0,0 +1,152 @@
import os
import logging
from typing import Optional
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dotenv import load_dotenv
import shutil

# Load environment variables
load_dotenv()

# Setup logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def remove_from_chroma(
    document_name: str, collection_name: str = "rag_collection"
) -> bool:
    """
    Remove a document and its chunks from the Chroma vector database.

    Args:
        document_name (str): The base name of the document (without .pdf extension)
        collection_name (str): Name of the collection in ChromaDB (default: "rag_collection")

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        logger.info(f"Attempting to remove document '{document_name}' from Chroma DB")

        # Check if Chroma DB exists
        chroma_path = "./chroma_db"
        if not os.path.exists(chroma_path):
            logger.warning("Chroma DB directory does not exist")
            return False

        # Initialize embedding function and Chroma client
        embedding_function = OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
        )

        # Connect to the persistent client
        client = chromadb.PersistentClient(path=chroma_path)

        # Get the collection
        try:
            collection = client.get_collection(
                name=collection_name, embedding_function=embedding_function
            )
        except Exception as e:
            logger.error(f"Collection '{collection_name}' not found: {e}")
            return False

        # Delete documents where source_file matches the document_name
        try:
            # First, get the IDs of chunks belonging to this document
            results = collection.get(where={"source_file": document_name})

            ids_to_delete = results.get("ids", [])

            if not ids_to_delete:
                logger.warning(
                    f"No chunks found for document '{document_name}' in collection"
                )
                return True  # Nothing to delete, so consider it successful

            # Delete chunks by IDs
            collection.delete(ids=ids_to_delete)

            logger.info(
                f"Successfully deleted {len(ids_to_delete)} chunks for '{document_name}' from ChromaDB"
            )
            return True

        except Exception as e:
            logger.error(f"Error deleting chunks from collection: {e}")
            return False

    except Exception as e:
        logger.error(f"Error removing document from Chroma DB: {e}")
        return False


def delete_all_from_chroma(collection_name: str = "rag_collection") -> bool:
    """
    Delete all documents from the specified ChromaDB collection.

    Args:
        collection_name (str): Name of the collection in ChromaDB (default: "rag_collection")

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # Initialize embedding function and Chroma client
        embedding_function = OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
        )

        # Connect to the persistent client
        client = chromadb.PersistentClient(path="./chroma_db")

        try:
            # Get the collection
            collection = client.get_collection(
                name=collection_name, embedding_function=embedding_function
            )

            # Delete all documents in the collection
            collection.delete()
            logger.info(
                f"Successfully deleted all documents from collection '{collection_name}'"
            )
            return True

        except Exception as e:
            logger.error(f"Error accessing or deleting collection: {e}")
            return False

    except Exception as e:
        logger.error(f"Error connecting to Chroma DB: {e}")
        return False


def reset_chroma_db() -> bool:
    """
    Reset the entire Chroma database by deleting and recreating the directory.

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        chroma_path = "./chroma_db"
        if os.path.exists(chroma_path):
            # Delete the entire Chroma directory
            shutil.rmtree(chroma_path)
            logger.info("Successfully deleted entire Chroma DB directory")

            # Create an empty directory
            os.makedirs(chroma_path, exist_ok=True)
            logger.info("Created fresh Chroma DB directory")
            return True
        else:
            logger.warning("Chroma DB directory does not exist")
            return False
    except Exception as e:
        logger.error(f"Error resetting Chroma DB: {e}")
        return False
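A minimal usage sketch for the helpers above (not part of the commit; the document name is a hypothetical placeholder):

from chroma_operations.delete_chroma import remove_from_chroma, reset_chroma_db

# Remove the chunks of one ingested PDF by its base name (no .pdf extension).
# "example_guideline" is a placeholder for a previously ingested document.
if remove_from_chroma("example_guideline"):
    print("Document removed from the vector store")

# Or wipe and recreate the ./chroma_db directory entirely.
reset_chroma_db()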
chroma_operations/ingestion.py
ADDED
@@ -0,0 +1,211 @@
import os
import shutil
import logging
from datetime import datetime
import hashlib
from typing import List, Optional
import chromadb
import openai
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dotenv import load_dotenv

load_dotenv()


from semantic_chunking import semantic_chunking
from chroma_operations.pdf_processing import extract_pdf_content

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("certification_processing.log"),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)


def generate_chunk_id(file: str, chunk: str, position: int) -> str:
    unique_str = f"{file}_{position}_{chunk}"
    return hashlib.sha256(unique_str.encode()).hexdigest()


def move_processed_file(source_path: str, destination_dir: str) -> bool:
    """Move a processed file to the destination directory."""
    try:
        if not os.path.exists(destination_dir):
            os.makedirs(destination_dir)
        destination_path = os.path.join(destination_dir, os.path.basename(source_path))
        os.rename(source_path, destination_path)
        return True
    except Exception as e:
        logger.error(f"Error moving file {source_path}: {str(e)}")
        return False


def get_chroma_client():
    """Initialize ChromaDB client with OpenAI embeddings."""
    try:
        # Initialize embedding function
        embedding_function = OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
        )

        client = chromadb.PersistentClient(path="./chroma_db")
        logger.info("Successfully connected to ChromaDB with OpenAI embeddings")
        return client, embedding_function
    except Exception as e:
        logger.error(f"Error connecting to ChromaDB: {str(e)}")
        logger.exception("Detailed stack trace:")
        return None, None


def create_chroma_collection(
    client, embedding_function, collection_name="rag_collection"
):
    """Create or get a ChromaDB collection with proper embedding function."""
    try:
        collection = client.get_or_create_collection(
            name=collection_name,
            embedding_function=embedding_function,
            metadata={"description": "medical documents"},
        )
        logger.info(f"Initialized collection: {collection_name}")
        return collection
    except Exception as e:
        logger.error(f"Error creating ChromaDB collection: {str(e)}")
        logger.exception("Detailed stack trace:")
        return None


def process_file(file_path: str, collection) -> bool:
    """Process a file and add to ChromaDB collection."""
    try:
        if file_path.endswith(".pdf"):
            contents = extract_pdf_content(file_path)
        else:
            logger.warning(f"Skipping unsupported file type: {file_path}")
            return False

        if not contents:
            logger.warning(f"No content extracted from file: {file_path}")
            return False

        chunks: List[str] = []
        for content in contents:
            if not content.strip():
                continue
            if "\t" in content or "[TABLE]" in content:
                chunks.append(content)
            else:
                try:
                    chunks.extend(semantic_chunking(content))
                except Exception as e:
                    logger.error(
                        f"Error during chunking for file {file_path}: {str(e)}"
                    )
                    continue

        if not chunks:
            logger.warning(f"No valid chunks created for file: {file_path}")
            return False

        documents = []
        metadatas = []
        ids = []

        source_file = os.path.basename(file_path)
        if source_file.endswith(".pdf"):
            source_file = source_file[:-4].strip()

        for i, chunk in enumerate(chunks):
            if not chunk.strip():
                continue

            try:
                chunk_id = generate_chunk_id(file_path, chunk, i)
                documents.append(chunk)
                metadatas.append(
                    {
                        "chunk_id": chunk_id,
                        "source_file": source_file,
                        "position": i,
                        "processed_at": datetime.now().isoformat(),
                    }
                )
                ids.append(chunk_id)

            except Exception as e:
                logger.error(f"Error processing chunk from file {file_path}: {str(e)}")
                continue

        if documents:
            try:
                # Chroma will automatically generate embeddings using the collection's embedding function
                collection.add(documents=documents, metadatas=metadatas, ids=ids)
                logger.info(
                    f"Added {len(documents)} chunks from {file_path} to ChromaDB"
                )
                return True
            except Exception as e:
                logger.error(f"Error adding documents to ChromaDB: {str(e)}")
                return False
        return False

    except Exception as e:
        logger.error(f"Error processing file {file_path}: {str(e)}")
        return False


def ingest():
    try:
        # Get client and embedding function together
        chroma_client, embedding_function = get_chroma_client()
        if not chroma_client or not embedding_function:
            logger.error("Failed to initialize ChromaDB with embeddings")
            return False

        collection = create_chroma_collection(chroma_client, embedding_function)
        if not collection:
            logger.error("Failed to create or get ChromaDB collection")
            return False

        logger.info(f"Collection ready: {collection.name}")

        unprocessed_dir = "docs/unprocessed"
        processed_dir = "docs/processed"

        if not os.path.exists(unprocessed_dir):
            logger.error(f"Directory not found: {unprocessed_dir}")
            return False

        for file in os.listdir(unprocessed_dir):
            file_path = os.path.join(unprocessed_dir, file)

            if not os.path.isfile(file_path) or not file.lower().endswith(".pdf"):
                continue

            logger.info(f"Processing file: {file_path}")

            if process_file(file_path, collection):
                if not move_processed_file(file_path, processed_dir):
                    logger.error(f"Failed to move processed file: {file_path}")
            else:
                logger.error(f"Failed to process file: {file_path}")

        logger.info("Processing completed")
        return True

    except Exception as e:
        logger.error(f"Fatal error in ingestion: {str(e)}")
        logger.exception("Detailed stack trace:")
        return False
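A minimal sketch of how the ingestion pipeline is driven; this mirrors what app.py does behind the "Process Document" button, and running it as a standalone script is an assumption, not part of the commit:

# Assumes OPENAI_API_KEY is available (e.g. via a .env file) and that PDF files
# have been placed in docs/unprocessed/. Successfully processed files are moved
# to docs/processed/ and their chunks are embedded into ./chroma_db.
from chroma_operations.ingestion import ingest

if __name__ == "__main__":
    ok = ingest()
    print("Ingestion finished" if ok else "Ingestion failed")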
chroma_operations/pdf_processing.py
ADDED
@@ -0,0 +1,52 @@
import pdfplumber
import logging
from typing import List, Union, Tuple
import os


# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def extract_pdf_content(pdf_path: str) -> List[str]:
    """
    Extract text and tables from PDF in their natural reading order.
    Simplified version without positional processing.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        List[str]: List of extracted content chunks (text and tables)
    """
    if not os.path.exists(pdf_path):
        logger.error(f"PDF file not found: {pdf_path}")
        return []

    try:
        with pdfplumber.open(pdf_path) as pdf:
            content = []

            for page in pdf.pages:
                # First extract tables
                tables = page.extract_tables()
                for table in tables:
                    if table:
                        # Convert table to string representation
                        table_str = "\n".join(
                            ["\t".join(str(cell) for cell in row) for row in table]
                        )
                        content.append(f"[TABLE]\n{table_str}\n[/TABLE]")

                # Then extract regular text
                text = page.extract_text()
                if text and text.strip():
                    content.append(text.strip())

            logger.info(f"Successfully extracted content from {pdf_path}")
            return content

    except Exception as e:
        logger.error(f"Error processing {pdf_path}: {str(e)}")
        return []
chroma_operations/retrieve.py
ADDED
@@ -0,0 +1,49 @@
import os
import logging
from typing import List, Optional
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def search_similar_chunks(
    query_text: str,
    document_name: str,
    collection_name: str = "rag_collection",
    top_k: int = 5,
):
    """Search for top-k chunks similar to query_text within a specific document (source_file)."""
    try:
        # Initialize embedding function and Chroma client
        embedding_function = OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
        )
        client = chromadb.PersistentClient(path="./chroma_db")

        # Load the collection
        collection = client.get_collection(
            name=collection_name, embedding_function=embedding_function
        )

        # Query similar documents filtered by document_name
        results = collection.query(
            query_texts=[query_text],
            n_results=top_k,
            where={"source_file": document_name},
        )

        documents = results.get("documents", [[]])[0]
        metadatas = results.get("metadatas", [[]])[0]

        return documents

    except Exception as e:
        logger.error(f"Similarity search failed: {str(e)}")
        return []
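A small usage sketch for search_similar_chunks (not part of the commit; the question and document name are hypothetical):

from chroma_operations.retrieve import search_similar_chunks

# Returns up to top_k chunk texts from the named document, or [] on error.
chunks = search_similar_chunks(
    query_text="first-line treatment for pediatric asthma",
    document_name="asthma_guideline",  # base file name without .pdf
    top_k=5,
)
for i, chunk in enumerate(chunks, start=1):
    print(f"Chunk {i}: {chunk[:80]}")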
llms.py
ADDED
@@ -0,0 +1,114 @@
import os
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from typing import Literal

# Load environment variables
load_dotenv()


# Initialize LLMs
def initialize_llms():
    """Initialize and return the LLM instances"""
    groq_api_key = os.getenv("GROQ_API_KEY")

    return {
        "llm": ChatGroq(
            temperature=0.1, model="llama-3.3-70b-versatile", api_key=groq_api_key
        ),
        "step_back_llm": ChatGroq(
            temperature=0, model="Gemma2-9B-IT", api_key=groq_api_key
        ),
    }


# Query refinement
def refine_query(query: str, llm: ChatGroq) -> str:
    """Enhance pediatric medicine queries for better retrieval while preserving clinical intent"""
    template = """
    You are a medical language expert. Your task is to improve the following user question by:

    - Correcting any grammatical or spelling errors
    - Clarifying vague or ambiguous wording
    - Improving sentence structure for readability and precision
    - Maintaining the original meaning and clinical focus

    Do not add new information. Do not expand abbreviations unless they are unclear. Do not include any commentary or explanation.

    Original query: {original_query}

    Improved medical question:
    """

    prompt = PromptTemplate(input_variables=["original_query"], template=template)

    chain = prompt | llm
    return chain.invoke({"original_query": query}).content


def query_to_retrieve(query, llm):
    """Convert a query to a format suitable for retrieval"""
    template = """
    You are an expert in pediatric medical information retrieval.

    Your task is to rewrite the following question into a single, concise sentence containing only the most relevant medical and pediatric concepts. This sentence will be used for semantic search in a vector database.

    Instructions:
    - Include only the core clinical focus (conditions, symptoms, treatments, procedures).
    - Mention pediatric-specific entities if relevant (e.g., age group, child-specific medication).
    - Remove all conversational language and filler.
    - Preserve the original intent.
    - Output only one clean, search-optimized sentence.

    Original query: {original_query}

    Search-ready query:
    """

    prompt = PromptTemplate(input_variables=["original_query"], template=template)

    chain = prompt | llm
    return chain.invoke({"original_query": query}).content


def answer_query_with_chunks(
    query: str,
    retrieved_docs,
    llm: ChatGroq,
) -> str:
    try:
        # Refine the query wording before answering
        query_improved = refine_query(query, llm)

        if not retrieved_docs:
            return "Sorry, no relevant medical information was found."

        # Construct context for the LLM
        context = "\n\n".join(retrieved_docs)

        system_prompt = """
    You are a pediatric medical assistant.

    Based only on the provided context, answer the user's question concisely and accurately but with the necessary explanation.
    If the answer is not present in the context, say: "The answer is not available in the current documents."

    Context:
    {context}

    User question:
    {query}

    Answer:
    """

        prompt = PromptTemplate(
            input_variables=["context", "query"],
            template=system_prompt,
        )

        chain = prompt | llm
        return chain.invoke({"context": context, "query": query_improved}).content

    except Exception as e:
        return f"An error occurred while answering the query: {str(e)}"
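A minimal sketch of how these helpers chain together; rag.py below wires them up the same way, and the question and chunk text here are placeholders:

from llms import initialize_llms, query_to_retrieve, answer_query_with_chunks

llms = initialize_llms()  # requires GROQ_API_KEY in the environment
llm = llms["llm"]

question = "what is the usual treatment for an ear infection in a toddler"
search_query = query_to_retrieve(question, llm)  # condensed, search-ready sentence

# In the real pipeline these chunks come from search_similar_chunks(search_query, ...).
retrieved_docs = ["<chunk text retrieved from ChromaDB would go here>"]
print(answer_query_with_chunks(question, retrieved_docs, llm))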
rag.py
ADDED
@@ -0,0 +1,36 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llms import initialize_llms, answer_query_with_chunks, query_to_retrieve
from chroma_operations.retrieve import search_similar_chunks

# Initialize FastAPI
app = FastAPI()


# Define request model
class RAGRequest(BaseModel):
    query_text: str
    file_name: str
    collection_name: str = "rag_collection"


# Load LLM once at startup
llms = initialize_llms()
llm = llms["llm"]


@app.post("/ask")
def ask_question(query_text, file_name, collection_name="rag_collection"):
    # Note: app.py also imports and calls this function directly (not over HTTP).
    try:
        query_search = query_to_retrieve(query_text, llm)
        retrieved_docs = search_similar_chunks(
            query_search, file_name, collection_name
        )

        if not retrieved_docs:
            raise HTTPException(status_code=404, detail="No matching documents found.")

        answer = answer_query_with_chunks(query_text, retrieved_docs, llm)
        return {"answer": answer, "chunks": retrieved_docs}

    except HTTPException:
        # Re-raise HTTP errors (e.g. the 404 above) instead of converting them to 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
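Because ask_question takes plain parameters, FastAPI exposes them as query parameters on POST /ask. A hedged client sketch, assuming the app is served locally (for example with "uvicorn rag:app --port 8000", which is not part of the commit):

import requests

# query_text and file_name are required query parameters; file_name is the
# processed document's base name without .pdf. Both values are examples.
resp = requests.post(
    "http://localhost:8000/ask",
    params={
        "query_text": "What are the treatment steps for diabetes?",
        "file_name": "diabetes_guideline",
    },
)
resp.raise_for_status()
print(resp.json()["answer"])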
requirements.txt
ADDED
@@ -0,0 +1,10 @@
sentence-transformers
pdfplumber
langchain
langchain-community
chroma
openai
streamlit
requests
chromadb
langchain-groq
semantic_chunking.py
ADDED
@@ -0,0 +1,93 @@
import re

import numpy as np
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


def hybrid_split(text: str, max_len: int = 1024) -> list[str]:
    """
    Split text into chunks respecting sentence boundaries when possible.

    Args:
        text: The text to split
        max_len: Maximum length for each chunk

    Returns:
        List of text chunks
    """
    # Normalize text
    text = text.replace("\r", "").replace("\n", " ").strip()

    # Extract sentences (simple regex for sentence detection)
    sentences = re.split(r"(?<=[.!?])\s+", text)

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(sentence) > max_len:
            # Oversized sentence: flush the current chunk first, then keep the
            # long sentence as its own chunk
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            chunks.append(sentence)

        # Normal case - see if adding the sentence exceeds max_len
        elif len(current_chunk) + len(sentence) + 1 > max_len:
            # Add the current chunk and start a new one with this sentence
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            # Add to the current chunk
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)

    return chunks


def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors."""
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)


def get_embedding(text):
    """Generate an embedding using SBERT."""
    return embedding_model.encode(text, convert_to_numpy=True)


def semantic_chunking(text, threshold=0.75, max_chunk_size=8191):
    """
    Splits text into semantic chunks based on sentence similarity.
    - threshold: Lower = more splits, Higher = fewer splits
    - max_chunk_size: Maximum size of each chunk in characters
    """
    text = text.replace("\n", " ").replace("\r", " ").strip()
    sentences = hybrid_split(text)
    embeddings = [get_embedding(sent) for sent in sentences]

    chunks = []
    current_chunk = [sentences[0]]

    for i in range(1, len(sentences)):
        sim = cosine_similarity(embeddings[i - 1], embeddings[i])
        if (
            sim < threshold
            or len(" ".join(current_chunk + [sentences[i]])) > max_chunk_size
        ):
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentences[i]]
        else:
            current_chunk.append(sentences[i])

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
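A short usage sketch for the chunker (not part of the commit; the sample text is made up):

from semantic_chunking import semantic_chunking

sample = (
    "Asthma is a chronic airway disease. Inhaled corticosteroids are a common "
    "controller therapy. The clinic waiting room was repainted last spring."
)

# Consecutive sentences stay in one chunk while their SBERT embeddings are similar;
# a similarity below the threshold (or an oversized chunk) starts a new chunk.
for i, chunk in enumerate(semantic_chunking(sample, threshold=0.75), start=1):
    print(i, chunk)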