Spaces: Build error

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the rest.
- .DS_Store +0 -0
- .cache/huggingface/.gitignore +1 -0
- .cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.lock +0 -0
- .cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.metadata +3 -0
- .env +4 -0
- .gitattributes +155 -0
- .github/workflows/update_space.yml +28 -0
- .gitignore +7 -0
- .streamlit/secrets.toml +3 -0
- Data/AC-Aids-for-Dogs_Canine-Periodontal-Disease.pdf +0 -0
- Data/cancer_and_cure__a_critical_analysis.27.pdf +0 -0
- Data/medical_oncology_handbook_june_2020_edition.pdf +0 -0
- DockerFile +20 -0
- MultimodalRAG.ipynb +0 -0
- MultimodalRAGUpdatedVersion.ipynb +0 -0
- README.md +125 -8
- Streaming.py +223 -0
- Streamingnewversion.py +244 -0
- __pycache__/app.cpython-310.pyc +0 -0
- __pycache__/clip_helpers.cpython-310.pyc +0 -0
- __pycache__/combinedmultimodal.cpython-310.pyc +0 -0
- __pycache__/imagebind.cpython-310.pyc +0 -0
- __pycache__/images.cpython-310.pyc +0 -0
- __pycache__/ingest.cpython-310.pyc +0 -0
- app.py +83 -0
- app1.py +119 -0
- combinedmultimodal.py +621 -0
- freeze +0 -0
- images.py +12 -0
- images/architecture.png +0 -0
- images/figure-1-1.jpg +0 -0
- images/figure-1-10.jpg +0 -0
- images/figure-1-11.jpg +0 -0
- images/figure-1-2.jpg +0 -0
- images/figure-1-3.jpg +0 -0
- images/figure-1-4.jpg +0 -0
- images/figure-1-5.jpg +0 -0
- images/figure-1-6.jpg +0 -0
- images/figure-1-7.jpg +0 -0
- images/figure-1-8.jpg +0 -0
- images/figure-1-9.jpg +0 -0
- images/multimodal.png +3 -0
- images1/figure-1-1.jpg +0 -0
- images1/figure-1-10.jpg +0 -0
- images1/figure-1-11.jpg +0 -0
- images1/figure-1-2.jpg +0 -0
- images1/figure-1-3.jpg +0 -0
- images1/figure-1-4.jpg +0 -0
- images1/figure-1-5.jpg +0 -0
- images1/figure-1-6.jpg +0 -0
.DS_Store
ADDED
Binary file (6.15 kB).
.cache/huggingface/.gitignore
ADDED
@@ -0,0 +1 @@
*
.cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.lock
ADDED
File without changes
.cache/huggingface/download/openbiollm-llama3-8b.Q5_K_M.gguf.metadata
ADDED
@@ -0,0 +1,3 @@
d1248c48f0ade670847d05fb2cb356a75df4db3a
1753c629bf99c261e8b92498d813f382f811e903cdc0e685a11d1689612b34ce
1723860909.403446
.env
ADDED
@@ -0,0 +1,4 @@
QDRANT_URL=https://f1e9a70a-afb9-498d-b66d-cb248e0d5557.us-east4-0.gcp.cloud.qdrant.io:6333
QDRANT_API_KEY=REXlX_PeDvCoXeS9uKCzC--e3-LQV0lw3_jBTdcLZ7P5_F6EOdwklA
NVIDIA_API_KEY=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
GEMINI_API_KEY=AIzaSyCXGnm-n6aF962jeorkjo2IsMCwxDwj4bo
.gitattributes
CHANGED
@@ -33,3 +33,158 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+images/multimodal.png filter=lfs diff=lfs merge=lfs -text
+multimodal.png filter=lfs diff=lfs merge=lfs -text
+myenv/bin/python filter=lfs diff=lfs merge=lfs -text
+myenv/bin/python3 filter=lfs diff=lfs merge=lfs -text
+myenv/bin/python3.10 filter=lfs diff=lfs merge=lfs -text
+myenv/bin/ruff filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/Cython/Compiler/Code.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/PIL/.dylibs/libfreetype.6.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/PIL/.dylibs/libharfbuzz.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/_soundfile_data/libsndfile_x86_64.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/altair/vegalite/v5/schema/__pycache__/channels.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/altair/vegalite/v5/schema/__pycache__/core.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libaom.3.2.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libavcodec.60.31.102.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libavfilter.9.12.100.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libavformat.60.16.100.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libdav1d.7.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libfreetype.6.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libharfbuzz.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libswscale.7.5.100.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libvpx.9.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libx264.164.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libx265.199.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/av/.dylibs/libxml2.2.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cmake/data/bin/ccmake filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cmake/data/bin/cmake filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cmake/data/bin/cpack filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cmake/data/bin/ctest filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cmake/data/doc/cmake/CMake.qch filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cryptography/hazmat/bindings/_rust.abi3.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/avx/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/avx/libctransformers.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/avx/libctransformers.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/avx2/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/avx2/libctransformers.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/avx2/libctransformers.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/basic/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/basic/libctransformers.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/basic/libctransformers.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/cuda/ctransformers.dll filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/ctransformers/lib/cuda/libctransformers.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libSvtAv1Enc.1.8.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libX11.6.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libaom.3.8.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libavcodec.60.31.102.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libavformat.60.16.100.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libcrypto.3.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libdav1d.7.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libgnutls.30.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libjxl.0.9.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libp11-kit.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/librav1e.0.6.6.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libunistring.5.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libvpx.8.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libx264.164.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/.dylibs/libx265.199.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/cv2/cv2.abi3.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/decord/.dylibs/libavcodec.58.35.100.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/decord/.dylibs/libavfilter.7.40.101.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/decord/.dylibs/libavformat.58.20.100.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/decord/.dylibs/libvpx.8.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/decord/.dylibs/libx264.164.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/decord/libdecord.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/emoji/unicode_codes/__pycache__/data_dict.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/gradio/frpc_darwin_amd64_v0.2 filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/grpc/_cython/cygrpc.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/grpc_tools/_protoc_compiler.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/layoutparser/misc/NotoSerifCJKjp-Regular.otf filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/lib/libllama.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/llama_cpp/libllama.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/lxml/etree.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/lxml/objectify.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/magic/libmagic/magic.mgc filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/minijinja/_lowlevel.abi3.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/numpy/.dylibs/libgfortran.5.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/numpy/.dylibs/libopenblas64_.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/numpy/core/_multiarray_umath.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/numpy/core/_simd.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/onnx/onnx_cpp2py_export.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/onnxruntime/capi/onnxruntime_pybind11_state.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pandas/_libs/algos.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pandas/_libs/groupby.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pandas/_libs/hashtable.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pandas/_libs/interval.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pandas/_libs/join.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pandas/_libs/tslibs/offsets.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libgnutls.30.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libp11-kit.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libqpdf.29.8.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pikepdf/.dylibs/libunistring.5.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pikepdf/_core.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pillow_heif/.dylibs/libaom.3.8.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pillow_heif/.dylibs/libjxl.0.8.2.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pillow_heif/.dylibs/libx265.199.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/_compute.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/_dataset.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/_flight.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/lib.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/libarrow.1601.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/libarrow_acero.1601.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/libarrow_dataset.1601.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/libarrow_flight.1601.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/libarrow_python.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/libarrow_substrait.1601.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pyarrow/libparquet.1601.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pydantic_core/_pydantic_core.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pydeck/nbextension/static/index.js.map filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pypdf/_codecs/__pycache__/adobe_glyphs.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/pypdfium2_raw/libpdfium.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/rapidfuzz/distance/metrics_cpp.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/rapidfuzz/distance/metrics_cpp_avx2.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/rapidfuzz/fuzz_cpp.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/rapidfuzz/fuzz_cpp_avx2.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/safetensors/_safetensors_rust.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/.dylibs/libgfortran.5.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/.dylibs/libopenblas.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/fft/_pocketfft/pypocketfft.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/io/_fast_matrix_market/_fmm_core.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/linalg/_flapack.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/misc/face.dat filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/optimize/_highs/_highs_wrapper.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/sparse/_sparsetools.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/spatial/_qhull.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/special/_ufuncs.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/special/cython_special.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/scipy/stats/_unuran/unuran_wrapper.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/sentencepiece/_sentencepiece.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/skimage/filters/rank/generic_cy.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/sklearn/_loss/_loss.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/tiktoken/_tiktoken.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/tokenizers/tokenizers.cpython-310-darwin.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torch/.dylibs/libiomp5.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torch/bin/protoc filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torch/bin/protoc-3.13.0.0 filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torch/lib/libiomp5.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torch/lib/libtorch_cpu.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torch/lib/libtorch_python.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torchaudio/_torchaudio.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torchaudio/lib/libflashlight-text.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torchaudio/lib/libtorchaudio.so filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/torchvision/.dylibs/libc++.1.0.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/unicorn/lib/libunicorn.2.dylib filter=lfs diff=lfs merge=lfs -text
+myenv/lib/python3.10/site-packages/unicorn/lib/libunicorn.a filter=lfs diff=lfs merge=lfs -text
+myenv/share/jupyter/nbextensions/pydeck/index.js.map filter=lfs diff=lfs merge=lfs -text
+openbiollm-llama3-8b.Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+path/to/data/collections/image_data/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
+path/to/data/collections/image_data/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
+path/to/data/collections/medical_img/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
+path/to/data/collections/medical_img/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
+qdrant_data/collections/vector_db/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
+qdrant_data/collections/vector_db/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
+qdrant_storage/collections/medical_img/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
+qdrant_storage/collections/medical_img/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
+qdrant_storage/collections/vector_db/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
+qdrant_storage/collections/vector_db/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
.github/workflows/update_space.yml
ADDED
@@ -0,0 +1,28 @@
name: Run Python script

on:
  push:
    branches:
      - surbhi

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
    - name: Checkout
      uses: actions/checkout@v2

    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: '3.9'

    - name: Install Gradio
      run: python -m pip install gradio

    - name: Log in to Hugging Face
      run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'

    - name: Deploy to Spaces
      run: gradio deploy
.gitignore
ADDED
@@ -0,0 +1,7 @@
qdrant_data
myenv
openbiollm-llama3-8b.Q5_K_M.gguf
__pycache__
secrets.toml
.streamlit/
.env
.streamlit/secrets.toml
ADDED
@@ -0,0 +1,3 @@
# .streamlit/secrets.toml
QDRANT_URL = "https://f1e9a70a-afb9-498d-b66d-cb248e0d5557.us-east4-0.gcp.cloud.qdrant.io:6333"
QDRANT_API_KEY = "REXlX_PeDvCoXeS9uKCzC--e3-LQV0lw3_jBTdcLZ7P5_F6EOdwklA"
Data/AC-Aids-for-Dogs_Canine-Periodontal-Disease.pdf
ADDED
Binary file (485 kB).

Data/cancer_and_cure__a_critical_analysis.27.pdf
ADDED
Binary file (226 kB).

Data/medical_oncology_handbook_june_2020_edition.pdf
ADDED
Binary file (818 kB).
DockerFile
ADDED
@@ -0,0 +1,20 @@
# Use the official Python image from the Docker Hub
FROM python:3.10

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file into the container at /app
COPY requirements.txt .

# Install the required libraries
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code into the container
COPY . .

# Expose the port the app runs on
EXPOSE 8501

# Command to run the application
CMD ["streamlit", "run", "stream.py", "--server.port=8501", "--server.address=0.0.0.0"]
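Because the file is named `DockerFile` rather than the default `Dockerfile`, the file path has to be passed explicitly when building. A minimal build-and-run sketch (the `medical-rag` image tag is illustrative, not from this commit):

```bash
# Build the image from the repository root; -f points at the non-default file name
docker build -f DockerFile -t medical-rag .

# Run it, publishing the Streamlit port declared by EXPOSE 8501
docker run -p 8501:8501 medical-rag
```

Note that the CMD above launches `stream.py`, while the Streamlit entry points committed here are `Streaming.py` and `Streamingnewversion.py`, so the script name may need to be aligned at run time.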
MultimodalRAG.ipynb
ADDED
The diff for this file is too large to render; see the raw diff.

MultimodalRAGUpdatedVersion.ipynb
ADDED
The diff for this file is too large to render; see the raw diff.
README.md
CHANGED
@@ -1,12 +1,129 @@
 ---
-title:
-
-colorFrom: gray
-colorTo: indigo
+title: Medical_RAG
+app_file: combinedmultimodal.py
 sdk: gradio
-sdk_version: 4.
-app_file: app.py
-pinned: false
+sdk_version: 4.41.0
 ---
+# Advancing Text Searching with Advanced Indexing Techniques in Healthcare Applications (In Progress)
 
-
+Welcome to the project repository for advancing text searching with advanced indexing techniques in healthcare applications. This project implements a powerful Retrieval-Augmented Generation (RAG) system using cutting-edge AI technologies, specifically designed to enhance text searching capabilities within the healthcare domain. I have also implemented multimodal text searching for medical documents.
+
+## 🚀 Features of the Text-Based Medical Query System
+
+- **BioLLM 8B**: Advanced language model for generating and processing medical text.
+- **ClinicalBert**: State-of-the-art embedding model for accurate representation of medical texts.
+- **Qdrant**: Self-hosted vector database (Vector DB) for efficient storage and retrieval of embeddings.
+- **Langchain & Llama CPP**: Orchestration frameworks for seamless integration and workflow management.
+
+# Medical Knowledge Base Query System
+
+A multimodal medical information retrieval system combining text and image-based querying for comprehensive medical knowledge access.
+
+## Features of the Multimodal Medical Query System:
+[Watch the video on YouTube](https://youtu.be/pNy7RqfRUrc?si=1HQgq54oHT6YoR0B)
+
+### 🧠 Multimodal Medical Information Retrieval
+- Combines text and image-based querying for comprehensive medical knowledge access
+- Uses the Qdrant vector database to store and retrieve both text and image embeddings
+
+### 🔤 Advanced Natural Language Processing
+- Utilizes ClinicalBERT for domain-specific text embeddings
+- Implements NVIDIA's Palmyra-med-70b model for medical language understanding with fast inference time
+
+### 🖼️ Image Analysis Capabilities
+- Incorporates CLIP (Contrastive Language-Image Pre-training) for image feature extraction
+- Generates image summaries using Google's Gemini 1.5 Flash model
+
+### 📄 PDF Processing
+- Extracts text and images from medical PDF documents
+- Implements intelligent chunking strategies for text processing
+
+### 🔍 Vector Search
+- Uses Qdrant for efficient similarity search on both text and image vectors
+- Implements hybrid search combining CLIP-based image similarity and text-based summary similarity
+
+### 🖥️ Interactive User Interface
+- Gradio-based web interface for easy querying and result visualization
+- Displays relevant text responses alongside related medical images
+
+### 🧩 Extensible Architecture
+- Modular design allowing for easy integration of new models or data sources
+- Supports both local and cloud-based model deployment
+
+The high-level architectural framework for this application is given as follows:
+
+
+### ⚡ Performance Optimization
+- Implements batching and multi-threading for efficient processing of large document sets
+- Utilizes GPU acceleration where available
+
+### 🎛️ Customizable Retrieval
+- Adjustable similarity thresholds for image retrieval
+- Configurable number of top-k results for both text and image queries
+
+### 📊 Comprehensive Visualization
+- Displays query results with both textual information and related images
+- Provides a gallery view of all extracted images from the knowledge base
+
+### 🔐 Environment Management
+- Uses a .env file for API key management
+- Supports both CPU and GPU environments
+
+### DEMO SCREENSHOT
+
+
+## 🎥 Video Demonstration
+
+Explore the capabilities of our project with our detailed [YouTube video](https://youtu.be/nKCKUcnQ390).
+
+## Installation
+
+To get started with this project, follow these steps:
+
+1. **Install Dependencies**:
+```bash
+pip install -r requirements.txt
+```
+
+2. **Set up Qdrant**:
+- Follow the [Qdrant Installation Guide](https://qdrant.tech/documentation/quick_start/) to install and configure Qdrant.
+
+3. **Configure the Application**:
+- Ensure configuration files for BioLLM, ClinicalBert, Langchain, and Llama CPP are correctly set up.
+
+4. **Run the Application**:
+If you want to run the text retrieval application as a FastAPI service:
+```bash
+uvicorn app:app
+```
+If you want to run the text retrieval application through Streamlit:
+```bash
+streamlit run Streaming.py
+```
+If you want to run the multimodal application, run it through the Gradio interface:
+```bash
+python combinedmultimodal.py
+```
+
+## 💡 Usage
+
+- **Querying the System**: Input medical queries via the application's interface for detailed information retrieval.
+- **Text Generation**: Utilize BioLLM 8B to generate comprehensive medical responses.
+
+## 👥 Contributing
+
+We welcome contributions to enhance this project! Here's how you can contribute:
+
+1. Fork the repository.
+2. Create a new branch (`git checkout -b feature-name`).
+3. Commit your changes (`git commit -am 'Add feature'`).
+4. Push to the branch (`git push origin feature-name`).
+5. Open a Pull Request with detailed information about your changes.
+
+## 📜 License
+
+This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
+
+## 📞 Contact
+
+For questions or suggestions, please open an issue or contact the repository owner at [[email protected]](mailto:[email protected]).
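For reference, a minimal client-side sketch of querying the text retrieval service described in the run instructions above. Assumptions: the FastAPI app from `app.py` in this commit is running via `uvicorn app:app` on uvicorn's default port 8000, and the `requests` package is installed; the `/get_response` route and the `query` form field come from `app.py`.

```python
# Sketch: query the text-retrieval API from another process.
import requests

resp = requests.post(
    "http://localhost:8000/get_response",
    data={"query": "What are the common signs of canine periodontal disease?"},  # sent as form data, matching Form(...)
)
print(resp.status_code)
print(resp.text)  # answer plus source document, as returned by the endpoint
```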
Streaming.py
ADDED
@@ -0,0 +1,223 @@
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader, PDFMinerLoader
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.retrievers import BM25Retriever
from qdrant_client import QdrantClient
from qdrant_client.http.exceptions import ResponseHandlingException
from glob import glob
from llama_index.vector_stores.qdrant import QdrantVectorStore
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import models, SentenceTransformer
from langchain.embeddings.base import Embeddings
from qdrant_client.models import VectorParams
import torch
import base64
from langchain_community.llms import LlamaCpp
from langchain_core.prompts import PromptTemplate
from huggingface_hub import hf_hub_download
from tempfile import NamedTemporaryFile
from langchain.retrievers import EnsembleRetriever

# Set page configuration
st.set_page_config(layout="wide")
st.markdown("""
<meta http-equiv="Content-Security-Policy"
content="default-src 'self'; object-src 'self'; frame-src 'self' data:;
script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';">
""", unsafe_allow_html=True)
# Streamlit secrets
qdrant_url = st.secrets["QDRANT_URL"]
qdrant_api_key = st.secrets["QDRANT_API_KEY"]

# For debugging only - remove or comment out these lines after verification
#st.write(f"QDRANT_URL: {qdrant_url}")
#st.write(f"QDRANT_API_KEY: {qdrant_api_key}")

class ClinicalBertEmbeddings(Embeddings):
    def __init__(self, model_name: str = "medicalai/ClinicalBERT"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()

    def embed(self, text: str):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = self.mean_pooling(outputs, inputs['attention_mask'])
        return embeddings.squeeze().numpy()

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def embed_documents(self, texts):
        return [self.embed(text) for text in texts]

    def embed_query(self, text):
        return self.embed(text)

@st.cache_resource
def load_model():
    model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
    model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
    model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')
    return LlamaCpp(
        model_path=model_path,
        temperature=0.3,
        n_ctx=2048,
        top_p=1
    )

# Initialize embeddings
@st.cache_resource
def load_embeddings():
    return ClinicalBertEmbeddings(model_name="medicalai/ClinicalBERT")

# Initialize database
@st.cache_resource
def setup_qdrant():
    try:
        if not qdrant_url or not qdrant_api_key:
            raise ValueError("QDRANT_URL or QDRANT_API_KEY not set in environment variables.")

        # Initialize Qdrant client
        client = QdrantClient(
            url=qdrant_url,
            api_key=qdrant_api_key,
            port=443,  # Assuming HTTPS should use port 443
        )
        st.write("Qdrant client initialized successfully.")

        # Create or recreate collection
        collection_name = "vector_db"
        try:
            collection_info = client.get_collection(collection_name=collection_name)
            st.write(f"Collection '{collection_name}' already exists.")
        except ResponseHandlingException:
            st.write(f"Collection '{collection_name}' does not exist. Creating a new one.")
            client.recreate_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(size=768, distance="Cosine")
            )
            st.write(f"Collection '{collection_name}' created successfully.")

        embeddings = load_embeddings()
        st.write("Embeddings model loaded successfully.")

        return Qdrant(client=client, embeddings=embeddings, collection_name=collection_name)

    except Exception as e:
        st.error(f"Failed to initialize Qdrant: {e}")
        return None

# Initialize database
db = setup_qdrant()

if db is None:
    st.error("Qdrant setup failed, exiting.")
else:
    st.success("Qdrant setup successful.")

# Load models
llm = load_model()
embeddings = load_embeddings()

# Define prompt template
prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer. Answer must be detailed and well explained.
Helpful answer:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
# Define retriever

# Define Streamlit app

def process_answer(query):
    chain_type_kwargs = {"prompt": prompt}
    global ensemble_retriever
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=ensemble_retriever,
        return_source_documents=True,
        chain_type_kwargs=chain_type_kwargs,
        verbose=True
    )
    response = qa(query)
    answer = response['result']
    source_document = response['source_documents'][0].page_content
    doc = response['source_documents'][0].metadata['source']
    return answer, source_document, doc

def display_pdf(file):
    with open(file, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
    st.markdown(pdf_display, unsafe_allow_html=True)

def main():
    st.title("PDF Question Answering System")

    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

    if uploaded_file is not None:
        # Save uploaded PDF
        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(uploaded_file.read())
            temp_file_path = temp_file.name

        # Display PDF
        st.subheader("PDF Preview")
        display_pdf(temp_file_path)

        # Load and process PDF
        loader = PDFMinerLoader(temp_file_path)
        documents = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        texts = text_splitter.split_documents(documents)

        # Update the Qdrant database with the new PDF content

        try:
            db.add_documents(texts)
            st.success("PDF processed and vector database updated!")
            global ensemble_retriever
            # Initialize retriever after documents are added
            bm25_retriever = BM25Retriever.from_documents(documents=texts)
            bm25_retriever.k = 3
            qdrant_retriever = db.as_retriever(search_kwargs={"k":1})
            # Combine both retrievers using EnsembleRetriever
            ensemble_retriever = EnsembleRetriever(
                retrievers=[qdrant_retriever, bm25_retriever],
                weights=[0.5, 0.5]  # Adjust weights based on desired contribution
            )

        except Exception as e:
            st.error(f"Error updating database: {e}")

        st.subheader("Ask a question about the PDF")
        user_input = st.text_input("Your question:")

        if st.button('Get Response'):
            if user_input:
                try:
                    answer, source_document, doc = process_answer(user_input)
                    st.write("*Answer:*", answer)
                    st.write("*Source Document:*", source_document)
                    st.write("*Document Source:*", doc)
                except Exception as e:
                    st.error(f"Error processing query: {e}")
            else:
                st.warning("Please enter a query.")

if __name__ == "__main__":
    main()
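The `VectorParams(size=768, distance="Cosine")` collection above matches ClinicalBERT's hidden size. A small standalone check of that assumption (a sketch; requires only `transformers` and `torch`, and reuses the same attention-mask-weighted mean pooling as the `ClinicalBertEmbeddings` class):

```python
# Verify that mean-pooled ClinicalBERT embeddings are 768-dimensional,
# matching the Qdrant collection created with VectorParams(size=768).
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
model = AutoModel.from_pretrained("medicalai/ClinicalBERT")
model.eval()

inputs = tokenizer("canine periodontal disease staging", return_tensors="pt",
                   padding=True, truncation=True, max_length=512)
with torch.no_grad():
    outputs = model(**inputs)

# Attention-mask-weighted mean pooling over the token embeddings.
mask = inputs["attention_mask"].unsqueeze(-1).float()
embedding = (outputs[0] * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
print(embedding.shape)  # expected: torch.Size([1, 768])
```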
Streamingnewversion.py
ADDED
@@ -0,0 +1,244 @@
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader, PDFMinerLoader
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.retrievers import BM25Retriever
from qdrant_client import QdrantClient
from qdrant_client.http.exceptions import ResponseHandlingException
from glob import glob
from llama_index.vector_stores.qdrant import QdrantVectorStore
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import models, SentenceTransformer
from langchain.embeddings.base import Embeddings
from qdrant_client.models import VectorParams
import torch
import base64
from langchain_community.llms import LlamaCpp
from langchain_core.prompts import PromptTemplate
from huggingface_hub import hf_hub_download
from tempfile import NamedTemporaryFile
from langchain.retrievers import EnsembleRetriever
import urllib
import nltk
import os
# Add this at the beginning of your script
import logging
logging.basicConfig(level=logging.DEBUG)


# Define the path for NLTK data
nltk_data_path = '/tmp/nltk_data'
os.makedirs(nltk_data_path, exist_ok=True)

# Set NLTK data path environment variable
nltk.data.path.append(nltk_data_path)

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', download_dir=nltk_data_path)

# Set page configuration
st.set_page_config(layout="wide")
st.markdown("""
<meta http-equiv="Content-Security-Policy"
content="default-src 'self'; object-src 'self'; frame-src 'self' data:;
script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';">
""", unsafe_allow_html=True)
# Streamlit secrets
qdrant_url = st.secrets["QDRANT_URL"]
qdrant_api_key = st.secrets["QDRANT_API_KEY"]

# For debugging only - remove or comment out these lines after verification
#st.write(f"QDRANT_URL: {qdrant_url}")
#st.write(f"QDRANT_API_KEY: {qdrant_api_key}")

class ClinicalBertEmbeddings(Embeddings):
    def __init__(self, model_name: str = "medicalai/ClinicalBERT"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()

    def embed(self, text: str):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = self.mean_pooling(outputs, inputs['attention_mask'])
        return embeddings.squeeze().numpy()

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def embed_documents(self, texts):
        return [self.embed(text) for text in texts]

    def embed_query(self, text):
        return self.embed(text)

@st.cache_resource
def load_model():
    model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
    model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
    model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')
    return LlamaCpp(
        model_path=model_path,
        temperature=0.3,
        n_ctx=2048,
        top_p=1
    )

# Initialize embeddings
@st.cache_resource
def load_embeddings():
    return ClinicalBertEmbeddings(model_name="medicalai/ClinicalBERT")

# Initialize database
@st.cache_resource
def setup_qdrant():
    try:
        if not qdrant_url or not qdrant_api_key:
            raise ValueError("QDRANT_URL or QDRANT_API_KEY not set in environment variables.")

        # Initialize Qdrant client
        client = QdrantClient(
            url=qdrant_url,
            api_key=qdrant_api_key,
            port=443,  # Assuming HTTPS should use port 443
        )
        st.write("Qdrant client initialized successfully.")

        # Create or recreate collection
        collection_name = "vector_db"
        try:
            collection_info = client.get_collection(collection_name=collection_name)
            st.write(f"Collection '{collection_name}' already exists.")
        except ResponseHandlingException:
            st.write(f"Collection '{collection_name}' does not exist. Creating a new one.")
            client.recreate_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(size=768, distance="Cosine")
            )
            st.write(f"Collection '{collection_name}' created successfully.")

        embeddings = load_embeddings()
        st.write("Embeddings model loaded successfully.")

        return Qdrant(client=client, embeddings=embeddings, collection_name=collection_name)

    except Exception as e:
        st.error(f"Failed to initialize Qdrant: {e}")
        return None

# Initialize database
db = setup_qdrant()

if db is None:
    st.error("Qdrant setup failed, exiting.")
else:
    st.success("Qdrant setup successful.")

# Load models
llm = load_model()
embeddings = load_embeddings()

# Define prompt template
prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer. Answer must be detailed and well explained.
Helpful answer:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
# Define retriever

# Define Streamlit app

def process_answer(query):
    chain_type_kwargs = {"prompt": prompt}
    global ensemble_retriever
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=ensemble_retriever,
        return_source_documents=True,
        chain_type_kwargs=chain_type_kwargs,
        verbose=True
    )
    response = qa(query)
    answer = response['result']
    source_document = response['source_documents'][0].page_content
    doc = response['source_documents'][0].metadata['source']
    return answer, source_document, doc

def display_pdf(file):
    with open(file, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
    st.markdown(pdf_display, unsafe_allow_html=True)

def main():
    st.title("PDF Question Answering System")

    # Displaying File
    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

    if uploaded_file is not None:
        # Save uploaded PDF
        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(uploaded_file.read())
            temp_file_path = temp_file.name

        # Display PDF
        st.subheader("PDF Preview")
        display_pdf(temp_file_path)

        # Load and process PDF
        loader = PDFMinerLoader(temp_file_path)
        documents = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        texts = text_splitter.split_documents(documents)

        # Update the Qdrant database with the new PDF content

        try:
            db.add_documents(texts)
            st.success("PDF processed and vector database updated!")
            global ensemble_retriever
            # Initialize retriever after documents are added
            bm25_retriever = BM25Retriever.from_documents(documents=texts)
            bm25_retriever.k = 3
            qdrant_retriever = db.as_retriever(search_kwargs={"k":1})
            # Combine both retrievers using EnsembleRetriever
            ensemble_retriever = EnsembleRetriever(
                retrievers=[qdrant_retriever, bm25_retriever],
                weights=[0.5, 0.5]  # Adjust weights based on desired contribution
            )

        except Exception as e:
            st.error(f"Error updating database: {e}")

        st.subheader("Ask a question about the PDF")
        user_input = st.text_input("Your question:")

        if st.button('Get Response'):
            if user_input:
                try:
                    answer, source_document, doc = process_answer(user_input)
                    st.write("*Answer:*", answer)
                    st.write("*Source Document:*", source_document)
                    st.write("*Document Source:*", doc)
                except Exception as e:
                    st.error(f"Error processing query: {e}")
            else:
                st.warning("Please enter a query.")

if __name__ == "__main__":
    main()
__pycache__/app.cpython-310.pyc
ADDED
Binary file (2.97 kB).

__pycache__/clip_helpers.cpython-310.pyc
ADDED
Binary file (644 Bytes).

__pycache__/combinedmultimodal.cpython-310.pyc
ADDED
Binary file (15.3 kB).

__pycache__/imagebind.cpython-310.pyc
ADDED
Binary file (2.9 kB).

__pycache__/images.cpython-310.pyc
ADDED
Binary file (543 Bytes).

__pycache__/ingest.cpython-310.pyc
ADDED
Binary file (3.68 kB).
app.py
ADDED
@@ -0,0 +1,83 @@
from langchain import PromptTemplate
from langchain_community.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain_community.embeddings import SentenceTransformerEmbeddings
from fastapi import FastAPI, Request, Form, Response
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
from fastapi.encoders import jsonable_encoder
from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant
import os
import json
from huggingface_hub import hf_hub_download
from langchain.retrievers import EnsembleRetriever
from ingest import keyword_retriever

app = FastAPI()

templates = Jinja2Templates(directory="templates")
app.mount("/static", StaticFiles(directory="static"), name="static")
model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"

model_path = hf_hub_download(model_name,
                             filename=model_file, local_dir='./')

local_llm = "openbiollm-llama3-8b.Q5_K_M.gguf"

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path=local_llm,
    temperature=0.3,
    # max_tokens=2048,
    n_ctx=2048,
    top_p=1
)

print("LLM Initialized....")

prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer. Answer must be detailed and well explained.
Helpful answer:
"""

embeddings = SentenceTransformerEmbeddings(model_name="medicalai/ClinicalBERT")

url = "http://localhost:6333"

client = QdrantClient(
    url=url, prefer_grpc=False
)

db = Qdrant(client=client, embeddings=embeddings, collection_name="vector_db")

prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])

retriever = db.as_retriever(search_kwargs={"k": 1})
ensemble_retriever = EnsembleRetriever(retrievers=[retriever,
                                                   keyword_retriever],
                                       weights=[0.5, 0.5])

@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
    return templates.TemplateResponse("index.html", {"request": request})

@app.post("/get_response")
async def get_response(query: str = Form(...)):
    chain_type_kwargs = {"prompt": prompt}
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=ensemble_retriever, return_source_documents=True, chain_type_kwargs=chain_type_kwargs, verbose=True)
    response = qa(query)
    print(response)
    answer = response['result']
    source_document = response['source_documents'][0].page_content
    doc = response['source_documents'][0].metadata['source']
    response_data = jsonable_encoder(json.dumps({"answer": answer, "source_document": source_document, "doc": doc}))

    res = Response(response_data)
    return res
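A minimal client-side sketch for exercising the /get_response endpoint above (the host and port are assumptions for a local uvicorn run; they are not configured in this file):

import requests

resp = requests.post(
    "http://localhost:8000/get_response",
    data={"query": "What are the stages of canine periodontal disease?"},
)
# The endpoint writes a JSON-encoded string into the response body,
# so parsing the body yields the answer/source dictionary.
payload = resp.json()
print(payload["answer"])
print(payload["doc"])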
app1.py
ADDED
@@ -0,0 +1,119 @@
import streamlit as st
from langchain import PromptTemplate
from langchain_community.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain_community.embeddings import SentenceTransformerEmbeddings
from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant
import os
import json
from huggingface_hub import hf_hub_download
from langchain.retrievers import EnsembleRetriever
# from ingest import ClinicalBertEmbeddings, keyword_retriever
from langchain_community.llms import CTransformers
from transformers import AutoTokenizer, AutoModel
# # Initialize Streamlit app
# st.set_page_config(page_title="Document Retrieval App", layout='wide')

# # Download and initialize LLM model
# MODEL_PATH = './'

# # Some basic configurations for the model
# config = {
#     "max_new_tokens": 2048,
#     "context_length": 4096,
#     "repetition_penalty": 1.1,
#     "temperature": 0.5,
#     "top_k": 50,
#     "top_p": 0.9,
#     "stream": True,
#     "threads": int(os.cpu_count() / 2)
# }

# # We use Langchain's CTransformers llm class to load our quantized model
# llm = CTransformers(model=MODEL_PATH,
#                     config=config)

# # Tokenizer for Mistral-7B-Instruct from HuggingFace
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
# model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
# model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
# model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')

# local_llm = "openbiollm-llama3-8b.Q5_K_M.gguf"
# llm = LlamaCpp(
#     model_path=local_llm,
#     temperature=0.3,
#     n_ctx=2048,
#     top_p=1
# )

# st.sidebar.title("Document Retrieval App")

# # Initialize embeddings
# embeddings = ClinicalBertEmbeddings()

# # Qdrant setup for medical_image collection
# url = "http://localhost:6333"
# client_medical = QdrantClient(url=url, prefer_grpc=False)
# db_medical = Qdrant(client=client_medical, embeddings=embeddings, collection_name="medical_image")

# # Qdrant setup for pdf collection
# client_pdf = QdrantClient(url=url, prefer_grpc=False)
# db_pdf = Qdrant(client=client_pdf, embeddings=embeddings, collection_name="pdf")

# # Define retrievers for both collections
# retriever_medical = db_medical.as_retriever(search_kwargs={"k": 1})
# retriever_pdf = db_pdf.as_retriever(search_kwargs={"k": 1})

# # Ensemble retriever combining both retrievers
# ensemble_retriever = EnsembleRetriever(retrievers=[retriever_medical, retriever_pdf], weights=[0.5, 0.5])

# # Prompt template for querying
# prompt_template = """Use the following pieces of information to answer the user's question.
# If you don't know the answer, just say that you don't know, don't try to make up an answer.

# Context: {context}
# Question: {question}

# Only return the helpful answer. Answer must be detailed and well explained.
# Helpful answer:
# """
# prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])

# # Streamlit app layout
# with st.sidebar:
#     query = st.text_area("Enter your query here:")
#     if st.button("Get Response"):
#         st.write("Processing query...")
#         chain_type_kwargs = {"prompt": prompt}
#         qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=ensemble_retriever, return_source_documents=True, chain_type_kwargs=chain_type_kwargs, verbose=True)
#         response = qa(query)

#         # Process response to extract answer, source document, and metadata
#         answer = response['result']
#         source_document = response['source_documents'][0].page_content
#         doc = response['source_documents'][0].metadata['source']

#         # Display response
#         st.subheader("Answer:")
#         st.write(answer)
#         st.subheader("Source Document:")
#         st.write(source_document)
#         st.subheader("Document Metadata:")
#         st.write(doc)

# # Run the app
# if __name__ == '__main__':
#     st.title("Document Retrieval App")
#     st.write("Enter your query in the sidebar and click 'Get Response' to retrieve relevant documents.")
# Define model and prompt template


# Set your Hugging Face API token
os.environ['HUGGINGFACE_HUB_TOKEN'] = ''

model_name = "mistralai/Mistral-7B-Instruct-v0.1"
model_file = "mistral-7b-instruct.q4_0.bin"

# Read the token from the environment (the original passed the literal string
# 'HUGGINGFACE_HUB_TOKEN' instead of its value).
model_path = hf_hub_download(model_name, filename=model_file, local_dir='./', use_auth_token=os.environ.get('HUGGINGFACE_HUB_TOKEN'))
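app1.py downloads the quantized Mistral weights but never loads them; the commented-out block above shows the intended CTransformers route. A hedged sketch of wiring the two together (the model_type value and the reuse of the commented config are assumptions, not part of the committed file):

from langchain_community.llms import CTransformers

config = {
    "max_new_tokens": 2048,
    "context_length": 4096,
    "repetition_penalty": 1.1,
    "temperature": 0.5,
    "top_k": 50,
    "top_p": 0.9,
    "threads": int(os.cpu_count() / 2),
}
# model_path is the local file written by hf_hub_download above.
llm = CTransformers(model=model_path, model_type="mistral", config=config)
print(llm("List the common signs of gingivitis in dogs."))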
combinedmultimodal.py
ADDED
@@ -0,0 +1,621 @@
import os
import uuid
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, StorageContext
import qdrant_client
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
import clip
from llama_index.core import Document
from langchain_community.llms import LlamaCpp
import numpy as np
from huggingface_hub import hf_hub_download
from langchain_community.llms import LlamaCpp
from llama_index.core import (
    ServiceContext,
    SimpleDirectoryReader,
)
import threading
from dotenv import load_dotenv
from llama_index.llms.nvidia import NVIDIA
from open_clip import create_model_from_pretrained, get_tokenizer
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.core.vector_stores import VectorStoreQuery
from llama_index.core.query_engine import RetrieverQueryEngine
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from langchain.embeddings.base import Embeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import Settings
from transformers import AutoProcessor, AutoModel
import hashlib
import uuid
import os
import gradio as gr
import torch
import clip
import open_clip
import numpy as np
from llama_index.core.schema import ImageDocument
import cv2
import matplotlib.pyplot as plt
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from unstructured.partition.pdf import partition_pdf
from pathlib import Path
from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
from PIL import Image
import logging
import concurrent.futures
import logging
from llama_index.core import set_global_service_context
from llama_index.core import Document as LlamaIndexDocument
import getpass
import os
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from sentence_transformers import util
from transformers import AutoTokenizer, AutoModelForCausalLM
import base64
from google.generativeai import GenerativeModel, configure
import google.generativeai as genai

# Configure logging
# logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


class MetadataMode:
    EMBED = "embed"
    INLINE = "inline"
    NONE = "none"

# Define the vectors configuration
vectors_config = {
    "vector_size": 768,  # or whatever the dimensionality of your vectors is
    "distance": "Cosine"  # can be "Cosine", "Euclidean", etc.
}

class ClinicalBertEmbeddingWrapper:
    def __init__(self, model_name: str = "medicalai/ClinicalBERT"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()

    def embed(self, text: str):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = self.mean_pooling(outputs, inputs['attention_mask'])
        return embeddings.squeeze().tolist()

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def embed_documents(self, texts):
        return [self.embed(text) for text in texts]

    def embed_query(self, text):
        return self.embed(text)

    # Implement this method if needed
    def get_text_embedding_batch(self, text_batch, show_progress=False):
        embeddings = []
        num_batches = len(text_batch)

        # Process in batches of size 8
        batch_size = 8
        for i in tqdm(range(0, num_batches, batch_size), desc="Processing Batches", disable=not show_progress):
            batch_texts = text_batch[i:i + batch_size]
            batch_embeddings = self.embed_documents(batch_texts)
            embeddings.extend(batch_embeddings)

        return embeddings

    def get_agg_embedding_from_queries(self, queries):
        # Get embeddings for each query using the embed method
        embeddings = [torch.tensor(self.embed(query)) for query in queries]

        # Convert list of tensors to a single tensor for aggregation
        embeddings_tensor = torch.stack(embeddings)

        # Example: averaging embeddings
        agg_embedding = embeddings_tensor.mean(dim=0)

        return agg_embedding.tolist()


# Load environment variables
load_dotenv()
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
nvidia_api_key = os.getenv("NVIDIA_API_KEY")
if not nvidia_api_key:
    raise ValueError("NVIDIA_API_KEY not found in .env file")

os.environ["NVIDIA_API_KEY"] = nvidia_api_key

model_name = "aaditya/OpenBioLLM-Llama3-8B-GGUF"
model_file = "openbiollm-llama3-8b.Q5_K_M.gguf"
QDRANT_URL = "https://f1e9a70a-afb9-498d-b66d-cb248e0d5557.us-east4-0.gcp.cloud.qdrant.io:6333"
QDRANT_API_KEY = "REXlX_PeDvCoXeS9uKCzC--e3-LQV0lw3_jBTdcLZ7P5_F6EOdwklA"

# Download model
model_path = hf_hub_download(model_name, filename=model_file, local_dir='./')
llm = NVIDIA(model="writer/palmyra-med-70b")
llm.model
local_llm = "openbiollm-llama3-8b.Q5_K_M.gguf"
# Initialize ClinicalBERT embeddings model
# text_embed_model = ClinicalBertEmbeddings(model_name="medicalai/ClinicalBERT")
text_embed_model = ClinicalBertEmbeddingWrapper(model_name="medicalai/ClinicalBERT")
# Initially I was using this BioLLM, but for faster text responses during inference
# I am going with external models; it also works fine with this one.
llm1 = LlamaCpp(
    model_path=local_llm,
    temperature=0.3,
    n_ctx=2048,
    top_p=1
)
Settings.llm = llm
Settings.embed_model = text_embed_model
# Define ServiceContext with ClinicalBertEmbeddings for text
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=text_embed_model  # Use ClinicalBERT embeddings model
)
set_global_service_context(service_context)
# Just for logging and debugging
# Log ServiceContext details
# logging.debug(f"LLM: {service_context.llm}")
# logging.debug(f"Embed Model: {service_context.embed_model}")
# logging.debug(f"Node Parser: {service_context.node_parser}")
# logging.debug(f"Prompt Helper: {service_context.prompt_helper}")
# Create a QdrantClient pointing at the hosted Qdrant instance configured above
try:
    text_client = qdrant_client.QdrantClient(
        url=QDRANT_URL,
        api_key=QDRANT_API_KEY,
        port=443,
    )
    print("Qdrant client initialized successfully.")
except Exception as e:
    print(f"Error initializing Qdrant client: {e}")
    raise
# Load text documents from the ./Data directory
# text_documents = SimpleDirectoryReader("./Data").load_data()
# Load documents
loader = DirectoryLoader("./Data/", glob="**/*.pdf", show_progress=True, loader_cls=UnstructuredFileLoader)
documents = loader.load()
# Print document names
for doc in documents:
    print(f"Processing document: {doc.metadata.get('source', 'Unknown')}")
# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=70)
texts = text_splitter.split_documents(documents)


print(f"Loaded {len(documents)} documents")
print(f"Split into {len(texts)} chunks")
# Convert langchain documents to llama_index documents
text_documents = [
    LlamaIndexDocument(text=t.page_content, metadata=t.metadata)
    for t in texts
]
# Initialize Qdrant vector store
try:
    text_vector_store = QdrantVectorStore(
        client=text_client, collection_name="pdf_text"
    )
    print("Qdrant vector store initialized successfully.")
except Exception as e:
    print(f"Error initializing Qdrant vector store: {e}")
    raise

try:
    image_vector_store = QdrantVectorStore(
        client=text_client, collection_name="pdf_img"
    )
    print("Qdrant vector store initialized successfully.")
except Exception as e:
    print(f"Error initializing Qdrant vector store: {e}")
    raise

storage_context = StorageContext.from_defaults(vector_store=text_vector_store)

wiki_text_index = VectorStoreIndex.from_documents(text_documents
                                                  # , storage_context=storage_context
                                                  , service_context=service_context
                                                  )
print(f"VectorStoreIndex created with {len(wiki_text_index.docstore.docs)} documents")

# define the streaming query engine
streaming_qe = wiki_text_index.as_query_engine(streaming=True)
print(len(text_documents))

# Function to query the text vector database
# Modify the process_query function

model, preprocess = clip.load("ViT-B/32")
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print(
    "Model parameters:",
    f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}",
)
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

pdf_directory = Path("./data")
image_path = Path("./images1")
image_path.mkdir(exist_ok=True, parents=True)

# Dictionary to store image metadata
image_metadata_dict = {}

# Limit the number of images downloaded per PDF
MAX_IMAGES_PER_PDF = 15

# Generate a UUID for each image
image_uuid = 0

# Iterate over each PDF file in the data folder
for pdf_file in pdf_directory.glob("*.pdf"):
    images_per_pdf = 0
    print(f"Processing: {pdf_file}")

    # Extract images from the PDF
    try:
        raw_pdf_elements = partition_pdf(
            filename=str(pdf_file),
            extract_images_in_pdf=True,
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=4000,
            new_after_n_chars=3800,
            combine_text_under_n_chars=2000,
            extract_image_block_output_dir=image_path,
        )
        # Loop through the elements
    except Exception as e:
        print(f"Error processing {pdf_file}: {e}")
        import traceback
        traceback.print_exc()
        continue

# Function to summarize images
def summarize_image(image_path):
    # Load and encode the image
    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

    # Create a GenerativeModel object
    model = GenerativeModel('gemini-1.5-flash')

    # Prepare the prompt
    prompt = """
    You are an expert in analyzing medical images. Please provide a detailed description of this medical image, including:
    1. You are a bot that is good at analyzing images related to Dog's health
    2. The body part or area being examined
    3. Any visible structures, organs, or tissues
    4. Any abnormalities, lesions, or notable features
    5. Any other relevant medical diagram description.

    Please be as specific and detailed as possible in your analysis.
    """

    # Generate the response
    response = model.generate_content([
        prompt,
        {"mime_type": "image/jpeg", "data": encoded_image}
    ])

    return response.text

# Iterate through each file in the images directory
for image_file in os.listdir(image_path):
    if image_file.endswith(('.jpg', '.jpeg', '.png')):
        # Generate a standard UUID for the image
        image_uuid = str(uuid.uuid4())
        image_file_name = image_file
        image_file_path = image_path / image_file
        # Generate image summary
        # image_summary = generate_image_summary_with(str(image_file_path), model, feature_extractor, tokenizer, device)
        # image_summary = generate_summary_with_lm(str(image_file_path), preprocess, model, device, tokenizer, lm_model)
        image_summary = summarize_image(image_file_path)
        # Construct metadata entry for the image
        image_metadata_dict[image_uuid] = {
            "filename": image_file_name,
            "img_path": str(image_file_path),  # Store the absolute path to the image
            "summary": image_summary  # Add the summary to the metadata
        }

        # Limit the number of images processed per folder
        if len(image_metadata_dict) >= MAX_IMAGES_PER_PDF:
            break

print(f"Number of items in image_dict: {len(image_metadata_dict)}")

# Print the metadata dictionary
for key, value in image_metadata_dict.items():
    print(f"UUID: {key}, Metadata: {value}")


def plot_images_with_opencv(image_metadata_dict):
    original_images_urls = []
    images_shown = 0

    plt.figure(figsize=(16, 16))  # Adjust the figure size as needed

    for image_id in image_metadata_dict:
        img_path = image_metadata_dict[image_id]["img_path"]
        if os.path.isfile(img_path):
            try:
                img = cv2.imread(img_path)
                if img is not None:
                    # Convert BGR (OpenCV) to RGB (matplotlib)
                    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

                    plt.subplot(8, 8, len(original_images_urls) + 1)
                    plt.imshow(img_rgb)
                    plt.xticks([])
                    plt.yticks([])

                    original_images_urls.append(image_metadata_dict[image_id]["filename"])
                    images_shown += 1
                    if images_shown >= 64:
                        break
            except Exception as e:
                print(f"Error processing image {img_path}: {e}")

    plt.tight_layout()
    plt.show()

plot_images_with_opencv(image_metadata_dict)
# set the device to use for the CLIP model, either CUDA (GPU) or CPU, depending on availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
# Function to preprocess image using OpenCV
def preprocess_image(img):
    # Convert BGR to RGB
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # Convert the image to a PIL Image and then preprocess
    img_pil = Image.fromarray(img_rgb)
    return preprocess(img_pil)
    # Use BiomedCLIP processor for preprocessing
    # return preprocess(images=img_pil, return_tensors="pt")
    # return preprocess(img_pil).unsqueeze(0)


img_emb_dict = {}
with torch.no_grad():
    for image_id in image_metadata_dict:
        img_file_path = image_metadata_dict[image_id]["img_path"]
        if os.path.isfile(img_file_path):
            try:
                # Load image using OpenCV
                img = cv2.imread(img_file_path)

                if img is not None:
                    # Preprocess image
                    image = preprocess_image(img).unsqueeze(0).to(device)
                    # image = preprocess_image(img).to(device)

                    # Extract image features
                    image_features = model.encode_image(image)

                    # Store image features
                    img_emb_dict[image_id] = image_features
                else:
                    print(f"Failed to load image {img_file_path}")
            except Exception as e:
                print(f"Error processing image {img_file_path}: {e}")

len(img_emb_dict)  # one embedding per extracted image (e.g. 22 images -> 22 embeddings)




# create a list of ImageDocument objects, one for each image in the dataset
img_documents = []
for image_filename in image_metadata_dict:
    # the img_emb_dict dictionary contains the image embeddings
    if image_filename in img_emb_dict:
        filename = image_metadata_dict[image_filename]["filename"]
        filepath = image_metadata_dict[image_filename]["img_path"]
        summary = image_metadata_dict[image_filename]["summary"]
        # print(filepath)

        # create an ImageDocument for each image
        newImgDoc = ImageDocument(
            text=filename, metadata={"filepath": filepath, "summary": summary}  # Include the summary in the metadata
        )

        # set image embedding on the ImageDocument
        newImgDoc.embedding = img_emb_dict[image_filename].tolist()[0]
        img_documents.append(newImgDoc)

# define storage context
storage_context = StorageContext.from_defaults(vector_store=image_vector_store)

# define image index
image_index = VectorStoreIndex.from_documents(
    img_documents,
    storage_context=storage_context
)
# for doc in img_documents:
#     print(f"ImageDocument: {doc.text}, Embedding: {doc.embedding}, Metadata: {doc.metadata}")

def retrieve_results_from_image_index(query):
    """Take a text query as input and return the most similar image from the vector store."""

    # first tokenize the text query and convert it to a tensor
    text = clip.tokenize(query).to(device)

    # encode the text tensor using the CLIP model to produce a query embedding
    query_embedding = model.encode_text(text).tolist()[0]
    # Encode the query using ClinicalBERT for text similarity
    clinical_query_embedding = text_embed_model.embed_query(query)
    # create a VectorStoreQuery
    image_vector_store_query = VectorStoreQuery(
        query_embedding=query_embedding,
        similarity_top_k=1,  # returns 1 image
        mode="default",
    )

    # execute the query against the image vector store
    image_retrieval_results = image_vector_store.query(
        image_vector_store_query
    )
    if image_retrieval_results.nodes:
        best_score = -1
        best_image = None

        for node, clip_score in zip(image_retrieval_results.nodes, image_retrieval_results.similarities):
            image_path = node.metadata["filepath"]
            image_summary = node.metadata.get("summary", "")  # Assuming summaries are stored in metadata

            # Calculate text similarity between query and image summary
            summary_embedding = text_embed_model.embed_query(image_summary)
            # text_score = util.cosine_similarity(
            #     [clinical_query_embedding], [summary_embedding]
            # )[0][0]
            # Use util.cos_sim for cosine similarity
            text_score = util.cos_sim(torch.tensor([clinical_query_embedding]),
                                      torch.tensor([summary_embedding]))[0][0].item()

            # Calculate average similarity score
            avg_score = (clip_score + text_score) / 2

            if avg_score > best_score:
                best_score = avg_score
                best_image = image_path

        return best_image, best_score

    return None, 0.0

def plot_image_retrieve_results(image_retrieval_results):
    """Take a list of image retrieval results and create a new figure."""

    plt.figure(figsize=(16, 5))

    img_cnt = 0

    # Iterate over the image retrieval results, and for each result, display the corresponding image and its score in a subplot.
    # The title of the subplot is the score of the image, formatted to four decimal places.

    for returned_image, score in zip(
        image_retrieval_results.nodes, image_retrieval_results.similarities
    ):
        img_name = returned_image.text
        img_path = returned_image.metadata["filepath"]

        # Read image using OpenCV
        image = cv2.imread(img_path)
        # Convert image to RGB format (OpenCV reads in BGR by default)
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.subplot(2, 3, img_cnt + 1)
        plt.title("{:.4f}".format(score))

        plt.imshow(image_rgb)
        plt.xticks([])
        plt.yticks([])
        img_cnt += 1

    plt.tight_layout()
    plt.show()

def get_all_images():
    image_paths = []
    for _, metadata in image_metadata_dict.items():
        image_paths.append(metadata["img_path"])
    return image_paths

def load_image(image_path):
    return Image.open(image_path)

# Define the combined query function
def combined_query(query, similarity_threshold=0.3):
    # Text query
    text_response = streaming_qe.query(query)
    text_result = ""
    for text in text_response.response_gen:
        text_result += text

    # Image query
    top_image_path, similarity_score = retrieve_results_from_image_index(query)

    if similarity_score >= similarity_threshold:
        return text_result, top_image_path, similarity_score
    else:
        return text_result, None, similarity_score

def gradio_interface(query):
    text_result, image_path, similarity_score = combined_query(query)
    top_image = load_image(image_path) if image_path else None
    all_images = [load_image(path) for path in get_all_images()]
    return text_result, top_image, all_images, f"Similarity Score: {similarity_score:.4f}"

with gr.Blocks() as iface:
    gr.Markdown("# Medical Knowledge Base Query System")

    with gr.Row():
        query_input = gr.Textbox(lines=2, placeholder="Enter your medical query here...")
        submit_button = gr.Button("Submit")

    with gr.Row():
        text_output = gr.Textbox(label="Text Response")
        image_output = gr.Image(label="Top Related Image (if similarity > threshold)")

    similarity_score_output = gr.Textbox(label="Similarity Score")

    gallery_output = gr.Gallery(label="All Extracted Images", show_label=True, elem_id="gallery")

    submit_button.click(
        fn=gradio_interface,
        inputs=query_input,
        outputs=[text_output, image_output, gallery_output, similarity_score_output]
    )

    # Load all images on startup
    iface.load(lambda: ["", None, [load_image(path) for path in get_all_images()], ""],
               outputs=[text_output, image_output, gallery_output, similarity_score_output])
# Launch the Gradio interface
iface.launch(share=True)
# just to check if it works or not
# def image_query(query):
#     image_retrieval_results = retrieve_results_from_image_index(query)
#     plot_image_retrieve_results(image_retrieval_results)

# query1 = "What is gingivitis?"
# # generate image retrieval results
# image_query(query1)

# # Modify your text query function
# # def text_query(query):
# #     text_retrieval_results = process_query(query, text_embed_model, k=10)
# #     return text_retrieval_results
# # Function to query the text vector database


# def text_query(query: str, k: int = 10):
#     # Create a VectorStoreIndex from the existing vector store
#     index = VectorStoreIndex.from_vector_store(text_vector_store)

#     # Create a retriever with top-k configuration
#     retriever = index.as_retriever(similarity_top_k=k)

#     # Create a query engine
#     query_engine = RetrieverQueryEngine.from_args(retriever)

#     # Execute the query
#     response = query_engine.query(query)

#     return response

# # text_retrieval_results = text_query(query1)
# streaming_response = streaming_qe.query(
#     query1
# )
# streaming_response.print_response_stream()
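The image retriever above fuses two signals per candidate: the CLIP text-to-image similarity returned by the vector store and the ClinicalBERT cosine similarity between the query and the Gemini-generated image summary, averaged with equal weight. A self-contained sketch of that fusion step (NumPy only; all numbers below are illustrative, not real scores):

import numpy as np

def cosine(a, b):
    # Plain cosine similarity between two vectors.
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))

clip_score = 0.31                  # similarity reported by the image vector store
query_emb = [0.20, 0.70, 0.10]     # toy stand-in for the ClinicalBERT query embedding
summary_emb = [0.25, 0.65, 0.05]   # toy stand-in for the image-summary embedding
text_score = cosine(query_emb, summary_emb)

avg_score = (clip_score + text_score) / 2
print(f"fused score: {avg_score:.4f}")  # compared against similarity_threshold (0.3) in combined_query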
freeze
ADDED
File without changes
images.py
ADDED
@@ -0,0 +1,12 @@
from unstructured.partition.pdf import partition_pdf
output_path = "./images"
raw_pdf_elements = partition_pdf(
    filename="./Data/AC-Aids-for-Dogs_Canine-Periodontal-Disease.pdf",
    extract_images_in_pdf=True,
    infer_table_structure=True,
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    extract_image_block_output_dir=output_path,
)
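Besides writing the extracted figures to ./images, partition_pdf returns a flat list of chunked elements. A hedged sketch of how those elements are commonly separated into table and text chunks with unstructured (illustrative only; images.py itself stops after the extraction call):

# Split the returned elements by their element type name.
table_chunks = []
text_chunks = []
for element in raw_pdf_elements:
    if "Table" in str(type(element)):
        table_chunks.append(str(element))
    elif "CompositeElement" in str(type(element)):
        text_chunks.append(str(element))

print(f"{len(table_chunks)} table chunks, {len(text_chunks)} text chunks")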
images/architecture.png
ADDED
images/figure-1-1.jpg
ADDED
images/figure-1-10.jpg
ADDED
images/figure-1-11.jpg
ADDED
images/figure-1-2.jpg
ADDED
images/figure-1-3.jpg
ADDED
images/figure-1-4.jpg
ADDED
images/figure-1-5.jpg
ADDED
images/figure-1-6.jpg
ADDED
images/figure-1-7.jpg
ADDED
images/figure-1-8.jpg
ADDED
images/figure-1-9.jpg
ADDED
images/multimodal.png
ADDED
Git LFS Details
images1/figure-1-1.jpg
ADDED
images1/figure-1-10.jpg
ADDED
images1/figure-1-11.jpg
ADDED
images1/figure-1-2.jpg
ADDED
images1/figure-1-3.jpg
ADDED
images1/figure-1-4.jpg
ADDED
images1/figure-1-5.jpg
ADDED
images1/figure-1-6.jpg
ADDED