baderanas committed on
Commit cdf244e · verified · 1 Parent(s): 0b87e1e

Upload 12 files
.gitignore ADDED
@@ -0,0 +1,174 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Ruff stuff:
171
+ .ruff_cache/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
README.md CHANGED
@@ -1,10 +1 @@
1
- ---
2
- title: Rag Medical
3
- emoji: 🔥
4
- colorFrom: purple
5
- colorTo: gray
6
- sdk: static
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # albariqi-rag
app.py ADDED
@@ -0,0 +1,277 @@
1
+ import streamlit as st
2
+ import os
3
+ from chroma_operations.ingestion import ingest
4
+ from rag import ask_question
5
+ from chroma_operations.delete_chroma import remove_from_chroma
6
+ import json
7
+
8
+
9
+ # Get list of processed PDF file names without `.pdf`
10
+ def get_processed_file_names(folder_path="docs/processed"):
11
+ try:
12
+ files = os.listdir(folder_path)
13
+ pdfs = [f[:-4] for f in files if f.endswith(".pdf")]
14
+ return sorted(pdfs)
15
+ except Exception as e:
16
+ st.error(f"Error reading folder: {e}")
17
+ return []
18
+
19
+
20
+ st.set_page_config(page_title="RAG Demo", layout="centered")
21
+ st.title("📄 Retrieval-Augmented Generation (RAG) Demo")
22
+
23
+ # Create tabs for different functionalities
24
+ tab1, tab2, tab3 = st.tabs(["Ask Questions", "Upload Documents", "Manage Files"])
25
+
26
+ with tab1:
27
+ st.markdown("Ask a question based on a specific processed document.")
28
+
29
+ # Add a refresh button
30
+ if st.button("🔄 Refresh Document List"):
31
+ st.success("Document list refreshed!")
32
+ # No need to do anything else - Streamlit will rerun and refresh the list
33
+
34
+ # Fetch available document names
35
+ doc_names = get_processed_file_names()
36
+
37
+ # Select box with search
38
+ file_name = st.selectbox(
39
+ "📁 Select a Document",
40
+ doc_names,
41
+ index=0 if doc_names else None,
42
+ placeholder="Type to search...",
43
+ )
44
+
45
+ # User question
46
+ query_text = st.text_input(
47
+ "🧠 Your Question",
48
+ placeholder="e.g. What are the treatment steps for diabetes?",
49
+ )
50
+
51
+ if st.button("Ask"):
52
+ if not query_text or not file_name:
53
+ st.warning("Please fill in both the question and select a document.")
54
+ else:
55
+ with st.spinner("Processing..."):
56
+ try:
57
+ response = ask_question(query_text, file_name)
58
+
59
+ if response:
60
+ st.success("✅ Answer:")
61
+ st.markdown(f"**{response['answer']}**")
62
+
63
+ with st.expander("📚 Retrieved Chunks"):
64
+ for i, chunk in enumerate(response["chunks"]):
65
+ st.markdown(f"**Chunk {i+1}:** {chunk}")
66
+ else:
67
+ st.error(f"Error in the answer")
68
+
69
+ except Exception as e:
70
+ st.error(f"Failed to connect to the backend: {e}")
71
+
72
+ with tab2:
73
+ st.markdown("Upload new documents to be processed for the RAG system.")
74
+
75
+ # Ensure directories exist
76
+ os.makedirs("docs/unprocessed", exist_ok=True)
77
+ os.makedirs("docs/processed", exist_ok=True)
78
+
79
+ # File uploader
80
+ uploaded_file = st.file_uploader("Upload PDF Document", type=["pdf"])
81
+
82
+ if uploaded_file is not None:
83
+ st.info(f"File '{uploaded_file.name}' ready for upload")
84
+
85
+ # Create columns for buttons
86
+ col1, col2 = st.columns(2)
87
+
88
+ # Upload button
89
+ if col1.button("Upload to System"):
90
+ try:
91
+ # Save the uploaded file to the docs/unprocessed directory
92
+ with open(os.path.join("docs/unprocessed", uploaded_file.name), "wb") as f:
93
+ f.write(uploaded_file.getbuffer())
94
+ st.success(f"File '{uploaded_file.name}' saved to docs/unprocessed/")
95
+ except Exception as e:
96
+ st.error(f"Error saving file: {e}")
97
+
98
+ # Ingest button
99
+ if col2.button("Process Document"):
100
+ try:
101
+ with st.spinner("Processing document... This may take a while."):
102
+ # Call the ingestion script
103
+ result = ingest()
104
+
105
+ if result:
106
+ st.success("Document processed successfully!")
107
+ # Refresh the list of available documents
108
+ doc_names = get_processed_file_names()
109
+ else:
110
+ st.error(f"Error processing document")
111
+ except Exception as e:
112
+ st.error(f"Error running ingestion process: {e}")
113
+
114
+ # Display list of files in unprocessed folder
115
+ st.subheader("Unprocessed Documents")
116
+ try:
117
+ unprocessed_files = os.listdir("docs/unprocessed")
118
+ if unprocessed_files:
119
+ for file in unprocessed_files:
120
+ st.text(f"• {file}")
121
+ else:
122
+ st.info("No unprocessed documents.")
123
+ except Exception as e:
124
+ st.error(f"Error reading unprocessed folder: {e}")
125
+
126
+ # Display list of processed files
127
+ st.subheader("Processed Documents")
128
+ try:
129
+ processed_files = os.listdir("docs/processed")
130
+ processed_files = [f for f in processed_files if f.endswith(".pdf")]
131
+ if processed_files:
132
+ for file in processed_files:
133
+ st.text(f"• {file}")
134
+ else:
135
+ st.info("No processed documents.")
136
+ except Exception as e:
137
+ st.error(f"Error reading processed folder: {e}")
138
+
139
+ with tab3:
140
+ st.markdown(
141
+ "Manage your documents by deleting files from processed or unprocessed folders."
142
+ )
143
+
144
+ col1, col2 = st.columns(2)
145
+
146
+ with col1:
147
+ st.subheader("Delete Unprocessed Documents")
148
+ try:
149
+ unprocessed_files = os.listdir("docs/unprocessed")
150
+ if unprocessed_files:
151
+ file_to_delete_unprocessed = st.selectbox(
152
+ "Select file to delete from unprocessed folder",
153
+ unprocessed_files,
154
+ key="unprocessed_select",
155
+ )
156
+
157
+ if st.button("Delete Unprocessed File", key="delete_unprocessed"):
158
+ try:
159
+ file_path = os.path.join(
160
+ "docs/unprocessed", file_to_delete_unprocessed
161
+ )
162
+ os.remove(file_path)
163
+ st.success(f"Successfully deleted {file_to_delete_unprocessed}")
164
+ # Force refresh the app to show the updated file list
165
+ st.rerun()
166
+ except Exception as e:
167
+ st.error(f"Error deleting file: {e}")
168
+ else:
169
+ st.info("No unprocessed documents to delete.")
170
+ except Exception as e:
171
+ st.error(f"Error accessing unprocessed folder: {e}")
172
+
173
+ with col2:
174
+ st.subheader("Delete Processed Documents")
175
+ try:
176
+ processed_files = [
177
+ f for f in os.listdir("docs/processed") if f.endswith(".pdf")
178
+ ]
179
+ if processed_files:
180
+ file_to_delete_processed = st.selectbox(
181
+ "Select file to delete from processed folder",
182
+ processed_files,
183
+ key="processed_select",
184
+ )
185
+
186
+ if st.button("Delete Processed File", key="delete_processed"):
187
+ try:
188
+ # Delete the PDF file
189
+ pdf_path = os.path.join(
190
+ "docs/processed", file_to_delete_processed
191
+ )
192
+ os.remove(pdf_path)
193
+
194
+ # Also delete the corresponding vector store if it exists
195
+ base_name = file_to_delete_processed[
196
+ :-4
197
+ ] # Remove .pdf extension
198
+ vector_store_path = os.path.join(
199
+ "docs/processed", f"{base_name}.faiss"
200
+ )
201
+ if os.path.exists(vector_store_path):
202
+ os.remove(vector_store_path)
203
+
204
+ # Delete metadata file if it exists
205
+ metadata_path = os.path.join(
206
+ "docs/processed", f"{base_name}_metadata.json"
207
+ )
208
+ if os.path.exists(metadata_path):
209
+ os.remove(metadata_path)
210
+
211
+ # Remove document from Chroma DB
212
+ with st.spinner("Removing document from vector database..."):
213
+ remove_from_chroma(base_name)
214
+
215
+ st.success(
216
+ f"Successfully deleted {file_to_delete_processed} and related files"
217
+ )
218
+ # Force refresh the app to show the updated file list
219
+ st.rerun()
220
+ except Exception as e:
221
+ st.error(f"Error deleting file: {e}")
222
+ else:
223
+ st.info("No processed documents to delete.")
224
+ except Exception as e:
225
+ st.error(f"Error accessing processed folder: {e}")
226
+
227
+ # Add a separator
228
+ st.markdown("---")
229
+
230
+ # Delete all files section
231
+ st.subheader("Bulk Operations")
232
+ col3, col4 = st.columns(2)
233
+
234
+ with col3:
235
+ if st.button(
236
+ "Delete ALL Unprocessed Files", type="primary", use_container_width=True
237
+ ):
238
+ try:
239
+ unprocessed_files = os.listdir("docs/unprocessed")
240
+ if unprocessed_files:
241
+ for file in unprocessed_files:
242
+ os.remove(os.path.join("docs/unprocessed", file))
243
+ st.success(
244
+ f"Successfully deleted all {len(unprocessed_files)} unprocessed files"
245
+ )
246
+ # Force refresh
247
+ st.rerun()
248
+ else:
249
+ st.info("No files to delete.")
250
+ except Exception as e:
251
+ st.error(f"Error during bulk deletion: {e}")
252
+
253
+ with col4:
254
+ if st.button(
255
+ "Delete ALL Processed Files", type="primary", use_container_width=True
256
+ ):
257
+ try:
258
+ processed_files = os.listdir("docs/processed")
259
+ if processed_files:
260
+ for file in processed_files:
261
+ file_path = os.path.join("docs/processed", file)
262
+ os.remove(file_path)
263
+
264
+ # If it's a PDF file, also remove from Chroma
265
+ if file.endswith(".pdf"):
266
+ base_name = file[:-4] # Remove .pdf extension
267
+ remove_from_chroma(base_name)
268
+
269
+ st.success(
270
+ f"Successfully deleted all {len(processed_files)} processed files"
271
+ )
272
+ # Force refresh
273
+ st.rerun()
274
+ else:
275
+ st.info("No files to delete.")
276
+ except Exception as e:
277
+ st.error(f"Error during bulk deletion: {e}")
certification_processing.log ADDED
File without changes
chroma_operations/delete_chroma.py ADDED
@@ -0,0 +1,152 @@
1
+ import os
2
+ import logging
3
+ from typing import Optional
4
+ import chromadb
5
+ from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
6
+ from dotenv import load_dotenv
7
+ import shutil
8
+
9
+ # Load environment variables
10
+ load_dotenv()
11
+
12
+ # Setup logging
13
+ logging.basicConfig(
14
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
15
+ )
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ def remove_from_chroma(
20
+ document_name: str, collection_name: str = "rag_collection"
21
+ ) -> bool:
22
+ """
23
+ Remove a document and its chunks from the Chroma vector database.
24
+
25
+ Args:
26
+ document_name (str): The base name of the document (without .pdf extension)
27
+ collection_name (str): Name of the collection in ChromaDB (default: "rag_collection")
28
+
29
+ Returns:
30
+ bool: True if successful, False otherwise
31
+ """
32
+ try:
33
+ logger.info(f"Attempting to remove document '{document_name}' from Chroma DB")
34
+
35
+ # Check if Chroma DB exists
36
+ chroma_path = "./chroma_db"
37
+ if not os.path.exists(chroma_path):
38
+ logger.warning("Chroma DB directory does not exist")
39
+ return False
40
+
41
+ # Initialize embedding function and Chroma client
42
+ embedding_function = OpenAIEmbeddingFunction(
43
+ api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
44
+ )
45
+
46
+ # Connect to the persistent client
47
+ client = chromadb.PersistentClient(path=chroma_path)
48
+
49
+ # Get the collection
50
+ try:
51
+ collection = client.get_collection(
52
+ name=collection_name, embedding_function=embedding_function
53
+ )
54
+ except Exception as e:
55
+ logger.error(f"Collection '{collection_name}' not found: {e}")
56
+ return False
57
+
58
+ # Delete documents where source_file matches the document_name
59
+ try:
60
+ # First, get the IDs of chunks belonging to this document
61
+ results = collection.get(where={"source_file": document_name})
62
+
63
+ ids_to_delete = results.get("ids", [])
64
+
65
+ if not ids_to_delete:
66
+ logger.warning(
67
+ f"No chunks found for document '{document_name}' in collection"
68
+ )
69
+ return True # Nothing to delete, so consider it successful
70
+
71
+ # Delete chunks by IDs
72
+ collection.delete(ids=ids_to_delete)
73
+
74
+ logger.info(
75
+ f"Successfully deleted {len(ids_to_delete)} chunks for '{document_name}' from ChromaDB"
76
+ )
77
+ return True
78
+
79
+ except Exception as e:
80
+ logger.error(f"Error deleting chunks from collection: {e}")
81
+ return False
82
+
83
+ except Exception as e:
84
+ logger.error(f"Error removing document from Chroma DB: {e}")
85
+ return False
86
+
87
+
88
+ def delete_all_from_chroma(collection_name: str = "rag_collection") -> bool:
89
+ """
90
+ Delete all documents from the specified ChromaDB collection.
91
+
92
+ Args:
93
+ collection_name (str): Name of the collection in ChromaDB (default: "rag_collection")
94
+
95
+ Returns:
96
+ bool: True if successful, False otherwise
97
+ """
98
+ try:
99
+ # Initialize embedding function and Chroma client
100
+ embedding_function = OpenAIEmbeddingFunction(
101
+ api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
102
+ )
103
+
104
+ # Connect to the persistent client
105
+ client = chromadb.PersistentClient(path="./chroma_db")
106
+
107
+ try:
108
+ # Get the collection
109
+ collection = client.get_collection(
110
+ name=collection_name, embedding_function=embedding_function
111
+ )
112
+
113
+ # Delete all documents in the collection (Chroma's delete() needs ids or a filter)
114
+ all_ids = collection.get().get("ids", [])
+ if all_ids:
+ collection.delete(ids=all_ids)
115
+ logger.info(
116
+ f"Successfully deleted all documents from collection '{collection_name}'"
117
+ )
118
+ return True
119
+
120
+ except Exception as e:
121
+ logger.error(f"Error accessing or deleting collection: {e}")
122
+ return False
123
+
124
+ except Exception as e:
125
+ logger.error(f"Error connecting to Chroma DB: {e}")
126
+ return False
127
+
128
+
129
+ def reset_chroma_db() -> bool:
130
+ """
131
+ Reset the entire Chroma database by deleting and recreating the directory.
132
+
133
+ Returns:
134
+ bool: True if successful, False otherwise
135
+ """
136
+ try:
137
+ chroma_path = "./chroma_db"
138
+ if os.path.exists(chroma_path):
139
+ # Delete the entire Chroma directory
140
+ shutil.rmtree(chroma_path)
141
+ logger.info("Successfully deleted entire Chroma DB directory")
142
+
143
+ # Create an empty directory
144
+ os.makedirs(chroma_path, exist_ok=True)
145
+ logger.info("Created fresh Chroma DB directory")
146
+ return True
147
+ else:
148
+ logger.warning("Chroma DB directory does not exist")
149
+ return False
150
+ except Exception as e:
151
+ logger.error(f"Error resetting Chroma DB: {e}")
152
+ return False
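A minimal usage sketch for these helpers, assuming OPENAI_API_KEY is set and "diabetes_guidelines" stands in for a real processed document's base name:

from chroma_operations.delete_chroma import remove_from_chroma, reset_chroma_db

# Remove every chunk ingested from "diabetes_guidelines.pdf" (placeholder name)
if remove_from_chroma("diabetes_guidelines"):
    print("Document removed from ./chroma_db")

# reset_chroma_db() would instead wipe and recreate the whole ./chroma_db directory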
chroma_operations/ingestion.py ADDED
@@ -0,0 +1,211 @@
1
+ import os
2
+ import shutil
3
+ import logging
4
+ from datetime import datetime
5
+ import hashlib
6
+ from typing import List, Optional
7
+ import chromadb
8
+ import openai
16
+ from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
17
+ from dotenv import load_dotenv
18
+
19
+ load_dotenv()
20
+
21
+
22
+ from semantic_chunking import semantic_chunking
23
+ from chroma_operations.pdf_processing import extract_pdf_content
24
+
25
+ # Configure logging
26
+ logging.basicConfig(
27
+ level=logging.INFO,
28
+ format="%(asctime)s - %(levelname)s - %(message)s",
29
+ handlers=[
30
+ logging.FileHandler("certification_processing.log"),
31
+ logging.StreamHandler(),
32
+ ],
33
+ )
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ def generate_chunk_id(file: str, chunk: str, position: int) -> str:
38
+ unique_str = f"{file}_{position}_{chunk}"
39
+ return hashlib.sha256(unique_str.encode()).hexdigest()
40
+
41
+
42
+ def move_processed_file(source_path: str, destination_dir: str) -> bool:
43
+ """Move a processed file to the destination directory."""
44
+ try:
45
+ if not os.path.exists(destination_dir):
46
+ os.makedirs(destination_dir)
47
+ destination_path = os.path.join(destination_dir, os.path.basename(source_path))
48
+ os.rename(source_path, destination_path)
49
+ return True
50
+ except Exception as e:
51
+ logger.error(f"Error moving file {source_path}: {str(e)}")
52
+ return False
53
+
54
+
55
+ def get_chroma_client():
56
+ """Initialize ChromaDB client with OpenAI embeddings."""
57
+ try:
58
+ # Initialize embedding function
59
+ embedding_function = OpenAIEmbeddingFunction(
60
+ api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
61
+ )
62
+
63
+ client = chromadb.PersistentClient(path="./chroma_db")
64
+ logger.info("Successfully connected to ChromaDB with OpenAI embeddings")
65
+ return client, embedding_function
66
+ except Exception as e:
67
+ logger.error(f"Error connecting to ChromaDB: {str(e)}")
68
+ logger.exception("Detailed stack trace:")
69
+ return None, None
70
+
71
+
72
+ def create_chroma_collection(
73
+ client, embedding_function, collection_name="rag_collection"
74
+ ):
75
+ """Create or get a ChromaDB collection with proper embedding function."""
76
+ try:
77
+ collection = client.get_or_create_collection(
78
+ name=collection_name,
79
+ embedding_function=embedding_function,
80
+ metadata={"description": "medical documents"},
81
+ )
82
+ logger.info(f"Initialized collection: {collection_name}")
83
+ return collection
84
+ except Exception as e:
85
+ logger.error(f"Error creating ChromaDB collection: {str(e)}")
86
+ logger.exception("Detailed stack trace:")
87
+ return None
88
+
89
+
90
+ def process_file(file_path: str, collection) -> bool:
91
+ """Process a file and add to ChromaDB collection."""
92
+ try:
93
+ if file_path.endswith(".pdf"):
94
+ contents = extract_pdf_content(file_path)
95
+ else:
96
+ logger.warning(f"Skipping unsupported file type: {file_path}")
97
+ return False
98
+
99
+ if not contents:
100
+ logger.warning(f"No content extracted from file: {file_path}")
101
+ return False
102
+
103
+ chunks: List[str] = []
104
+ for content in contents:
105
+ if not content.strip():
106
+ continue
107
+ if "\t" in content or "[TABLE]" in content:
108
+ chunks.append(content)
109
+ else:
110
+ try:
111
+ chunks.extend(semantic_chunking(content))
112
+ except Exception as e:
113
+ logger.error(
114
+ f"Error during chunking for file {file_path}: {str(e)}"
115
+ )
116
+ continue
117
+
118
+ if not chunks:
119
+ logger.warning(f"No valid chunks created for file: {file_path}")
120
+ return False
121
+
122
+ documents = []
123
+ metadatas = []
124
+ ids = []
125
+
126
+ source_file = os.path.basename(file_path)
127
+ if source_file.endswith(".pdf"):
128
+ source_file = source_file[:-4].strip()
129
+
130
+ for i, chunk in enumerate(chunks):
131
+ if not chunk.strip():
132
+ continue
133
+
134
+ try:
135
+ chunk_id = generate_chunk_id(file_path, chunk, i)
136
+ documents.append(chunk)
137
+ metadatas.append(
138
+ {
139
+ "chunk_id": chunk_id,
140
+ "source_file": source_file,
141
+ "position": i,
142
+ "processed_at": datetime.now().isoformat(),
143
+ }
144
+ )
145
+ ids.append(chunk_id)
146
+
147
+ except Exception as e:
148
+ logger.error(f"Error processing chunk from file {file_path}: {str(e)}")
149
+ continue
150
+
151
+ if documents:
152
+ try:
153
+ # Chroma will automatically generate embeddings using the collection's embedding function
154
+ collection.add(documents=documents, metadatas=metadatas, ids=ids)
155
+ logger.info(
156
+ f"Added {len(documents)} chunks from {file_path} to ChromaDB"
157
+ )
158
+ return True
159
+ except Exception as e:
160
+ logger.error(f"Error adding documents to ChromaDB: {str(e)}")
161
+ return False
162
+ return False
163
+
164
+ except Exception as e:
165
+ logger.error(f"Error processing file {file_path}: {str(e)}")
166
+ return False
167
+
168
+
169
+ def ingest():
170
+ try:
171
+ # Get client and embedding function together
172
+ chroma_client, embedding_function = get_chroma_client()
173
+ if not chroma_client or not embedding_function:
174
+ logger.error("Failed to initialize ChromaDB with embeddings")
175
+ return False
176
+
177
+ collection = create_chroma_collection(chroma_client, embedding_function)
178
+ if not collection:
179
+ logger.error("Failed to create or get ChromaDB collection")
180
+ return False
181
+
182
+ logger.info(f"Collection ready: {collection.name}")
183
+
184
+ unprocessed_dir = "docs/unprocessed"
185
+ processed_dir = "docs/processed"
186
+
187
+ if not os.path.exists(unprocessed_dir):
188
+ logger.error(f"Directory not found: {unprocessed_dir}")
189
+ return False
190
+
191
+ for file in os.listdir(unprocessed_dir):
192
+ file_path = os.path.join(unprocessed_dir, file)
193
+
194
+ if not os.path.isfile(file_path) or not file.lower().endswith(".pdf"):
195
+ continue
196
+
197
+ logger.info(f"Processing file: {file_path}")
198
+
199
+ if process_file(file_path, collection):
200
+ if not move_processed_file(file_path, processed_dir):
201
+ logger.error(f"Failed to move processed file: {file_path}")
202
+ else:
203
+ logger.error(f"Failed to process file: {file_path}")
204
+
205
+ logger.info("Processing completed")
206
+ return True
207
+
208
+ except Exception as e:
209
+ logger.error(f"Fatal error in ingestion: {str(e)}")
210
+ logger.exception("Detailed stack trace:")
211
+ return False
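A minimal sketch of running ingestion outside the Streamlit UI, assuming PDFs are already in docs/unprocessed and OPENAI_API_KEY is set:

from chroma_operations.ingestion import ingest

# Extracts each PDF in docs/unprocessed, chunks it, embeds the chunks with
# text-embedding-3-small, writes them to ./chroma_db, then moves the PDF to docs/processed.
if ingest():
    print("Ingestion finished")
else:
    print("Ingestion failed - see certification_processing.log")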
chroma_operations/pdf_processing.py ADDED
@@ -0,0 +1,52 @@
1
+ import pdfplumber
2
+ import logging
3
+ from typing import List, Union, Tuple
4
+ import os
5
+
6
+
7
+ # Set up logging
8
+ logging.basicConfig(level=logging.INFO)
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ def extract_pdf_content(pdf_path: str) -> List[str]:
13
+ """
14
+ Extract text and tables from PDF in their natural reading order.
15
+ Simplified version without positional processing.
16
+
17
+ Args:
18
+ pdf_path (str): Path to the PDF file
19
+
20
+ Returns:
21
+ List[str]: List of extracted content chunks (text and tables)
22
+ """
23
+ if not os.path.exists(pdf_path):
24
+ logger.error(f"PDF file not found: {pdf_path}")
25
+ return []
26
+
27
+ try:
28
+ with pdfplumber.open(pdf_path) as pdf:
29
+ content = []
30
+
31
+ for page in pdf.pages:
32
+ # First extract tables
33
+ tables = page.extract_tables()
34
+ for table in tables:
35
+ if table:
36
+ # Convert table to string representation
37
+ table_str = "\n".join(
38
+ ["\t".join(str(cell) for cell in row) for row in table]
39
+ )
40
+ content.append(f"[TABLE]\n{table_str}\n[/TABLE]")
41
+
42
+ # Then extract regular text
43
+ text = page.extract_text()
44
+ if text and text.strip():
45
+ content.append(text.strip())
46
+
47
+ logger.info(f"Successfully extracted content from {pdf_path}")
48
+ return content
49
+
50
+ except Exception as e:
51
+ logger.error(f"Error processing {pdf_path}: {str(e)}")
52
+ return []
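A short sketch of extract_pdf_content on its own; the path below is a placeholder:

from chroma_operations.pdf_processing import extract_pdf_content

blocks = extract_pdf_content("docs/unprocessed/example.pdf")  # placeholder path
for block in blocks:
    # Tables come back wrapped in [TABLE] ... [/TABLE]; everything else is plain page text
    kind = "table" if block.startswith("[TABLE]") else "text"
    print(kind, len(block), "chars")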
chroma_operations/retrieve.py ADDED
@@ -0,0 +1,49 @@
1
+ import os
2
+ import logging
3
+ from typing import List, Optional
4
+ import chromadb
5
+ from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
6
+ from dotenv import load_dotenv
7
+
8
+ # Load environment variables
9
+ load_dotenv()
10
+
11
+ # Setup logging
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ def search_similar_chunks(
17
+ query_text: str,
18
+ document_name: str,
19
+ collection_name: str = "rag_collection",
20
+ top_k: int = 5,
21
+ ):
22
+ """Search for top-k chunks similar to query_text within a specific document (source_file)."""
23
+ try:
24
+ # Initialize embedding function and Chroma client
25
+ embedding_function = OpenAIEmbeddingFunction(
26
+ api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
27
+ )
28
+ client = chromadb.PersistentClient(path="./chroma_db")
29
+
30
+ # Load the collection
31
+ collection = client.get_collection(
32
+ name=collection_name, embedding_function=embedding_function
33
+ )
34
+
35
+ # Query similar documents filtered by document_name
36
+ results = collection.query(
37
+ query_texts=[query_text],
38
+ n_results=top_k,
39
+ where={"source_file": document_name},
40
+ )
41
+
42
+ documents = results.get("documents", [[]])[0]
43
+ metadatas = results.get("metadatas", [[]])[0]
44
+
45
+ return documents
46
+
47
+ except Exception as e:
48
+ logger.error(f"Similarity search failed: {str(e)}")
49
+ return []
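A retrieval-only sketch, assuming OPENAI_API_KEY is set and the document was ingested earlier under the placeholder base name used here:

from chroma_operations.retrieve import search_similar_chunks

chunks = search_similar_chunks(
    query_text="first-line treatment for asthma in children",
    document_name="pediatric_asthma",  # placeholder base name (PDF name without .pdf)
    top_k=3,
)
for i, chunk in enumerate(chunks, 1):
    print(f"[{i}] {chunk[:120]}")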
llms.py ADDED
@@ -0,0 +1,114 @@
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from langchain.prompts import PromptTemplate
4
+ from langchain_groq import ChatGroq
5
+ from typing import Literal
6
+
7
+ # Load environment variables
8
+ load_dotenv()
9
+
10
+
11
+ # Initialize LLMs
12
+ def initialize_llms():
13
+ """Initialize and return the LLM instances"""
14
+ groq_api_key = os.getenv("GROQ_API_KEY")
15
+
16
+ return {
17
+ "llm": ChatGroq(
18
+ temperature=0.1, model="llama-3.3-70b-versatile", api_key=groq_api_key
19
+ ),
20
+ "step_back_llm": ChatGroq(
21
+ temperature=0, model="Gemma2-9B-IT", api_key=groq_api_key
22
+ ),
23
+ }
24
+
25
+
26
+ # Query refinement
27
+ def refine_query(query: str, llm: ChatGroq) -> str:
28
+ """Enhance pediatric medicine queries for better retrieval while preserving clinical intent"""
29
+ template = """
30
+ You are a medical language expert. Your task is to improve the following user question by:
31
+
32
+ - Correcting any grammatical or spelling errors
33
+ - Clarifying vague or ambiguous wording
34
+ - Improving sentence structure for readability and precision
35
+ - Maintaining the original meaning and clinical focus
36
+
37
+ Do not add new information. Do not expand abbreviations unless they are unclear. Do not include any commentary or explanation.
38
+
39
+ Original query: {original_query}
40
+
41
+ Improved medical question:
42
+ """
43
+
44
+ prompt = PromptTemplate(input_variables=["original_query"], template=template)
45
+
46
+ chain = prompt | llm
47
+ return chain.invoke({"original_query": query}).content
48
+
49
+
50
+ def query_to_retrieve(query, llm):
51
+ """Convert a query to a format suitable for retrieval"""
52
+ template = """
53
+ You are an expert in pediatric medical information retrieval.
54
+
55
+ Your task is to rewrite the following question into a single, concise sentence containing only the most relevant medical and pediatric concepts. This sentence will be used for semantic search in a vector database.
56
+
57
+ Instructions:
58
+ - Include only the core clinical focus (conditions, symptoms, treatments, procedures).
59
+ - Mention pediatric-specific entities if relevant (e.g., age group, child-specific medication).
60
+ - Remove all conversational language and filler.
61
+ - Preserve the original intent.
62
+ - Output only one clean, search-optimized sentence.
63
+
64
+ Original query: {original_query}
65
+
66
+ Search-ready query:
67
+ """
68
+
69
+ prompt = PromptTemplate(input_variables=["original_query"], template=template)
70
+
71
+ chain = prompt | llm
72
+ return chain.invoke({"original_query": query}).content
73
+
74
+
75
+ def answer_query_with_chunks(
76
+ query: str,
77
+ retrieved_docs,
78
+ llm: ChatGroq,
79
+ ) -> str:
80
+ try:
81
+ # Refine the user question before answering
82
+ query_improved = refine_query(query, llm)
83
+
84
+ if not retrieved_docs:
85
+ return "Sorry, no relevant medical information was found."
86
+
87
+ # Construct context for the LLM
88
+ context = "\n\n".join(retrieved_docs)
89
+
90
+ system_prompt = """
91
+ You are a pediatric medical assistant.
92
+
93
+ Based only on the provided context, answer the user's question concisely and accurately but with the necessary explanation.
94
+ If the answer is not present in the context, say: "The answer is not available in the current documents."
95
+
96
+ Context:
97
+ {context}
98
+
99
+ User question:
100
+ {query}
101
+
102
+ Answer:
103
+ """
104
+
105
+ prompt = PromptTemplate(
106
+ input_variables=["context", "query"],
107
+ template=system_prompt,
108
+ )
109
+
110
+ chain = prompt | llm
111
+ return chain.invoke({"context": context, "query": query_improved}).content
112
+
113
+ except Exception as e:
114
+ return f"An error occurred while answering the query: {str(e)}"
rag.py ADDED
@@ -0,0 +1,36 @@
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from llms import initialize_llms, answer_query_with_chunks, query_to_retrieve
4
+ from chroma_operations.retrieve import search_similar_chunks
5
+
6
+ # Initialize FastAPI
7
+ app = FastAPI()
8
+
9
+
10
+ # Define request model
11
+ class RAGRequest(BaseModel):
12
+ query_text: str
13
+ file_name: str
14
+ collection_name: str = "rag_collection"
15
+
16
+ # Load LLM once at startup
17
+ llms = initialize_llms()
18
+ llm = llms["llm"]
19
+
20
+
21
+ @app.post("/ask")
22
+ def ask_question(query_text, file_name, collection_name="rag_collection"):
23
+ try:
24
+ query_search = query_to_retrieve(query_text, llm)
25
+ retrieved_docs = search_similar_chunks(
26
+ query_search, file_name, collection_name
27
+ )
28
+
29
+ if not retrieved_docs:
30
+ raise HTTPException(status_code=404, detail="No matching documents found.")
31
+
32
+ answer = answer_query_with_chunks(query_text, retrieved_docs, llm)
33
+ return {"answer": answer, "chunks": retrieved_docs}
34
+
35
+ except Exception as e:
36
+ raise HTTPException(status_code=500, detail=str(e))
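Because ask_question declares plain parameters, FastAPI exposes them as query parameters on POST /ask. A client sketch, assuming the API is served locally with uvicorn rag:app --port 8000 and the document name is a placeholder:

import requests

resp = requests.post(
    "http://localhost:8000/ask",
    params={
        "query_text": "What are the treatment steps for diabetes?",
        "file_name": "diabetes_guidelines",  # placeholder base name of a processed PDF
    },
)
resp.raise_for_status()
print(resp.json()["answer"])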
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ sentence-transformers
2
+ pdfplumber
3
+ langchain
4
+ langchain-community
5
+ fastapi
6
+ openai
7
+ streamlit
8
+ requests
9
+ chromadb
10
+ langchain-groq
11
+ python-dotenv
semantic_chunking.py ADDED
@@ -0,0 +1,93 @@
1
+ import numpy as np
2
+ from sentence_transformers import SentenceTransformer
3
+
4
+ embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
5
+
6
+
7
+ def hybrid_split(text: str, max_len: int = 1024) -> list[str]:
8
+ """
9
+ Split text into chunks respecting sentence boundaries when possible,
10
+ keeping each chunk under max_len characters.
11
+
12
+ Args:
13
+ text: The text to split
14
+ max_len: Maximum length for each chunk
15
+
16
+ Returns:
17
+ List of text chunks
18
+ """
19
+ # Normalize text
20
+ text = text.replace("\r", "").replace("\n", " ").strip()
21
+
22
+ # Extract sentences (more robust regex for sentence detection)
23
+ import re
24
+
25
+ sentences = re.split(r"(?<=[.!?])\s+", text)
26
+
27
+ chunks = []
28
+ current_chunk = ""
29
+
30
+ for sentence in sentences:
31
+ if len(sentence) > max_len:
32
+ # First add the current chunk if it exists
33
+ chunks.append(sentence)
34
+
35
+ # Normal case - see if adding the sentence exceeds max_len
36
+ elif len(current_chunk) + len(sentence) + 1 > max_len:
37
+ # Add the current chunk and start a new one
38
+ chunks.append(current_chunk)
39
+ current_chunk = ""
40
+ else:
41
+ # Add to the current chunk
42
+ if current_chunk:
43
+ current_chunk += " " + sentence
44
+ else:
45
+ current_chunk = sentence
46
+
47
+ if current_chunk:
48
+ chunks.append(current_chunk)
49
+
50
+ return chunks
51
+
52
+
53
+ def cosine_similarity(vec1, vec2):
54
+ """Calculate the cosine similarity between two vectors."""
55
+ dot_product = np.dot(vec1, vec2)
56
+ norm_vec1 = np.linalg.norm(vec1)
57
+ norm_vec2 = np.linalg.norm(vec2)
58
+ return dot_product / (norm_vec1 * norm_vec2)
59
+
60
+
61
+ def get_embedding(text):
62
+ """Generate an embedding using SBERT."""
63
+ return embedding_model.encode(text, convert_to_numpy=True)
64
+
65
+
66
+ def semantic_chunking(text, threshold=0.75, max_chunk_size=8191):
67
+ """
68
+ Splits text into semantic chunks based on sentence similarity.
69
+ - threshold: Lower = more splits, Higher = fewer splits
70
+ - max_chunk_size: Maximum size of each chunk in characters
71
+ """
72
+ text = text.replace("\n", " ").replace("\r", " ").strip()
73
+ sentences = hybrid_split(text)
+ if not sentences:
+ return []
74
+ embeddings = [get_embedding(sent) for sent in sentences]
75
+
76
+ chunks = []
77
+ current_chunk = [sentences[0]]
78
+
79
+ for i in range(1, len(sentences)):
80
+ sim = cosine_similarity(embeddings[i - 1], embeddings[i])
81
+ if (
82
+ sim < threshold
83
+ or len(" ".join(current_chunk + [sentences[i]])) > max_chunk_size
84
+ ):
85
+ chunks.append(" ".join(current_chunk))
86
+ current_chunk = [sentences[i]]
87
+ else:
88
+ current_chunk.append(sentences[i])
89
+
90
+ if current_chunk:
91
+ chunks.append(" ".join(current_chunk))
92
+
93
+ return chunks
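A quick sketch of the chunker on a toy passage; a lower threshold splits more aggressively:

from semantic_chunking import semantic_chunking

text = (
    "Asthma is a chronic airway disease. Inhaled corticosteroids are the mainstay "
    "of controller therapy. Unrelatedly, the clinic opens at eight in the morning."
)
for chunk in semantic_chunking(text, threshold=0.75):
    print("-", chunk)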