Spaces: Build error

Commit 33f4e34 · Abhinav Gavireddi committed
Parent(s): 3301b3c

fix: removed redis to store embeddings in memory

Files changed:
- .github/workflows/ci.yaml  +30 -13
- Dockerfile  +8 -6
- src/app.py → app.py  +73 -42
- requirements.txt  +8 -2
- src/__init__.py  +2 -1
- src/config.py  +3 -1
- src/gpp.py  +39 -69
- src/qa.py  +45 -35
- src/retriever.py  +82 -39
- src/utils.py  +22 -27
.github/workflows/ci.yaml  CHANGED

@@ -8,29 +8,46 @@ on:
 
 jobs:
   build-and-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+      - name: Run tests
+        run: |
+          if [ -f tests/test.py ]; then python -m unittest discover -s tests; fi
 
   deploy-to-hf:
     needs: build-and-test
     runs-on: ubuntu-latest
+    if: github.ref == 'refs/heads/main'
     steps:
       - name: Checkout repo
        uses: actions/checkout@v3
        with:
+          fetch-depth: 0
 
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
 
+      - name: Log in to Hugging Face Docker
+        run: echo "${{ secrets.HF_TOKEN }}" | docker login --username ${{ secrets.HF_USERNAME }} --password-stdin docker.io
 
+      - name: Build Docker image
+        run: docker build -t docker.io/${{ secrets.HF_USERNAME }}/${{ secrets.HF_SPACE_NAME }}:latest .
+
+      - name: Push Docker image to Hugging Face
+        run: docker push docker.io/${{ secrets.HF_USERNAME }}/${{ secrets.HF_SPACE_NAME }}:latest
 
+      # Optionally, trigger a Space restart via the API (not strictly required)
+      - name: Restart Hugging Face Space
        run: |
+          pip install huggingface_hub
+          python -c "from huggingface_hub import HfApi; HfApi(token='${{ secrets.HF_TOKEN }}').restart_space('${{ secrets.HF_USERNAME }}', '${{ secrets.HF_SPACE_NAME }}')"
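One note on the restart step: huggingface_hub's HfApi.restart_space identifies the Space by a single repo_id string (for example "username/space_name"), so passing the username and Space name as two separate positional arguments may not do what is intended. A minimal sketch of the equivalent call, assuming the same secrets are exported as environment variables:

    # Hypothetical standalone restart script; HF_TOKEN, HF_USERNAME and HF_SPACE_NAME
    # are assumed to be available as environment variables.
    import os
    from huggingface_hub import HfApi

    api = HfApi(token=os.environ["HF_TOKEN"])
    api.restart_space(repo_id=f"{os.environ['HF_USERNAME']}/{os.environ['HF_SPACE_NAME']}")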
Dockerfile  CHANGED

@@ -1,6 +1,9 @@
 # Base image
 FROM python:3.10-slim
 
+RUN useradd -m -u 1000 user
+USER user
+
 # Set working directory
 WORKDIR /app
 
@@ -9,7 +12,9 @@ RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     build-essential \
     ffmpeg \
+    # for hnswlib (needed for OpenMP)
+    libgomp1 \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy and install Python dependencies
@@ -21,15 +26,12 @@ COPY src/ ./src/
 COPY tests/ ./tests/
 COPY app.py .
 
-# Copy env file if you want local dev (optional)
-# COPY .env .env
-
 # Expose Streamlit port
-EXPOSE
+EXPOSE 7860
 
 # Set environment variables
 ENV PYTHONUNBUFFERED=1
 ENV TOKENIZERS_PARALLELISM=false
 
 # Start Streamlit
-ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=
+ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
src/app.py → app.py  RENAMED

@@ -1,6 +1,8 @@
 import os
 import streamlit as st
 from datetime import datetime
+import re
+from werkzeug.utils import secure_filename
 
 from src.gpp import GPP, GPPConfig
 from src.qa import AnswerGenerator
@@ -60,42 +62,65 @@ with col1:
     st.header("1. Upload & Layout")
     uploaded_file = st.file_uploader("Select a PDF document", type=["pdf"], help="Supported: PDF files")
     if uploaded_file:
+        try:
+            filename = secure_filename(uploaded_file.name)
+            if not re.match(r'^[\w\-. ]+$', filename):
+                st.error("Invalid file name.")
+            elif st.button("Parse Document"):
+                output_dir = os.path.join("./parsed", filename)
+                os.makedirs(output_dir, exist_ok=True)
+                pdf_path = os.path.join(output_dir, filename)
+                with open(pdf_path, "wb") as f:
+                    f.write(uploaded_file.getbuffer())
+                with st.spinner("Parsing document with MinerU and LLM...⏳"):
+                    try:
+                        gpp = GPP(GPPConfig())
+                        parsed = gpp.run(pdf_path, output_dir)
+                        st.success("✅ Parsing complete!")
+                        st.session_state.parsed = parsed
+                    except Exception as e:
+                        st.error(f"Parsing failed: {e}")
+                        st.session_state.parsed = None
+        except Exception as e:
+            st.error(f"File upload failed: {e}")
     parsed = st.session_state.parsed
     if parsed:
+        try:
+            st.subheader("Layout Preview")
+            layout_pdf = parsed.get("layout_pdf")
+            if layout_pdf and os.path.exists(layout_pdf):
+                st.markdown(f"[Open Layout PDF]({layout_pdf})")
+            st.subheader("Extracted Content (Preview)")
+            md_path = parsed.get("md_path")
+            if md_path and os.path.exists(md_path):
+                try:
+                    with open(md_path, 'r', encoding='utf-8') as md_file:
+                        md_text = md_file.read()
+                    st.markdown(f"<div class='card'><pre>{md_text[:2000]}{'...' if len(md_text)>2000 else ''}</pre></div>", unsafe_allow_html=True)
+                except Exception as e:
+                    st.error(f"Error reading markdown: {e}")
+        except Exception as e:
+            st.error(f"Error displaying preview: {e}")
 
 # --- Center Column: Q&A ---
 with col2:
     st.header("2. Ask a Question")
     if parsed:
+        try:
+            question = st.text_input("Type your question here:", placeholder="E.g., 'What was the Q2 revenue?'" )
+            if st.button("Get Answer") and question:
+                with st.spinner("Retrieving answer...🤖"):
+                    try:
+                        generator = AnswerGenerator()
+                        answer, supporting_chunks = generator.answer(parsed['chunks'], question)
+                        st.markdown(f"<div class='card'><h3>Answer</h3><p>{answer}</p></div>", unsafe_allow_html=True)
+                        st.markdown("<div class='card'><h4>Supporting Context</h4></div>", unsafe_allow_html=True)
+                        for sc in supporting_chunks:
+                            st.write(f"- {sc['narration']}")
+                    except Exception as e:
+                        st.error(f"Failed to generate answer: {e}")
+        except Exception as e:
+            st.error(f"Error in Q&A section: {e}")
     else:
         st.info("Upload and parse a document to ask questions.")
 
@@ -103,18 +128,24 @@ with col2:
 with col3:
     st.header("3. Relevant Chunks")
     if parsed:
+        try:
+            chunks = parsed.get('chunks', [])
+            for idx, chunk in enumerate(chunks):
+                with st.expander(f"Chunk {idx} - {chunk['type'].capitalize()}"):
+                    try:
+                        st.write(chunk.get('narration', ''))
+                        if 'table_structure' in chunk:
+                            st.write("**Parsed Table:**")
+                            st.table(chunk['table_structure'])
+                        for blk in chunk.get('blocks', []):
+                            if blk.get('type') == 'img_path':
+                                img_path = os.path.join(parsed['images_dir'], blk.get('img_path',''))
+                                if os.path.exists(img_path):
+                                    st.image(img_path, caption=os.path.basename(img_path))
+                    except Exception as e:
+                        st.error(f"Error displaying chunk: {e}")
+            st.info(f"Total chunks: {len(chunks)}")
+        except Exception as e:
+            st.error(f"Error displaying chunks: {e}")
     else:
         st.info("No chunks to display. Parse a document first.")
requirements.txt  CHANGED

@@ -4,9 +4,15 @@ mineru>=0.1.0
 sentence-transformers>=2.2.2
 rank-bm25>=0.2.2
 redis>=4.5.1
+hnswlib>=0.7.0
 transformers>=4.29.2
 torch>=2.0.0
 openai>=0.27.0
 huggingface-hub>=0.16.4
+langchain>=0.1.9
+python-dotenv>=1.0.0
+structlog>=23.1.0
+bleach>=6.0.0
+
+# Testing
+pytest>=7.0
src/__init__.py  CHANGED

@@ -19,7 +19,8 @@ def configure_logging():
         wrapper_class=structlog.stdlib.BoundLogger,
         cache_logger_on_first_use=True,
     )
-    logging.
+    if not logging.getLogger().handlers:
+        logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 
 def get_env(name):
     val = os.getenv(name)
src/config.py  CHANGED

@@ -15,7 +15,9 @@ class EmbeddingConfig:
     META_MODEL = os.getenv('META_EMBED_MODEL', 'sentence-transformers/all-MiniLM-L6-v2')
 
 class RetrieverConfig:
-    TOP_K = int(os.getenv('RETRIEVER_TOP_K', 10))
+    TOP_K = int(os.getenv('RETRIEVER_TOP_K', 10))  # number of candidates per retrieval path
+    DENSE_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
+    ANN_TOP = int(os.getenv('ANN_TOP', 50))
 
 class RerankerConfig:
     MODEL_NAME = os.getenv('RERANKER_MODEL', 'BAAI/bge-reranker-v2-Gemma')
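As a quick illustration, a minimal sketch (with placeholder values) of overriding these retriever settings through the environment variables the config reads, before the module is imported:

    # Hypothetical override of the retriever settings; the variable names match the
    # os.getenv calls above, the values are only examples.
    import os

    os.environ["RETRIEVER_TOP_K"] = "20"   # candidates per retrieval path
    os.environ["ANN_TOP"] = "100"          # hnswlib ef search parameter

    from src.config import RetrieverConfig  # values are read at import time

    print(RetrieverConfig.TOP_K, RetrieverConfig.ANN_TOP)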
src/gpp.py  CHANGED

@@ -8,7 +8,7 @@ This module handles:
 4. Parsing markdown tables into JSON 2D structures for dense tables
 5. Narration of tables/images via LLM
 6. Semantic enhancements (deduplication, coreference, metadata summarization)
-7. Embedding computation
+7. Embedding computation for in-memory use
 
 Each step is modular to support swapping components (e.g. different parsers or stores).
 """
@@ -27,7 +27,6 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from sentence_transformers import SentenceTransformer
 from rank_bm25 import BM25Okapi
 import numpy as np
-import redis
 
 # LLM client abstraction
 from src.utils import LLMClient
@@ -71,22 +70,12 @@ class GPPConfig:
     TEXT_EMBED_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
     META_EMBED_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
 
-    # Redis settings
-    REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
-    REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
-    REDIS_DB = int(os.getenv('REDIS_DB', 0))
-    REDIS_VECTOR_INDEX = 'gpp_vectors'
-
 class GPP:
     def __init__(self, config: GPPConfig):
         self.config = config
         # Embedding models
         self.text_embedder = SentenceTransformer(config.TEXT_EMBED_MODEL)
         self.meta_embedder = SentenceTransformer(config.META_EMBED_MODEL)
-        # Redis for vectors + metadata
-        self.redis = redis.Redis(host=config.REDIS_HOST,
-                                 port=config.REDIS_PORT,
-                                 db=config.REDIS_DB)
         self.bm25 = None
 
     def parse_pdf(self, pdf_path: str, output_dir: str) -> Dict[str, Any]:
@@ -179,42 +168,43 @@ class GPP:
             c['narration'] = c['text']
 
     def deduplicate(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        try:
+            embs = self.text_embedder.encode([c.get('narration', '') for c in chunks], convert_to_tensor=True)
+            keep = []
+            for i, emb in enumerate(embs):
+                if not any((emb @ embs[j]).item() / (np.linalg.norm(emb) * np.linalg.norm(embs[j]) + 1e-8)
+                           > self.config.DEDUP_SIM_THRESHOLD for j in keep):
+                    keep.append(i)
+            deduped = [chunks[i] for i in keep]
+            logger.info(f"Deduplicated: {len(chunks)}→{len(deduped)}")
+            return deduped
+        except Exception as e:
+            logger.error(f"Deduplication failed: {e}")
+            return chunks
 
     def coref_resolution(self, chunks: List[Dict[str, Any]]) -> None:
-        """
-        Resolve pronouns using preceding context via LLM.
-        """
         for idx, c in enumerate(chunks):
             start = max(0, idx-self.config.COREF_CONTEXT_SIZE)
-            ctx = "\n".join(chunks[i]
-            prompt = f"Context:\n{ctx}\nRewrite pronouns in:\n{c
+            ctx = "\n".join(chunks[i].get('narration', '') for i in range(start, idx))
+            prompt = f"Context:\n{ctx}\nRewrite pronouns in:\n{c.get('narration', '')}"
+            try:
+                c['narration'] = LLMClient.generate(prompt)
+            except Exception as e:
+                logger.error(f"Coref resolution failed for chunk {idx}: {e}")
 
     def metadata_summarization(self, chunks: List[Dict[str, Any]]) -> None:
-        """
-        Summarize sections and attach to metadata for self-contained context.
-        """
         sections: Dict[str, List[Dict[str, Any]]] = {}
         for c in chunks:
             sec = c.get('section', 'default')
             sections.setdefault(sec, []).append(c)
         for sec, items in sections.items():
-            blob = "\n".join(i
+            blob = "\n".join(i.get('narration', '') for i in items)
+            try:
+                summ = LLMClient.generate(f"Summarize this section:\n{blob}")
+                for i in items:
+                    i.setdefault('metadata', {})['section_summary'] = summ
+            except Exception as e:
+                logger.error(f"Metadata summarization failed for section {sec}: {e}")
 
     def build_bm25(self, chunks: List[Dict[str, Any]]) -> None:
         """
@@ -223,31 +213,20 @@ class GPP:
         tokenized = [c['narration'].split() for c in chunks]
         self.bm25 = BM25Okapi(tokenized)
 
-    def compute_and_store(self, chunks: List[Dict[str, Any]]) -> None:
-        for i, (c, te) in enumerate(zip(chunks, txt_embs)):
-            key = f"chunk:{i}"
-            # store metadata
-            store = {'narration': c['narration'], 'type': c['type']}
-            if 'table_structure' in c:
-                store['table_structure'] = json.dumps(c['table_structure'])
-            pipe.hset(key, mapping=store)
-            # store dense vector
-            pipe.hset(self.config.REDIS_VECTOR_INDEX, key, te.tobytes())
-        pipe.execute()
-        logger.info("Stored embeddings and metadata in Redis.")
+    # def compute_and_store(self, chunks: List[Dict[str, Any]]) -> None:
+    #     try:
+    #         txts = [c.get('narration', '') for c in chunks]
+    #         metas = [c.get('metadata', {}).get('section_summary', '') for c in chunks]
+    #         txt_embs = self.text_embedder.encode(txts)
+    #         meta_embs = self.meta_embedder.encode(metas)
+    #         # No Redis storage, just keep for in-memory use or return as needed
+    #         logger.info("Computed embeddings for chunks.")
+    #     except Exception as e:
+    #         logger.error(f"Failed to compute embeddings: {e}")
 
     def run(self, pdf_path: str, output_dir: str) -> Dict[str, Any]:
         """
-        Executes full GPP: parse
+        Executes full GPP: parse -> chunk -> narrate -> enhance -> index.
         Returns parse output dict augmented with `chunks` for downstream processes.
         """
         parsed = self.parse_pdf(pdf_path, output_dir)
@@ -258,16 +237,7 @@ class GPP:
         self.coref_resolution(chunks)
         self.metadata_summarization(chunks)
         self.build_bm25(chunks)
-        self.compute_and_store(chunks)
+        # self.compute_and_store(chunks)
         parsed['chunks'] = chunks
         logger.info("GPP pipeline complete.")
         return parsed
-
-if __name__ == '__main__':
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument('pdf')
-    parser.add_argument('outdir')
-    args = parser.parse_args()
-    gpp = GPP(GPPConfig())
-    gpp.run(args.pdf, args.outdir)
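With the `if __name__ == '__main__'` CLI block removed, the pipeline is now driven programmatically; a minimal sketch of the direct call, with placeholder paths:

    # Hypothetical driver for the GPP pipeline; the paths are placeholders.
    from src.gpp import GPP, GPPConfig

    gpp = GPP(GPPConfig())
    parsed = gpp.run("document.pdf", "./parsed/document")  # parse -> chunk -> narrate -> enhance -> index
    print(f"{len(parsed['chunks'])} chunks ready for retrieval")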
src/qa.py  CHANGED

@@ -25,7 +25,7 @@ from src.retriever import Retriever, RetrieverConfig
 
 
 class RerankerConfig:
-    MODEL_NAME = 'BAAI/bge-reranker-v2-Gemma'
+    MODEL_NAME = os.getenv('RERANKER_MODEL', 'BAAI/bge-reranker-v2-Gemma')
     DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 
 class Reranker:
@@ -33,26 +33,36 @@ class Reranker:
     Cross-encoder re-ranker using a transformer-based sequence classification model.
     """
     def __init__(self, config: RerankerConfig):
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
+            self.model = AutoModelForSequenceClassification.from_pretrained(config.MODEL_NAME)
+            self.model.to(config.DEVICE)
+        except Exception as e:
+            logger.error(f'Failed to load reranker model: {e}')
+            raise
 
     def rerank(self, query: str, candidates: List[Dict[str, Any]], top_k: int) -> List[Dict[str, Any]]:
         """Score each candidate and return top_k sorted by relevance."""
+        if not candidates:
+            logger.warning('No candidates provided to rerank.')
+            return []
+        try:
+            inputs = self.tokenizer(
+                [query] * len(candidates),
+                [c.get('narration', '') for c in candidates],
+                padding=True,
+                truncation=True,
+                return_tensors='pt'
+            ).to(RerankerConfig.DEVICE)
+            with torch.no_grad():
+                logits = self.model(**inputs).logits.squeeze(-1)
+            scores = torch.sigmoid(logits).cpu().numpy()
+            paired = list(zip(candidates, scores))
+            ranked = sorted(paired, key=lambda x: x[1], reverse=True)
+            return [c for c, _ in ranked[:top_k]]
+        except Exception as e:
+            logger.error(f'Reranking failed: {e}')
+            return candidates[:top_k]
 
 
 class AnswerGenerator:
@@ -66,23 +76,23 @@ class AnswerGenerator:
     def answer(self, chunks: List[Dict[str, Any]], question: str) -> Tuple[str, List[Dict[str, Any]]]:
         logger.info('Answering question', question=question)
         question = sanitize_html(question)
+        try:
+            retriever = Retriever(chunks, self.ret_config)
+            candidates = retriever.retrieve(question)
+            reranker = Reranker(self.rerank_config)
+            top_chunks = reranker.rerank(question, candidates, top_k=5)
+            context = "\n\n".join([f"- {c.get('narration', '')}" for c in top_chunks])
+            prompt = (
+                f"You are a knowledgeable assistant. "
+                f"Use the following extracted document snippets to answer the question."
+                f"\n\nContext:\n{context}"
+                f"\n\nQuestion: {question}\nAnswer:"
+            )
+            answer = LLMClient.generate(prompt)
+            return answer, top_chunks
+        except Exception as e:
+            logger.error(f'Failed to answer question: {e}')
+            return "Failed to generate answer due to error.", []
 
 # Example usage:
 # generator = AnswerGenerator()
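The trailing example-usage comment is cut off in the diff view; a minimal end-to-end sketch of the Q&A path, assuming the chunks come from GPP.run (paths and question text are placeholders):

    # Hypothetical end-to-end question answering over chunks produced by GPP.run().
    from src.gpp import GPP, GPPConfig
    from src.qa import AnswerGenerator

    parsed = GPP(GPPConfig()).run("document.pdf", "./parsed/document")  # placeholder paths
    generator = AnswerGenerator()
    answer, supporting_chunks = generator.answer(parsed["chunks"], "What was the Q2 revenue?")
    print(answer)
    for chunk in supporting_chunks:
        print("-", chunk["narration"])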
src/retriever.py  CHANGED

@@ -1,64 +1,107 @@
 import os
 import numpy as np
-import redis
 import hnswlib
 from typing import List, Dict, Any
 
 from sentence_transformers import SentenceTransformer
 from rank_bm25 import BM25Okapi
 
-REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
-REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
-REDIS_DB = int(os.getenv('REDIS_DB', 0))
-REDIS_VECTOR_INDEX = 'gpp_vectors'
+from src.config import RetrieverConfig
+from src import logger
+
 
 class Retriever:
     """
-    Hybrid retriever combining BM25 sparse and
+    Hybrid retriever combining BM25 sparse and dense retrieval (no Redis).
     """
     def __init__(self, chunks: List[Dict[str, Any]], config: RetrieverConfig):
-        corpus = [c['narration'].split() for c in chunks]
-        self.bm25 = BM25Okapi(corpus)
-        # Load dense embedder
-        self.embedder = SentenceTransformer(config.DENSE_MODEL)
-        # Connect to Redis for vector store
-        self.redis = redis.Redis(host=config.REDIS_HOST,
-                                 port=config.REDIS_PORT,
-                                 db=config.REDIS_DB)
-        self.vector_index = config.REDIS_VECTOR_INDEX
+        """
+        Initialize the retriever with chunks and configuration.
+
+        Args:
+            chunks (List[Dict[str, Any]]): List of chunks, where each chunk is a dictionary.
+            config (RetrieverConfig): Configuration for the retriever.
+        """
+        self.chunks = chunks
+        try:
+            if not isinstance(chunks, list) or not all(isinstance(c, dict) for c in chunks):
+                logger.error("Chunks must be a list of dicts.")
+                raise ValueError("Chunks must be a list of dicts.")
+            corpus = [c.get('narration', '').split() for c in chunks]
+            self.bm25 = BM25Okapi(corpus)
+            self.embedder = SentenceTransformer(config.DENSE_MODEL)
+            dim = len(self.embedder.encode(["test"])[0])
+            self.ann = hnswlib.Index(space='cosine', dim=dim)
+            self.ann.init_index(max_elements=len(chunks))
+            embeddings = self.embedder.encode([c.get('narration', '') for c in chunks])
+            self.ann.add_items(embeddings, ids=list(range(len(chunks))))
+            self.ann.set_ef(config.ANN_TOP)
+        except Exception as e:
+            logger.error(f"Retriever init failed: {e}")
+            self.bm25 = None
+            self.embedder = None
+            self.ann = None
 
     def retrieve_sparse(self, query: str, top_k: int) -> List[Dict[str, Any]]:
+        """
+        Retrieve chunks using BM25 sparse retrieval.
+
+        Args:
+            query (str): Query string.
+            top_k (int): Number of top chunks to return.
+
+        Returns:
+            List[Dict[str, Any]]: List of top chunks.
+        """
+        if not self.bm25:
+            logger.error("BM25 not initialized.")
+            return []
         tokenized = query.split()
+        try:
+            scores = self.bm25.get_scores(tokenized)
+            top_indices = np.argsort(scores)[::-1][:top_k]
+            return [self.chunks[i] for i in top_indices]
+        except Exception as e:
+            logger.error(f"Sparse retrieval failed: {e}")
+            return []
 
     def retrieve_dense(self, query: str, top_k: int) -> List[Dict[str, Any]]:
+        """
+        Retrieve chunks using dense retrieval.
+
+        Args:
+            query (str): Query string.
+            top_k (int): Number of top chunks to return.
+
+        Returns:
+            List[Dict[str, Any]]: List of top chunks.
+        """
+        if not self.ann or not self.embedder:
+            logger.error("Dense retriever not initialized.")
+            return []
+        try:
+            q_emb = self.embedder.encode([query])[0]
+            labels, distances = self.ann.knn_query(q_emb, k=top_k)
+            return [self.chunks[i] for i in labels[0]]
+        except Exception as e:
+            logger.error(f"Dense retrieval failed: {e}")
+            return []
+
+    def retrieve(self, query: str, top_k: int = None) -> List[Dict[str, Any]]:
+        """
+        Retrieve chunks using hybrid retrieval.
+
+        Args:
+            query (str): Query string.
+            top_k (int, optional): Number of top chunks to return. Defaults to None.
 
+        Returns:
+            List[Dict[str, Any]]: List of top chunks.
+        """
+        if top_k is None:
+            top_k = RetrieverConfig.TOP_K
         sparse = self.retrieve_sparse(query, top_k)
         dense = self.retrieve_dense(query, top_k)
-        # Union while preserving order
         seen = set()
         combined = []
         for c in sparse + dense:
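For context, a minimal sketch of driving the in-memory hybrid retriever on its own, with made-up chunk contents:

    # Hypothetical standalone use of the hybrid retriever; chunk contents are placeholders.
    from src.config import RetrieverConfig
    from src.retriever import Retriever

    chunks = [
        {"narration": "Q2 revenue grew 12% year over year.", "type": "text"},
        {"narration": "Operating margin was 21% in Q2.", "type": "text"},
    ]
    retriever = Retriever(chunks, RetrieverConfig())
    results = retriever.retrieve("What was the Q2 revenue?")  # union of BM25 and hnswlib hits
    for c in results:
        print(c["narration"])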
src/utils.py  CHANGED

@@ -3,27 +3,13 @@ Utilities module: LLM client wrapper and shared helpers.
 """
 import os
 import openai
-import
-import sys
-import structlog
+from openai import AzureOpenAI, error
 
-            structlog.processors.JSONRenderer()
-        ],
-        context_class=dict,
-        logger_factory=structlog.stdlib.LoggerFactory(),
-        wrapper_class=structlog.stdlib.BoundLogger,
-        cache_logger_on_first_use=True,
-    )
-    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
-
-
-configure_logging()
-logger = structlog.get_logger()
+try:
+    from src.utils import logger
+except ImportError:
+    import structlog
+    logger = structlog.get_logger()
 
 class LLMClient:
     """
@@ -32,15 +18,22 @@ class LLMClient:
     """
     @staticmethod
    def generate(prompt: str, model: str = None, max_tokens: int = 512, **kwargs) -> str:
+        azure_api_key = os.getenv('AZURE_API_KEY')
+        azure_endpoint = os.getenv('AZURE_ENDPOINT')
+        azure_api_version = os.getenv('AZURE_API_VERSION')
+        openai_model_name = model or os.getenv('OPENAI_MODEL', 'gpt-4o')
+
+        if not (azure_api_key or azure_endpoint or azure_api_version or openai_model_name):
            logger.error('OPENAI_API_KEY is not set')
            raise EnvironmentError('Missing OPENAI_API_KEY')
+        client = AzureOpenAI(
+            api_key=azure_api_key,
+            azure_endpoint=azure_endpoint,
+            api_version=azure_api_version
+        )
        try:
-            resp =
-                model=
+            resp = client.ChatCompletion.create(
+                model=openai_model_name,
                messages=[{"role": "system", "content": "You are a helpful assistant."},
                          {"role": "user", "content": prompt}],
                max_tokens=max_tokens,
@@ -49,7 +42,9 @@ class LLMClient:
            )
            text = resp.choices[0].message.content.strip()
            return text
+        except openai.error.OpenAIError as oe:
+            logger.error(f'OpenAI API error: {oe}')
+            raise
        except Exception as e:
            logger.exception('LLM generation failed')
            raise
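For reference, in the openai>=1.x SDK (the version that provides the AzureOpenAI class) chat completions are exposed as client.chat.completions.create and the base exception as openai.OpenAIError; the pre-1.0 openai.error module is no longer importable. A minimal sketch of that call shape, assuming the same AZURE_* environment variables are set:

    # Minimal sketch of the openai>=1.x Azure call shape; AZURE_* env vars are assumed to be set.
    import os
    from openai import AzureOpenAI, OpenAIError

    client = AzureOpenAI(
        api_key=os.environ["AZURE_API_KEY"],
        azure_endpoint=os.environ["AZURE_ENDPOINT"],
        api_version=os.environ["AZURE_API_VERSION"],
    )
    try:
        resp = client.chat.completions.create(
            model=os.getenv("OPENAI_MODEL", "gpt-4o"),
            messages=[{"role": "user", "content": "Hello"}],
            max_tokens=64,
        )
        print(resp.choices[0].message.content)
    except OpenAIError as e:
        print(f"OpenAI API error: {e}")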