Abhinav Gavireddi committed on
Commit
3301b3c
1 Parent(s): e2fc494

initial commit

Files changed (11)
  1. .github/workflows/ci.yaml +36 -0
  2. Dockerfile +35 -0
  3. requirements.txt +12 -0
  4. src/__init__.py +35 -0
  5. src/app.py +120 -0
  6. src/config.py +30 -0
  7. src/gpp.py +273 -0
  8. src/qa.py +89 -0
  9. src/retriever.py +69 -0
  10. src/utils.py +55 -0
  11. tests/test.py +117 -0
.github/workflows/ci.yaml ADDED
@@ -0,0 +1,36 @@
name: CI & Deploy

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  build-and-test:
    # … your existing test setup …

  deploy-to-hf:
    needs: build-and-test
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'  # only on main branch
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
        with:
          fetch-depth: 0  # needed to push full history

      - name: Set up Docker credentials
        run: echo "${{ secrets.HF_TOKEN }}" | docker login --username ${{ secrets.HF_USERNAME }} --password-stdin docker.pkg.github.com

      - name: Install Hugging Face CLI
        run: pip install huggingface_hub

      - name: Log in to Hugging Face
        run: |
          huggingface-cli login --token ${{ secrets.HF_TOKEN }}

      - name: Push to Hugging Face Space
        run: |
          git remote add hf https://huggingface.co/spaces/${{ secrets.HF_USERNAME }}/${{ secrets.HF_SPACE_NAME }}.git
          git push hf main --force
Dockerfile ADDED
@@ -0,0 +1,35 @@
# Base image
FROM python:3.10-slim

# Set working directory
WORKDIR /app

# System dependencies
# libgomp1 provides OpenMP, which hnswlib needs at runtime
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        ffmpeg \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Copy and install Python dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY src/ ./src/
COPY tests/ ./tests/

# Copy env file if you want local dev (optional)
# COPY .env .env

# Expose Streamlit port
EXPOSE 8501

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV TOKENIZERS_PARALLELISM=false
# Make the src package importable when running the app from /app
ENV PYTHONPATH=/app

# Start Streamlit (the app entry point lives at src/app.py; there is no top-level app.py in this repo)
ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
requirements.txt ADDED
@@ -0,0 +1,12 @@
# Core
streamlit>=1.25.0
mineru>=0.1.0
sentence-transformers>=2.2.2
rank-bm25>=0.2.2
redis>=4.5.1
transformers>=4.29.2
torch>=2.0.0
openai>=0.27.0,<1.0  # src/utils.py uses the legacy ChatCompletion API, removed in openai 1.0
huggingface-hub>=0.16.4
# Imported by src/ and easy to miss
python-dotenv
bleach
structlog
hnswlib
numpy
# For testing
pytest>=7.0
src/__init__.py ADDED
@@ -0,0 +1,35 @@
import os
from dotenv import load_dotenv
import bleach

import logging
import sys
import structlog

load_dotenv()

def configure_logging():
    structlog.configure(
        processors=[
            structlog.processors.TimeStamper(fmt="iso"),
            structlog.processors.JSONRenderer()
        ],
        context_class=dict,
        logger_factory=structlog.stdlib.LoggerFactory(),
        wrapper_class=structlog.stdlib.BoundLogger,
        cache_logger_on_first_use=True,
    )
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

def get_env(name):
    val = os.getenv(name)
    if not val:
        raise RuntimeError(f"Missing required secret: {name}")
    return val

def sanitize_html(raw):
    # strip all HTML tags, keeping only plain text
    return bleach.clean(raw, tags=[], strip=True)

configure_logging()
logger = structlog.get_logger()
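These helpers are shared across the app: `get_env` fails fast on missing secrets, `sanitize_html` strips markup from user input, and `logger` is the structured logger. A minimal usage sketch (illustrative only; the values shown are made up):

    from src import get_env, sanitize_html, logger

    api_key = get_env("OPENAI_API_KEY")                        # raises RuntimeError if unset
    question = sanitize_html("<b>What was Q2 revenue?</b>")    # -> "What was Q2 revenue?"
    logger.info("received question", question=question)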
src/app.py ADDED
@@ -0,0 +1,120 @@
import os
import streamlit as st
from datetime import datetime

from src.gpp import GPP, GPPConfig
from src.qa import AnswerGenerator

# --- Page Configuration (must be the first Streamlit call) ---
st.set_page_config(
    page_title="Document Intelligence Q&A",
    layout="wide",
    initial_sidebar_state="expanded"
)

# --- Custom CSS for styling ---
st.markdown(
    """
    <style>
    body { background-color: #F5F7FA; }
    .header { text-align: center; padding: 10px; }
    .card { background: white; border-radius: 10px; padding: 15px; margin-bottom: 20px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
    .stButton>button { background-color: #4A90E2; color: white; }
    pre { background-color: #f0f0f0; padding: 10px; border-radius: 5px; }
    </style>
    """, unsafe_allow_html=True
)

# --- Header ---
st.markdown("<div class='header'>", unsafe_allow_html=True)
st.image("https://img.icons8.com/ios-filled/50/4A90E2/document.png", width=50)
st.title("Document Intelligence Q&A")
st.markdown(
    "<p style='font-size:18px; color:#555;'>Upload any PDF and get instant insights via advanced RAG-powered Q&A.</p>",
    unsafe_allow_html=True
)
st.markdown(
    f"<p style='font-size:12px; color:#888;'>Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>",
    unsafe_allow_html=True
)
st.markdown("</div>", unsafe_allow_html=True)

# --- Sidebar: Instructions ---
with st.sidebar:
    st.header("How It Works")
    st.markdown(
        "1. Upload and parse your PDF; 2. LLM narrates tables/images and enriches context; 3. Hybrid retrieval surfaces relevant chunks; 4. Reranker refines and generates answer."
    )
    st.markdown("---")
    st.markdown("&copy; 2025 Document Intelligence Team")

# --- Session State ---
if "parsed" not in st.session_state:
    st.session_state.parsed = None

# --- Three-Column Layout ---
col1, col2, col3 = st.columns([2, 3, 3])

# --- Left Column: Upload & Layout ---
with col1:
    st.header("1. Upload & Layout")
    uploaded_file = st.file_uploader("Select a PDF document", type=["pdf"], help="Supported: PDF files")
    if uploaded_file:
        if st.button("Parse Document"):
            output_dir = os.path.join("./parsed", uploaded_file.name)
            os.makedirs(output_dir, exist_ok=True)
            pdf_path = os.path.join(output_dir, uploaded_file.name)
            with open(pdf_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            with st.spinner("Parsing document with MinerU and LLM...⏳"):
                gpp = GPP(GPPConfig())
                parsed = gpp.run(pdf_path, output_dir)
                st.success("✅ Parsing complete!")
                st.session_state.parsed = parsed
    parsed = st.session_state.parsed
    if parsed:
        st.subheader("Layout Preview")
        layout_pdf = parsed.get("layout_pdf")
        if layout_pdf and os.path.exists(layout_pdf):
            st.markdown(f"[Open Layout PDF]({layout_pdf})")
        st.subheader("Extracted Content (Preview)")
        md_path = parsed.get("md_path")
        if md_path and os.path.exists(md_path):
            with open(md_path, 'r', encoding='utf-8') as f:
                md_text = f.read()
            st.markdown(f"<div class='card'><pre>{md_text[:2000]}{'...' if len(md_text) > 2000 else ''}</pre></div>", unsafe_allow_html=True)

# --- Center Column: Q&A ---
with col2:
    st.header("2. Ask a Question")
    if parsed:
        question = st.text_input("Type your question here:", placeholder="E.g., 'What was the Q2 revenue?'")
        if st.button("Get Answer") and question:
            with st.spinner("Retrieving answer...🤖"):
                generator = AnswerGenerator()
                answer, supporting_chunks = generator.answer(parsed['chunks'], question)
            st.markdown(f"<div class='card'><h3>Answer</h3><p>{answer}</p></div>", unsafe_allow_html=True)
            st.markdown("<div class='card'><h4>Supporting Context</h4></div>", unsafe_allow_html=True)
            for sc in supporting_chunks:
                st.write(f"- {sc['narration']}")
    else:
        st.info("Upload and parse a document to ask questions.")

# --- Right Column: Chunks ---
with col3:
    st.header("3. Relevant Chunks")
    if parsed:
        chunks = parsed.get('chunks', [])
        for idx, chunk in enumerate(chunks):
            with st.expander(f"Chunk {idx} - {chunk['type'].capitalize()}"):
                st.write(chunk.get('narration', ''))
                if 'table_structure' in chunk:
                    st.write("**Parsed Table:**")
                    st.table(chunk['table_structure'])
                for blk in chunk.get('blocks', []):
                    if blk.get('type') == 'img_path':
                        img_path = os.path.join(parsed['images_dir'], blk.get('img_path', ''))
                        if os.path.exists(img_path):
                            st.image(img_path, caption=os.path.basename(img_path))
        st.info(f"Total chunks: {len(chunks)}")
    else:
        st.info("No chunks to display. Parse a document first.")
src/config.py ADDED
@@ -0,0 +1,30 @@
"""
Central configuration for the entire Document Intelligence app.
All modules import from here rather than hard-coding values.
"""
import os

class RedisConfig:
    HOST = os.getenv('REDIS_HOST', 'localhost')
    PORT = int(os.getenv('REDIS_PORT', 6379))
    DB = int(os.getenv('REDIS_DB', 0))
    VECTOR_INDEX = os.getenv('REDIS_VECTOR_INDEX', 'gpp_vectors')

class EmbeddingConfig:
    TEXT_MODEL = os.getenv('TEXT_EMBED_MODEL', 'sentence-transformers/all-MiniLM-L6-v2')
    META_MODEL = os.getenv('META_EMBED_MODEL', 'sentence-transformers/all-MiniLM-L6-v2')

class RetrieverConfig:
    TOP_K = int(os.getenv('RETRIEVER_TOP_K', 10))

class RerankerConfig:
    MODEL_NAME = os.getenv('RERANKER_MODEL', 'BAAI/bge-reranker-v2-Gemma')
    DEVICE = os.getenv('RERANKER_DEVICE', 'cuda' if os.getenv('CUDA_VISIBLE_DEVICES') else 'cpu')

class GPPConfig:
    CHUNK_TOKEN_SIZE = int(os.getenv('CHUNK_TOKEN_SIZE', 256))
    DEDUP_SIM_THRESHOLD = float(os.getenv('DEDUP_SIM_THRESHOLD', 0.9))
    EXPANSION_SIM_THRESHOLD = float(os.getenv('EXPANSION_SIM_THRESHOLD', 0.85))
    COREF_CONTEXT_SIZE = int(os.getenv('COREF_CONTEXT_SIZE', 3))

# Add other configs (e.g. Streamlit settings, CI flags) as needed.
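Because every value in config.py reads from the environment with a default, settings can be overridden without code changes. A small sketch (the override values here are hypothetical, and the variables must be set before src.config is first imported, since the class attributes are evaluated at import time):

    import os

    os.environ["RETRIEVER_TOP_K"] = "5"
    os.environ["REDIS_HOST"] = "redis.internal"   # hypothetical host

    from src.config import RetrieverConfig, RedisConfig

    assert RetrieverConfig.TOP_K == 5
    assert RedisConfig.HOST == "redis.internal"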
src/gpp.py ADDED
@@ -0,0 +1,273 @@
"""
Generic Pre-Processing Pipeline (GPP) for Document Intelligence

This module handles:
1. Parsing PDFs via the MinerU Python API (OCR/text modes)
2. Extracting markdown, images, and content_list JSON
3. Chunking multimodal content (text, tables, images), ensuring tables/images are kept in single chunks
4. Parsing markdown tables into JSON 2D structures for dense tables
5. Narration of tables/images via LLM
6. Semantic enhancements (deduplication, coreference, metadata summarization)
7. Embedding computation and storage in Redis & BM25

Each step is modular to support swapping components (e.g. different parsers or stores).
"""
import os
import json
import logging
from typing import List, Dict, Any, Optional
import re

from mineru.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from mineru.data.dataset import PymuDocDataset
from mineru.model.doc_analyze_by_custom_model import doc_analyze
from mineru.config.enums import SupportedPdfParseMethod

from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import numpy as np
import redis

# LLM client abstraction
from src.utils import LLMClient

# Configure logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


def parse_markdown_table(md: str) -> Optional[Dict[str, Any]]:
    """
    Parses a markdown table into a JSON-like dict:
    { headers: [...], rows: [[...], ...] }
    Handles multi-level headers by nesting lists if needed.
    """
    lines = [l for l in md.strip().splitlines() if l.strip().startswith('|')]
    if len(lines) < 2:
        return None
    header_line = lines[0]
    sep_line = lines[1]
    # Validate separator line
    if not re.match(r"^\|?\s*:?-+:?\s*(\|\s*:?-+:?\s*)+\|?", sep_line):
        return None
    def split_row(line):
        parts = [cell.strip() for cell in line.strip().strip('|').split('|')]
        return parts
    headers = split_row(header_line)
    rows = [split_row(r) for r in lines[2:]]
    return {'headers': headers, 'rows': rows}

class GPPConfig:
    """
    Configuration for the GPP pipeline.
    """
    CHUNK_TOKEN_SIZE = 256
    DEDUP_SIM_THRESHOLD = 0.9
    EXPANSION_SIM_THRESHOLD = 0.85
    COREF_CONTEXT_SIZE = 3

    # Embedding models
    TEXT_EMBED_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
    META_EMBED_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'

    # Redis settings
    REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
    REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
    REDIS_DB = int(os.getenv('REDIS_DB', 0))
    REDIS_VECTOR_INDEX = 'gpp_vectors'

class GPP:
    def __init__(self, config: GPPConfig):
        self.config = config
        # Embedding models
        self.text_embedder = SentenceTransformer(config.TEXT_EMBED_MODEL)
        self.meta_embedder = SentenceTransformer(config.META_EMBED_MODEL)
        # Redis for vectors + metadata
        self.redis = redis.Redis(host=config.REDIS_HOST,
                                 port=config.REDIS_PORT,
                                 db=config.REDIS_DB)
        self.bm25 = None

    def parse_pdf(self, pdf_path: str, output_dir: str) -> Dict[str, Any]:
        """
        Uses the MinerU API to parse the PDF in OCR/text mode,
        dumps markdown, images, layout PDF, and content_list JSON.
        Returns parsed data plus file paths for UI traceability.
        """
        name = os.path.splitext(os.path.basename(pdf_path))[0]
        img_dir = os.path.join(output_dir, 'images')
        os.makedirs(img_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

        writer_imgs = FileBasedDataWriter(img_dir)
        writer_md = FileBasedDataWriter(output_dir)
        reader = FileBasedDataReader("")
        pdf_bytes = reader.read(pdf_path)
        ds = PymuDocDataset(pdf_bytes)
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer = ds.apply(doc_analyze, ocr=True)
            pipe = infer.pipe_ocr_mode(writer_imgs)
        else:
            infer = ds.apply(doc_analyze, ocr=False)
            pipe = infer.pipe_txt_mode(writer_imgs)
        # Visual layout
        pipe.draw_layout(os.path.join(output_dir, f"{name}_layout.pdf"))
        # Dump markdown & JSON
        pipe.dump_md(writer_md, f"{name}.md", os.path.basename(img_dir))
        pipe.dump_content_list(writer_md, f"{name}_content_list.json", os.path.basename(img_dir))

        content_list_path = os.path.join(output_dir, f"{name}_content_list.json")
        with open(content_list_path, 'r', encoding='utf-8') as f:
            blocks = json.load(f)  # content_list is a flat list of block dicts
        # Wrap blocks plus UI traceability paths in a single dict (shape expected by run())
        data = {
            'blocks': blocks,
            'md_path': os.path.join(output_dir, f"{name}.md"),
            'images_dir': img_dir,
            'layout_pdf': os.path.join(output_dir, f"{name}_layout.pdf")
        }
        return data

    def chunk_blocks(self, blocks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Creates chunks of ~CHUNK_TOKEN_SIZE tokens, but ensures any table/image block
        becomes its own chunk (unsplittable), flushing the current text chunk as needed.
        """
        chunks, current, token_count = [], {'text': '', 'type': None, 'blocks': []}, 0
        for blk in blocks:
            btype = blk.get('type')
            text = blk.get('text', '')
            if btype in ('table', 'img_path'):
                # Flush existing text chunk
                if current['blocks']:
                    chunks.append(current)
                    current = {'text': '', 'type': None, 'blocks': []}
                    token_count = 0
                # Create isolated chunk for the table/image
                tbl_chunk = {'text': text, 'type': btype, 'blocks': [blk]}
                # Parse markdown table into JSON structure if applicable
                if btype == 'table':
                    tbl_struct = parse_markdown_table(text)
                    tbl_chunk['table_structure'] = tbl_struct
                chunks.append(tbl_chunk)
                continue
            # Standard text accumulation
            count = len(text.split())
            if token_count + count > self.config.CHUNK_TOKEN_SIZE and current['blocks']:
                chunks.append(current)
                current = {'text': '', 'type': None, 'blocks': []}
                token_count = 0
            current['text'] += text + '\n'
            current['type'] = current['type'] or btype
            current['blocks'].append(blk)
            token_count += count
        # Flush remaining
        if current['blocks']:
            chunks.append(current)
        logger.info(f"Chunked into {len(chunks)} pieces (with tables/images isolated).")
        return chunks

    def narrate_multimodal(self, chunks: List[Dict[str, Any]]) -> None:
        """
        For table/image chunks, generate LLM narration. Preserve table_structure in metadata.
        """
        for c in chunks:
            if c['type'] in ('table', 'img_path'):
                prompt = f"Describe this {c['type']} concisely:\n{c['text']}"
                c['narration'] = LLMClient.generate(prompt)
            else:
                c['narration'] = c['text']

    def deduplicate(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Drops near-duplicate narrations via cosine similarity > threshold.
        """
        embs = self.text_embedder.encode([c['narration'] for c in chunks])
        keep = []
        for i, emb in enumerate(embs):
            if not any(
                float(np.dot(emb, embs[j])) / (np.linalg.norm(emb) * np.linalg.norm(embs[j]))
                > self.config.DEDUP_SIM_THRESHOLD for j in keep
            ):
                keep.append(i)
        deduped = [chunks[i] for i in keep]
        logger.info(f"Deduplicated: {len(chunks)}→{len(deduped)}")
        return deduped

    def coref_resolution(self, chunks: List[Dict[str, Any]]) -> None:
        """
        Resolve pronouns using preceding context via LLM.
        """
        for idx, c in enumerate(chunks):
            start = max(0, idx - self.config.COREF_CONTEXT_SIZE)
            ctx = "\n".join(chunks[i]['narration'] for i in range(start, idx))
            prompt = f"Context:\n{ctx}\nRewrite pronouns in:\n{c['narration']}"
            c['narration'] = LLMClient.generate(prompt)

    def metadata_summarization(self, chunks: List[Dict[str, Any]]) -> None:
        """
        Summarize sections and attach to metadata for self-contained context.
        """
        sections: Dict[str, List[Dict[str, Any]]] = {}
        for c in chunks:
            sec = c.get('section', 'default')
            sections.setdefault(sec, []).append(c)
        for sec, items in sections.items():
            blob = "\n".join(i['narration'] for i in items)
            summ = LLMClient.generate(f"Summarize this section:\n{blob}")
            for i in items:
                i.setdefault('metadata', {})['section_summary'] = summ

    def build_bm25(self, chunks: List[Dict[str, Any]]) -> None:
        """
        Build BM25 index on token lists for sparse retrieval.
        """
        tokenized = [c['narration'].split() for c in chunks]
        self.bm25 = BM25Okapi(tokenized)

    def compute_and_store(self, chunks: List[Dict[str, Any]]) -> None:
        """
        Encode narrations & metadata, store vectors and chunk metadata in Redis.
        """
        txts = [c['narration'] for c in chunks]
        metas = [c.get('metadata', {}).get('section_summary', '') for c in chunks]
        txt_embs = self.text_embedder.encode(txts)
        meta_embs = self.meta_embedder.encode(metas)

        pipe = self.redis.pipeline()
        for i, (c, te) in enumerate(zip(chunks, txt_embs)):
            key = f"chunk:{i}"
            # store metadata
            store = {'narration': c['narration'], 'type': c['type']}
            if 'table_structure' in c:
                store['table_structure'] = json.dumps(c['table_structure'])
            pipe.hset(key, mapping=store)
            # store dense vector
            pipe.hset(self.config.REDIS_VECTOR_INDEX, key, te.tobytes())
        pipe.execute()
        logger.info("Stored embeddings and metadata in Redis.")

    def run(self, pdf_path: str, output_dir: str) -> Dict[str, Any]:
        """
        Executes the full GPP: parse → chunk → narrate → enhance → index.
        Returns the parse output dict augmented with `chunks` for downstream processes.
        """
        parsed = self.parse_pdf(pdf_path, output_dir)
        blocks = parsed.get('blocks', [])
        chunks = self.chunk_blocks(blocks)
        self.narrate_multimodal(chunks)
        chunks = self.deduplicate(chunks)
        self.coref_resolution(chunks)
        self.metadata_summarization(chunks)
        self.build_bm25(chunks)
        self.compute_and_store(chunks)
        parsed['chunks'] = chunks
        logger.info("GPP pipeline complete.")
        return parsed

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('pdf')
    parser.add_argument('outdir')
    args = parser.parse_args()
    gpp = GPP(GPPConfig())
    gpp.run(args.pdf, args.outdir)
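For the table-parsing step described in the module docstring, the `{headers, rows}` structure produced by `parse_markdown_table` looks like this (a small illustrative table, not taken from a real document):

    from src.gpp import parse_markdown_table

    md = """| Quarter | Revenue |
    |---------|---------|
    | Q1      | 10M     |
    | Q2      | 12M     |"""

    print(parse_markdown_table(md))
    # {'headers': ['Quarter', 'Revenue'], 'rows': [['Q1', '10M'], ['Q2', '12M']]}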
src/qa.py ADDED
@@ -0,0 +1,89 @@
"""
AnswerGenerator: orchestrates retrieval, re-ranking, and answer generation.

This module contains:
- Reranker: cross-encoder based re-ranking of candidate chunks
- AnswerGenerator: ties together retrieval, re-ranking, and LLM generation
(The hybrid BM25 + dense Retriever lives in src/retriever.py and is re-exported here.)

Each component is modular and can be swapped or extended (e.g., add a HyDE retriever).
"""
from typing import List, Dict, Any, Tuple

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

from src import sanitize_html
from src.utils import LLMClient, logger
from src.retriever import Retriever, RetrieverConfig


class RerankerConfig:
    MODEL_NAME = 'BAAI/bge-reranker-v2-Gemma'
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

class Reranker:
    """
    Cross-encoder re-ranker using a transformer-based sequence classification model.
    """
    def __init__(self, config: RerankerConfig):
        self.device = config.DEVICE
        self.tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
        self.model = AutoModelForSequenceClassification.from_pretrained(config.MODEL_NAME)
        self.model.to(self.device)

    def rerank(self, query: str, candidates: List[Dict[str, Any]], top_k: int) -> List[Dict[str, Any]]:
        """Score each candidate and return top_k sorted by relevance."""
        inputs = self.tokenizer(
            [query] * len(candidates),
            [c['narration'] for c in candidates],
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).to(self.device)
        with torch.no_grad():
            logits = self.model(**inputs).logits.squeeze(-1)
        scores = torch.sigmoid(logits).cpu().numpy()
        # pair and sort
        paired = list(zip(candidates, scores))
        ranked = sorted(paired, key=lambda x: x[1], reverse=True)
        return [c for c, _ in ranked[:top_k]]


class AnswerGenerator:
    """
    Main interface: given parsed chunks and a question, returns the answer and supporting chunks.
    """
    def __init__(self):
        self.ret_config = RetrieverConfig()
        self.rerank_config = RerankerConfig()

    def answer(self, chunks: List[Dict[str, Any]], question: str) -> Tuple[str, List[Dict[str, Any]]]:
        logger.info('Answering question', question=question)
        question = sanitize_html(question)
        # 1. Retrieval
        retriever = Retriever(chunks, self.ret_config)
        candidates = retriever.retrieve(question)
        # 2. Re-ranking
        reranker = Reranker(self.rerank_config)
        top_chunks = reranker.rerank(question, candidates, top_k=5)
        # 3. Assemble prompt
        context = "\n\n".join([f"- {c['narration']}" for c in top_chunks])
        prompt = (
            f"You are a knowledgeable assistant. "
            f"Use the following extracted document snippets to answer the question."
            f"\n\nContext:\n{context}"
            f"\n\nQuestion: {question}\nAnswer:"
        )
        # 4. Generate answer
        answer = LLMClient.generate(prompt)
        return answer, top_chunks

# Example usage:
# generator = AnswerGenerator()
# ans, ctx = generator.answer(parsed_chunks, "What was the Q2 revenue?")
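Expanding the example comment above, the end-to-end wiring mirrors what src/app.py does (a sketch only; the PDF path and output directory are placeholders, and a Redis instance plus OPENAI_API_KEY are assumed):

    from src.gpp import GPP, GPPConfig
    from src.qa import AnswerGenerator

    parsed = GPP(GPPConfig()).run("report.pdf", "./parsed/report")   # hypothetical paths
    generator = AnswerGenerator()
    answer, context_chunks = generator.answer(parsed["chunks"], "What was the Q2 revenue?")
    print(answer)
    for c in context_chunks:
        print("-", c["narration"])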
src/retriever.py ADDED
@@ -0,0 +1,69 @@
import os
import numpy as np
import redis
import hnswlib
from typing import List, Dict, Any

from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi

class RetrieverConfig:
    TOP_K = 10  # number of candidates per retrieval path
    DENSE_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
    REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
    REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
    REDIS_DB = int(os.getenv('REDIS_DB', 0))
    REDIS_VECTOR_INDEX = 'gpp_vectors'

class Retriever:
    """
    Hybrid retriever combining BM25 sparse retrieval with dense retrieval over an
    in-memory HNSW index (embeddings are also persisted in Redis by the GPP pipeline).
    """
    def __init__(self, chunks: List[Dict[str, Any]], config: RetrieverConfig):
        self.chunks = chunks
        # Build BM25 index over chunk narrations
        corpus = [c['narration'].split() for c in chunks]
        self.bm25 = BM25Okapi(corpus)
        # Load dense embedder
        self.embedder = SentenceTransformer(config.DENSE_MODEL)
        # Connect to Redis for the vector store
        self.redis = redis.Redis(host=config.REDIS_HOST,
                                 port=config.REDIS_PORT,
                                 db=config.REDIS_DB)
        self.vector_index = config.REDIS_VECTOR_INDEX

        # Build HNSW index
        dim = len(self.embedder.encode(["test"])[0])
        self.ann = hnswlib.Index(space='cosine', dim=dim)
        self.ann.init_index(max_elements=len(chunks), ef_construction=200, M=16)
        embeddings = self.embedder.encode([c['narration'] for c in chunks])
        self.ann.add_items(embeddings, ids=list(range(len(chunks))))
        self.ann.set_ef(50)  # ef should be > top_k for accuracy

    def retrieve_sparse(self, query: str, top_k: int) -> List[Dict[str, Any]]:
        """Return top_k chunks by BM25 score."""
        tokenized = query.split()
        scores = self.bm25.get_scores(tokenized)
        top_indices = np.argsort(scores)[::-1][:top_k]
        return [self.chunks[i] for i in top_indices]

    def retrieve_dense(self, query: str, top_k: int) -> List[Dict[str, Any]]:
        """Return top_k chunks by dense cosine similarity via the HNSW index."""
        # Embed query
        q_emb = self.embedder.encode([query])[0]
        labels, distances = self.ann.knn_query(q_emb, k=top_k)
        return [self.chunks[i] for i in labels[0]]

    def retrieve(self, query: str, top_k: int = RetrieverConfig.TOP_K) -> List[Dict[str, Any]]:
        """Combine sparse + dense results (unique) into a candidate pool."""
        sparse = self.retrieve_sparse(query, top_k)
        dense = self.retrieve_dense(query, top_k)
        # Union while preserving order
        seen = set()
        combined = []
        for c in sparse + dense:
            cid = id(c)
            if cid not in seen:
                seen.add(cid)
                combined.append(c)
        return combined
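A quick sketch of the hybrid retrieval on its own, over a few hand-written chunks (illustrative data only; the constructor also opens a lazy Redis connection, though only the BM25 and in-memory HNSW indexes are queried here):

    from src.retriever import Retriever, RetrieverConfig

    chunks = [
        {'narration': 'Q2 revenue grew 12% year over year.'},
        {'narration': 'The board approved a new buyback program.'},
        {'narration': 'Operating expenses were flat in Q2.'},
    ]
    retriever = Retriever(chunks, RetrieverConfig())
    for c in retriever.retrieve('What happened to revenue in Q2?', top_k=2):
        print(c['narration'])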
src/utils.py ADDED
@@ -0,0 +1,55 @@
"""
Utilities module: LLM client wrapper and shared helpers.
"""
import os
import openai
import logging
import sys
import structlog


def configure_logging():
    structlog.configure(
        processors=[
            structlog.processors.TimeStamper(fmt="iso"),
            structlog.processors.JSONRenderer()
        ],
        context_class=dict,
        logger_factory=structlog.stdlib.LoggerFactory(),
        wrapper_class=structlog.stdlib.BoundLogger,
        cache_logger_on_first_use=True,
    )
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)


configure_logging()
logger = structlog.get_logger()

class LLMClient:
    """
    Simple wrapper around the OpenAI (or any other) LLM API.
    Reads the API key from the environment and exposes `generate(prompt)`.
    Uses the legacy ChatCompletion interface, so it requires openai < 1.0.
    """
    @staticmethod
    def generate(prompt: str, model: str = None, max_tokens: int = 512, **kwargs) -> str:
        api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            logger.error('OPENAI_API_KEY is not set')
            raise EnvironmentError('Missing OPENAI_API_KEY')
        openai.api_key = api_key
        model_name = model or os.getenv('OPENAI_MODEL', 'gpt-4o')
        try:
            resp = openai.ChatCompletion.create(
                model=model_name,
                messages=[{"role": "system", "content": "You are a helpful assistant."},
                          {"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=0.0,
                **kwargs
            )
            text = resp.choices[0].message.content.strip()
            return text
        except Exception:
            logger.exception('LLM generation failed')
            raise
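A minimal call sketch, assuming OPENAI_API_KEY (and optionally OPENAI_MODEL) is set in the environment; the prompt text is made up:

    from src.utils import LLMClient

    text = LLMClient.generate("Summarize in one sentence: revenue grew 12% in Q2.", max_tokens=64)
    print(text)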
tests/test.py ADDED
@@ -0,0 +1,117 @@
import os
import json
import pytest
import torch
import numpy as np

from src.gpp import parse_markdown_table, GPP, GPPConfig
from src.qa import Retriever, RetrieverConfig, Reranker, RerankerConfig, AnswerGenerator
from src.utils import LLMClient

# --- Tests for parse_markdown_table ---
def test_parse_markdown_table_valid():
    md = """
    |h1|h2|
    |--|--|
    |a|b|
    |c|d|
    """
    res = parse_markdown_table(md)
    assert res['headers'] == ['h1', 'h2']
    assert res['rows'] == [['a', 'b'], ['c', 'd']]


def test_parse_markdown_table_invalid():
    md = "not a table"
    assert parse_markdown_table(md) is None

# --- Tests for GPP.chunk_blocks ---
class DummyGPPConfig(GPPConfig):
    CHUNK_TOKEN_SIZE = 4  # small threshold for testing

@pytest.fixture
def gpp():
    return GPP(DummyGPPConfig())

@pytest.fixture
def blocks():
    return [
        {'type': 'text', 'text': 'one two three four'},
        {'type': 'table', 'text': '|h|\n|-|\n|v|'},
        {'type': 'text', 'text': 'five six'}
    ]

def test_chunk_blocks_table_isolation(gpp, blocks):
    chunks = gpp.chunk_blocks(blocks)
    # Expect 3 chunks: one text (4 tokens), one table, one text (2 tokens)
    assert len(chunks) == 3
    assert chunks[1]['type'] == 'table'
    assert 'table_structure' in chunks[1]

# --- Tests for Retriever.retrieve combining sparse & dense ---
def test_retriever_combine_unique(monkeypatch):
    chunks = [{'narration': 'a'}, {'narration': 'b'}, {'narration': 'c'}]
    config = RetrieverConfig()
    retr = Retriever(chunks, config)
    # Monkey-patch methods
    monkeypatch.setattr(Retriever, 'retrieve_sparse', lambda self, q, top_k: [chunks[0], chunks[1]])
    monkeypatch.setattr(Retriever, 'retrieve_dense', lambda self, q, top_k: [chunks[1], chunks[2]])
    combined = retr.retrieve('query', top_k=2)
    assert combined == [chunks[0], chunks[1], chunks[2]]

# --- Tests for Reranker.rerank with dummy model and tokenizer ---
class DummyBatch(dict):
    """Dict-like stand-in for a transformers BatchEncoding, so `.to(device)` works."""
    def to(self, device):
        return self

class DummyTokenizer:
    def __call__(self, queries, contexts, padding, truncation, return_tensors):
        batch = len(queries)
        return DummyBatch({
            'input_ids': torch.ones((batch, 1), dtype=torch.long),
            'attention_mask': torch.ones((batch, 1), dtype=torch.long)
        })

class DummyModel:
    def __init__(self): pass
    def to(self, device): return self
    def __call__(self, **kwargs):
        # Generate logits: second candidate more relevant
        batch = kwargs['input_ids'].shape[0]
        logits = torch.tensor([[0.1], [0.9]]) if batch == 2 else torch.rand((batch, 1))
        return type('Out', (), {'logits': logits})

@pytest.fixture
def dummy_pretrained(monkeypatch):
    # Requested only by the reranker test; making this autouse would also break the
    # real SentenceTransformer models loaded by the other fixtures.
    import transformers
    monkeypatch.setattr(transformers.AutoTokenizer, 'from_pretrained', lambda name: DummyTokenizer())
    monkeypatch.setattr(transformers.AutoModelForSequenceClassification, 'from_pretrained', lambda name: DummyModel())
    return

def test_reranker_order(dummy_pretrained):
    config = RerankerConfig()
    rer = Reranker(config)
    candidates = [{'narration': 'A'}, {'narration': 'B'}]
    ranked = rer.rerank('q', candidates, top_k=2)
    # B should be ranked higher than A
    assert ranked[0]['narration'] == 'B'
    assert ranked[1]['narration'] == 'A'

# --- Tests for AnswerGenerator end-to-end logic ---
def test_answer_generator(monkeypatch):
    # Dummy chunks
    chunks = [{'narration': 'hello world'}]
    # Dummy Retriever and Reranker
    class DummyRetriever:
        def __init__(self, chunks, config): pass
        def retrieve(self, q, top_k=10): return chunks
    class DummyReranker:
        def __init__(self, config): pass
        def rerank(self, q, cands, top_k): return chunks

    # Patch in dummy classes
    monkeypatch.setattr('src.qa.Retriever', DummyRetriever)
    monkeypatch.setattr('src.qa.Reranker', DummyReranker)
    # Patch LLMClient.generate
    monkeypatch.setattr(LLMClient, 'generate', staticmethod(lambda prompt: 'TEST_ANSWER'))

    ag = AnswerGenerator()
    ans, sup = ag.answer(chunks, 'What?')
    assert ans == 'TEST_ANSWER'
    assert sup == chunks