Abhinav Gavireddi committed on
Commit
3301b3c
1 Parent(s): e2fc494

initial commit

Files changed (11)
  1. .github/workflows/ci.yaml +36 -0
  2. Dockerfile +35 -0
  3. requirements.txt +12 -0
  4. src/__init__.py +35 -0
  5. src/app.py +120 -0
  6. src/config.py +30 -0
  7. src/gpp.py +273 -0
  8. src/qa.py +89 -0
  9. src/retriever.py +69 -0
  10. src/utils.py +55 -0
  11. tests/test.py +117 -0
.github/workflows/ci.yaml ADDED
@@ -0,0 +1,36 @@
name: CI & Deploy

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  build-and-test:
    # … your existing test setup …

  deploy-to-hf:
    needs: build-and-test
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'  # only on main branch
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
        with:
          fetch-depth: 0  # needed to push full history

      - name: Set up Docker credentials
        run: echo "${{ secrets.HF_TOKEN }}" | docker login --username ${{ secrets.HF_USERNAME }} --password-stdin docker.pkg.github.com

      - name: Install Hugging Face CLI
        run: pip install huggingface_hub

      - name: Log in to Hugging Face
        run: |
          huggingface-cli login --token ${{ secrets.HF_TOKEN }}

      - name: Push to Hugging Face Space
        run: |
          git remote add hf https://huggingface.co/spaces/${{ secrets.HF_USERNAME }}/${{ secrets.HF_SPACE_NAME }}.git
          git push hf main --force
Dockerfile ADDED
@@ -0,0 +1,35 @@
# Base image
FROM python:3.10-slim

# Set working directory
WORKDIR /app

# System dependencies
# libgomp1 provides OpenMP, which hnswlib needs at runtime
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        ffmpeg \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Copy and install Python dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY src/ ./src/
COPY tests/ ./tests/

# Copy env file if you want local dev (optional)
# COPY .env .env

# Expose Streamlit port
EXPOSE 8501

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV TOKENIZERS_PARALLELISM=false
# Make the src package importable when running the app from /app
ENV PYTHONPATH=/app

# Start Streamlit (the app entry point lives at src/app.py; there is no top-level app.py in this repo)
ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
requirements.txt ADDED
@@ -0,0 +1,12 @@
# Core
streamlit>=1.25.0
mineru>=0.1.0
sentence-transformers>=2.2.2
rank-bm25>=0.2.2
redis>=4.5.1
transformers>=4.29.2
torch>=2.0.0
openai>=0.27.0,<1.0  # src/utils.py uses the legacy ChatCompletion API, removed in openai 1.0
huggingface-hub>=0.16.4
# Imported by src/ and easy to miss
python-dotenv
bleach
structlog
hnswlib
numpy
# For testing
pytest>=7.0
src/__init__.py ADDED
@@ -0,0 +1,35 @@
import os
from dotenv import load_dotenv
import bleach

import logging
import sys
import structlog

load_dotenv()

def configure_logging():
    structlog.configure(
        processors=[
            structlog.processors.TimeStamper(fmt="iso"),
            structlog.processors.JSONRenderer()
        ],
        context_class=dict,
        logger_factory=structlog.stdlib.LoggerFactory(),
        wrapper_class=structlog.stdlib.BoundLogger,
        cache_logger_on_first_use=True,
    )
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

def get_env(name):
    val = os.getenv(name)
    if not val:
        raise RuntimeError(f"Missing required secret: {name}")
    return val

def sanitize_html(raw):
    # strip all HTML tags, keeping only plain text
    return bleach.clean(raw, tags=[], strip=True)

configure_logging()
logger = structlog.get_logger()
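These helpers are shared across the app: `get_env` fails fast on missing secrets, `sanitize_html` strips markup from user input, and `logger` is the structured logger. A minimal usage sketch (illustrative only; the values shown are made up):

    from src import get_env, sanitize_html, logger

    api_key = get_env("OPENAI_API_KEY")                        # raises RuntimeError if unset
    question = sanitize_html("<b>What was Q2 revenue?</b>")    # -> "What was Q2 revenue?"
    logger.info("received question", question=question)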
src/app.py ADDED
@@ -0,0 +1,120 @@
import os
import streamlit as st
from datetime import datetime

from src.gpp import GPP, GPPConfig
from src.qa import AnswerGenerator

# --- Page Configuration (must be the first Streamlit call) ---
st.set_page_config(
    page_title="Document Intelligence Q&A",
    layout="wide",
    initial_sidebar_state="expanded"
)

# --- Custom CSS for styling ---
st.markdown(
    """
    <style>
    body { background-color: #F5F7FA; }
    .header { text-align: center; padding: 10px; }
    .card { background: white; border-radius: 10px; padding: 15px; margin-bottom: 20px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
    .stButton>button { background-color: #4A90E2; color: white; }
    pre { background-color: #f0f0f0; padding: 10px; border-radius: 5px; }
    </style>
    """, unsafe_allow_html=True
)

# --- Header ---
st.markdown("<div class='header'>", unsafe_allow_html=True)
st.image("https://img.icons8.com/ios-filled/50/4A90E2/document.png", width=50)
st.title("Document Intelligence Q&A")
st.markdown(
    "<p style='font-size:18px; color:#555;'>Upload any PDF and get instant insights via advanced RAG-powered Q&A.</p>",
    unsafe_allow_html=True
)
st.markdown(
    f"<p style='font-size:12px; color:#888;'>Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>",
    unsafe_allow_html=True
)
st.markdown("</div>", unsafe_allow_html=True)

# --- Sidebar: Instructions ---
with st.sidebar:
    st.header("How It Works")
    st.markdown(
        "1. Upload and parse your PDF; 2. LLM narrates tables/images and enriches context; 3. Hybrid retrieval surfaces relevant chunks; 4. Reranker refines and generates answer."
    )
    st.markdown("---")
    st.markdown("&copy; 2025 Document Intelligence Team")

# --- Session State ---
if "parsed" not in st.session_state:
    st.session_state.parsed = None

# --- Three-Column Layout ---
col1, col2, col3 = st.columns([2, 3, 3])

# --- Left Column: Upload & Layout ---
with col1:
    st.header("1. Upload & Layout")
    uploaded_file = st.file_uploader("Select a PDF document", type=["pdf"], help="Supported: PDF files")
    if uploaded_file:
        if st.button("Parse Document"):
            output_dir = os.path.join("./parsed", uploaded_file.name)
            os.makedirs(output_dir, exist_ok=True)
            pdf_path = os.path.join(output_dir, uploaded_file.name)
            with open(pdf_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            with st.spinner("Parsing document with MinerU and LLM...⏳"):
                gpp = GPP(GPPConfig())
                parsed = gpp.run(pdf_path, output_dir)
                st.success("✅ Parsing complete!")
                st.session_state.parsed = parsed
    parsed = st.session_state.parsed
    if parsed:
        st.subheader("Layout Preview")
        layout_pdf = parsed.get("layout_pdf")
        if layout_pdf and os.path.exists(layout_pdf):
            st.markdown(f"[Open Layout PDF]({layout_pdf})")
        st.subheader("Extracted Content (Preview)")
        md_path = parsed.get("md_path")
        if md_path and os.path.exists(md_path):
            with open(md_path, 'r', encoding='utf-8') as f:
                md_text = f.read()
            st.markdown(f"<div class='card'><pre>{md_text[:2000]}{'...' if len(md_text) > 2000 else ''}</pre></div>", unsafe_allow_html=True)

# --- Center Column: Q&A ---
with col2:
    st.header("2. Ask a Question")
    if parsed:
        question = st.text_input("Type your question here:", placeholder="E.g., 'What was the Q2 revenue?'")
        if st.button("Get Answer") and question:
            with st.spinner("Retrieving answer...🤖"):
                generator = AnswerGenerator()
                answer, supporting_chunks = generator.answer(parsed['chunks'], question)
            st.markdown(f"<div class='card'><h3>Answer</h3><p>{answer}</p></div>", unsafe_allow_html=True)
            st.markdown("<div class='card'><h4>Supporting Context</h4></div>", unsafe_allow_html=True)
            for sc in supporting_chunks:
                st.write(f"- {sc['narration']}")
    else:
        st.info("Upload and parse a document to ask questions.")

# --- Right Column: Chunks ---
with col3:
    st.header("3. Relevant Chunks")
    if parsed:
        chunks = parsed.get('chunks', [])
        for idx, chunk in enumerate(chunks):
            with st.expander(f"Chunk {idx} - {chunk['type'].capitalize()}"):
                st.write(chunk.get('narration', ''))
                if 'table_structure' in chunk:
                    st.write("**Parsed Table:**")
                    st.table(chunk['table_structure'])
                for blk in chunk.get('blocks', []):
                    if blk.get('type') == 'img_path':
                        img_path = os.path.join(parsed['images_dir'], blk.get('img_path', ''))
                        if os.path.exists(img_path):
                            st.image(img_path, caption=os.path.basename(img_path))
        st.info(f"Total chunks: {len(chunks)}")
    else:
        st.info("No chunks to display. Parse a document first.")
src/config.py ADDED
@@ -0,0 +1,30 @@
"""
Central configuration for the entire Document Intelligence app.
All modules import from here rather than hard-coding values.
"""
import os

class RedisConfig:
    HOST = os.getenv('REDIS_HOST', 'localhost')
    PORT = int(os.getenv('REDIS_PORT', 6379))
    DB = int(os.getenv('REDIS_DB', 0))
    VECTOR_INDEX = os.getenv('REDIS_VECTOR_INDEX', 'gpp_vectors')

class EmbeddingConfig:
    TEXT_MODEL = os.getenv('TEXT_EMBED_MODEL', 'sentence-transformers/all-MiniLM-L6-v2')
    META_MODEL = os.getenv('META_EMBED_MODEL', 'sentence-transformers/all-MiniLM-L6-v2')

class RetrieverConfig:
    TOP_K = int(os.getenv('RETRIEVER_TOP_K', 10))

class RerankerConfig:
    MODEL_NAME = os.getenv('RERANKER_MODEL', 'BAAI/bge-reranker-v2-Gemma')
    DEVICE = os.getenv('RERANKER_DEVICE', 'cuda' if os.getenv('CUDA_VISIBLE_DEVICES') else 'cpu')

class GPPConfig:
    CHUNK_TOKEN_SIZE = int(os.getenv('CHUNK_TOKEN_SIZE', 256))
    DEDUP_SIM_THRESHOLD = float(os.getenv('DEDUP_SIM_THRESHOLD', 0.9))
    EXPANSION_SIM_THRESHOLD = float(os.getenv('EXPANSION_SIM_THRESHOLD', 0.85))
    COREF_CONTEXT_SIZE = int(os.getenv('COREF_CONTEXT_SIZE', 3))

# Add other configs (e.g. Streamlit settings, CI flags) as needed.
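Because every value in config.py reads from the environment with a default, settings can be overridden without code changes. A small sketch (the override values here are hypothetical, and the variables must be set before src.config is first imported, since the class attributes are evaluated at import time):

    import os

    os.environ["RETRIEVER_TOP_K"] = "5"
    os.environ["REDIS_HOST"] = "redis.internal"   # hypothetical host

    from src.config import RetrieverConfig, RedisConfig

    assert RetrieverConfig.TOP_K == 5
    assert RedisConfig.HOST == "redis.internal"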
src/gpp.py ADDED
@@ -0,0 +1,273 @@
"""
Generic Pre-Processing Pipeline (GPP) for Document Intelligence

This module handles:
1. Parsing PDFs via the MinerU Python API (OCR/text modes)
2. Extracting markdown, images, and content_list JSON
3. Chunking multimodal content (text, tables, images), ensuring tables/images are kept in single chunks
4. Parsing markdown tables into JSON 2D structures for dense tables
5. Narration of tables/images via LLM
6. Semantic enhancements (deduplication, coreference, metadata summarization)
7. Embedding computation and storage in Redis & BM25

Each step is modular to support swapping components (e.g. different parsers or stores).
"""
import os
import json
import logging
from typing import List, Dict, Any, Optional
import re

from mineru.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from mineru.data.dataset import PymuDocDataset
from mineru.model.doc_analyze_by_custom_model import doc_analyze
from mineru.config.enums import SupportedPdfParseMethod

from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import numpy as np
import redis

# LLM client abstraction
from src.utils import LLMClient

# Configure logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


def parse_markdown_table(md: str) -> Optional[Dict[str, Any]]:
    """
    Parses a markdown table into a JSON-like dict:
    { headers: [...], rows: [[...], ...] }
    Handles multi-level headers by nesting lists if needed.
    """
    lines = [l for l in md.strip().splitlines() if l.strip().startswith('|')]
    if len(lines) < 2:
        return None
    header_line = lines[0]
    sep_line = lines[1]
    # Validate separator line
    if not re.match(r"^\|?\s*:?-+:?\s*(\|\s*:?-+:?\s*)+\|?", sep_line):
        return None
    def split_row(line):
        parts = [cell.strip() for cell in line.strip().strip('|').split('|')]
        return parts
    headers = split_row(header_line)
    rows = [split_row(r) for r in lines[2:]]
    return {'headers': headers, 'rows': rows}

class GPPConfig:
    """
    Configuration for the GPP pipeline.
    """
    CHUNK_TOKEN_SIZE = 256
    DEDUP_SIM_THRESHOLD = 0.9
    EXPANSION_SIM_THRESHOLD = 0.85
    COREF_CONTEXT_SIZE = 3

    # Embedding models
    TEXT_EMBED_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
    META_EMBED_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'

    # Redis settings
    REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
    REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
    REDIS_DB = int(os.getenv('REDIS_DB', 0))
    REDIS_VECTOR_INDEX = 'gpp_vectors'

class GPP:
    def __init__(self, config: GPPConfig):
        self.config = config
        # Embedding models
        self.text_embedder = SentenceTransformer(config.TEXT_EMBED_MODEL)
        self.meta_embedder = SentenceTransformer(config.META_EMBED_MODEL)
        # Redis for vectors + metadata
        self.redis = redis.Redis(host=config.REDIS_HOST,
                                 port=config.REDIS_PORT,
                                 db=config.REDIS_DB)
        self.bm25 = None

    def parse_pdf(self, pdf_path: str, output_dir: str) -> Dict[str, Any]:
        """
        Uses the MinerU API to parse the PDF in OCR/text mode,
        dumps markdown, images, layout PDF, and content_list JSON.
        Returns parsed data plus file paths for UI traceability.
        """
        name = os.path.splitext(os.path.basename(pdf_path))[0]
        img_dir = os.path.join(output_dir, 'images')
        os.makedirs(img_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

        writer_imgs = FileBasedDataWriter(img_dir)
        writer_md = FileBasedDataWriter(output_dir)
        reader = FileBasedDataReader("")
        pdf_bytes = reader.read(pdf_path)
        ds = PymuDocDataset(pdf_bytes)
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer = ds.apply(doc_analyze, ocr=True)
            pipe = infer.pipe_ocr_mode(writer_imgs)
        else:
            infer = ds.apply(doc_analyze, ocr=False)
            pipe = infer.pipe_txt_mode(writer_imgs)
        # Visual layout
        pipe.draw_layout(os.path.join(output_dir, f"{name}_layout.pdf"))
        # Dump markdown & JSON
        pipe.dump_md(writer_md, f"{name}.md", os.path.basename(img_dir))
        pipe.dump_content_list(writer_md, f"{name}_content_list.json", os.path.basename(img_dir))

        content_list_path = os.path.join(output_dir, f"{name}_content_list.json")
        with open(content_list_path, 'r', encoding='utf-8') as f:
            blocks = json.load(f)  # content_list is a flat list of block dicts
        # Wrap blocks plus UI traceability paths in a single dict (shape expected by run())
        data = {
            'blocks': blocks,
            'md_path': os.path.join(output_dir, f"{name}.md"),
            'images_dir': img_dir,
            'layout_pdf': os.path.join(output_dir, f"{name}_layout.pdf")
        }
        return data

    def chunk_blocks(self, blocks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Creates chunks of ~CHUNK_TOKEN_SIZE tokens, but ensures any table/image block
        becomes its own chunk (unsplittable), flushing the current text chunk as needed.
        """
        chunks, current, token_count = [], {'text': '', 'type': None, 'blocks': []}, 0
        for blk in blocks:
            btype = blk.get('type')
            text = blk.get('text', '')
            if btype in ('table', 'img_path'):
                # Flush existing text chunk
                if current['blocks']:
                    chunks.append(current)
                    current = {'text': '', 'type': None, 'blocks': []}
                    token_count = 0
                # Create isolated chunk for the table/image
                tbl_chunk = {'text': text, 'type': btype, 'blocks': [blk]}
                # Parse markdown table into JSON structure if applicable
                if btype == 'table':
                    tbl_struct = parse_markdown_table(text)
                    tbl_chunk['table_structure'] = tbl_struct
                chunks.append(tbl_chunk)
                continue
            # Standard text accumulation
            count = len(text.split())
            if token_count + count > self.config.CHUNK_TOKEN_SIZE and current['blocks']:
                chunks.append(current)
                current = {'text': '', 'type': None, 'blocks': []}
                token_count = 0
            current['text'] += text + '\n'
            current['type'] = current['type'] or btype
            current['blocks'].append(blk)
            token_count += count
        # Flush remaining
        if current['blocks']:
            chunks.append(current)
        logger.info(f"Chunked into {len(chunks)} pieces (with tables/images isolated).")
        return chunks

    def narrate_multimodal(self, chunks: List[Dict[str, Any]]) -> None:
        """
        For table/image chunks, generate LLM narration. Preserve table_structure in metadata.
        """
        for c in chunks:
            if c['type'] in ('table', 'img_path'):
                prompt = f"Describe this {c['type']} concisely:\n{c['text']}"
                c['narration'] = LLMClient.generate(prompt)
            else:
                c['narration'] = c['text']

    def deduplicate(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Drops near-duplicate narrations via cosine similarity > threshold.
        """
        embs = self.text_embedder.encode([c['narration'] for c in chunks])
        keep = []
        for i, emb in enumerate(embs):
            if not any(
                float(np.dot(emb, embs[j])) / (np.linalg.norm(emb) * np.linalg.norm(embs[j]))
                > self.config.DEDUP_SIM_THRESHOLD for j in keep
            ):
                keep.append(i)
        deduped = [chunks[i] for i in keep]
        logger.info(f"Deduplicated: {len(chunks)}→{len(deduped)}")
        return deduped

    def coref_resolution(self, chunks: List[Dict[str, Any]]) -> None:
        """
        Resolve pronouns using preceding context via LLM.
        """
        for idx, c in enumerate(chunks):
            start = max(0, idx - self.config.COREF_CONTEXT_SIZE)
            ctx = "\n".join(chunks[i]['narration'] for i in range(start, idx))
            prompt = f"Context:\n{ctx}\nRewrite pronouns in:\n{c['narration']}"
            c['narration'] = LLMClient.generate(prompt)

    def metadata_summarization(self, chunks: List[Dict[str, Any]]) -> None:
        """
        Summarize sections and attach to metadata for self-contained context.
        """
        sections: Dict[str, List[Dict[str, Any]]] = {}
        for c in chunks:
            sec = c.get('section', 'default')
            sections.setdefault(sec, []).append(c)
        for sec, items in sections.items():
            blob = "\n".join(i['narration'] for i in items)
            summ = LLMClient.generate(f"Summarize this section:\n{blob}")
            for i in items:
                i.setdefault('metadata', {})['section_summary'] = summ

    def build_bm25(self, chunks: List[Dict[str, Any]]) -> None:
        """
        Build BM25 index on token lists for sparse retrieval.
        """
        tokenized = [c['narration'].split() for c in chunks]
        self.bm25 = BM25Okapi(tokenized)

    def compute_and_store(self, chunks: List[Dict[str, Any]]) -> None:
        """
        Encode narrations & metadata, store vectors and chunk metadata in Redis.
        """
        txts = [c['narration'] for c in chunks]
        metas = [c.get('metadata', {}).get('section_summary', '') for c in chunks]
        txt_embs = self.text_embedder.encode(txts)
        meta_embs = self.meta_embedder.encode(metas)

        pipe = self.redis.pipeline()
        for i, (c, te) in enumerate(zip(chunks, txt_embs)):
            key = f"chunk:{i}"
            # store metadata
            store = {'narration': c['narration'], 'type': c['type']}
            if 'table_structure' in c:
                store['table_structure'] = json.dumps(c['table_structure'])
            pipe.hset(key, mapping=store)
            # store dense vector
            pipe.hset(self.config.REDIS_VECTOR_INDEX, key, te.tobytes())
        pipe.execute()
        logger.info("Stored embeddings and metadata in Redis.")

    def run(self, pdf_path: str, output_dir: str) -> Dict[str, Any]:
        """
        Executes the full GPP: parse → chunk → narrate → enhance → index.
        Returns the parse output dict augmented with `chunks` for downstream processes.
        """
        parsed = self.parse_pdf(pdf_path, output_dir)
        blocks = parsed.get('blocks', [])
        chunks = self.chunk_blocks(blocks)
        self.narrate_multimodal(chunks)
        chunks = self.deduplicate(chunks)
        self.coref_resolution(chunks)
        self.metadata_summarization(chunks)
        self.build_bm25(chunks)
        self.compute_and_store(chunks)
        parsed['chunks'] = chunks
        logger.info("GPP pipeline complete.")
        return parsed

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('pdf')
    parser.add_argument('outdir')
    args = parser.parse_args()
    gpp = GPP(GPPConfig())
    gpp.run(args.pdf, args.outdir)
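For the table-parsing step described in the module docstring, the `{headers, rows}` structure produced by `parse_markdown_table` looks like this (a small illustrative table, not taken from a real document):

    from src.gpp import parse_markdown_table

    md = """| Quarter | Revenue |
    |---------|---------|
    | Q1      | 10M     |
    | Q2      | 12M     |"""

    print(parse_markdown_table(md))
    # {'headers': ['Quarter', 'Revenue'], 'rows': [['Q1', '10M'], ['Q2', '12M']]}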
src/qa.py ADDED
@@ -0,0 +1,89 @@
"""
AnswerGenerator: orchestrates retrieval, re-ranking, and answer generation.

This module contains:
- Reranker: cross-encoder based re-ranking of candidate chunks
- AnswerGenerator: ties together retrieval, re-ranking, and LLM generation
(The hybrid BM25 + dense Retriever lives in src/retriever.py and is re-exported here.)

Each component is modular and can be swapped or extended (e.g., add a HyDE retriever).
"""
from typing import List, Dict, Any, Tuple

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

from src import sanitize_html
from src.utils import LLMClient, logger
from src.retriever import Retriever, RetrieverConfig


class RerankerConfig:
    MODEL_NAME = 'BAAI/bge-reranker-v2-Gemma'
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

class Reranker:
    """
    Cross-encoder re-ranker using a transformer-based sequence classification model.
    """
    def __init__(self, config: RerankerConfig):
        self.device = config.DEVICE
        self.tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
        self.model = AutoModelForSequenceClassification.from_pretrained(config.MODEL_NAME)
        self.model.to(self.device)

    def rerank(self, query: str, candidates: List[Dict[str, Any]], top_k: int) -> List[Dict[str, Any]]:
        """Score each candidate and return top_k sorted by relevance."""
        inputs = self.tokenizer(
            [query] * len(candidates),
            [c['narration'] for c in candidates],
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).to(self.device)
        with torch.no_grad():
            logits = self.model(**inputs).logits.squeeze(-1)
        scores = torch.sigmoid(logits).cpu().numpy()
        # pair and sort
        paired = list(zip(candidates, scores))
        ranked = sorted(paired, key=lambda x: x[1], reverse=True)
        return [c for c, _ in ranked[:top_k]]


class AnswerGenerator:
    """
    Main interface: given parsed chunks and a question, returns the answer and supporting chunks.
    """
    def __init__(self):
        self.ret_config = RetrieverConfig()
        self.rerank_config = RerankerConfig()

    def answer(self, chunks: List[Dict[str, Any]], question: str) -> Tuple[str, List[Dict[str, Any]]]:
        logger.info('Answering question', question=question)
        question = sanitize_html(question)
        # 1. Retrieval
        retriever = Retriever(chunks, self.ret_config)
        candidates = retriever.retrieve(question)
        # 2. Re-ranking
        reranker = Reranker(self.rerank_config)
        top_chunks = reranker.rerank(question, candidates, top_k=5)
        # 3. Assemble prompt
        context = "\n\n".join([f"- {c['narration']}" for c in top_chunks])
        prompt = (
            f"You are a knowledgeable assistant. "
            f"Use the following extracted document snippets to answer the question."
            f"\n\nContext:\n{context}"
            f"\n\nQuestion: {question}\nAnswer:"
        )
        # 4. Generate answer
        answer = LLMClient.generate(prompt)
        return answer, top_chunks

# Example usage:
# generator = AnswerGenerator()
# ans, ctx = generator.answer(parsed_chunks, "What was the Q2 revenue?")
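Expanding the example comment above, the end-to-end wiring mirrors what src/app.py does (a sketch only; the PDF path and output directory are placeholders, and a Redis instance plus OPENAI_API_KEY are assumed):

    from src.gpp import GPP, GPPConfig
    from src.qa import AnswerGenerator

    parsed = GPP(GPPConfig()).run("report.pdf", "./parsed/report")   # hypothetical paths
    generator = AnswerGenerator()
    answer, context_chunks = generator.answer(parsed["chunks"], "What was the Q2 revenue?")
    print(answer)
    for c in context_chunks:
        print("-", c["narration"])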
src/retriever.py ADDED
@@ -0,0 +1,69 @@
import os
import numpy as np
import redis
import hnswlib
from typing import List, Dict, Any

from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi

class RetrieverConfig:
    TOP_K = 10  # number of candidates per retrieval path
    DENSE_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
    REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
    REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
    REDIS_DB = int(os.getenv('REDIS_DB', 0))
    REDIS_VECTOR_INDEX = 'gpp_vectors'

class Retriever:
    """
    Hybrid retriever combining BM25 sparse retrieval with dense retrieval over an
    in-memory HNSW index (embeddings are also persisted in Redis by the GPP pipeline).
    """
    def __init__(self, chunks: List[Dict[str, Any]], config: RetrieverConfig):
        self.chunks = chunks
        # Build BM25 index over chunk narrations
        corpus = [c['narration'].split() for c in chunks]
        self.bm25 = BM25Okapi(corpus)
        # Load dense embedder
        self.embedder = SentenceTransformer(config.DENSE_MODEL)
        # Connect to Redis for the vector store
        self.redis = redis.Redis(host=config.REDIS_HOST,
                                 port=config.REDIS_PORT,
                                 db=config.REDIS_DB)
        self.vector_index = config.REDIS_VECTOR_INDEX

        # Build HNSW index
        dim = len(self.embedder.encode(["test"])[0])
        self.ann = hnswlib.Index(space='cosine', dim=dim)
        self.ann.init_index(max_elements=len(chunks), ef_construction=200, M=16)
        embeddings = self.embedder.encode([c['narration'] for c in chunks])
        self.ann.add_items(embeddings, ids=list(range(len(chunks))))
        self.ann.set_ef(50)  # ef should be > top_k for accuracy

    def retrieve_sparse(self, query: str, top_k: int) -> List[Dict[str, Any]]:
        """Return top_k chunks by BM25 score."""
        tokenized = query.split()
        scores = self.bm25.get_scores(tokenized)
        top_indices = np.argsort(scores)[::-1][:top_k]
        return [self.chunks[i] for i in top_indices]

    def retrieve_dense(self, query: str, top_k: int) -> List[Dict[str, Any]]:
        """Return top_k chunks by dense cosine similarity via the HNSW index."""
        # Embed query
        q_emb = self.embedder.encode([query])[0]
        labels, distances = self.ann.knn_query(q_emb, k=top_k)
        return [self.chunks[i] for i in labels[0]]

    def retrieve(self, query: str, top_k: int = RetrieverConfig.TOP_K) -> List[Dict[str, Any]]:
        """Combine sparse + dense results (unique) into a candidate pool."""
        sparse = self.retrieve_sparse(query, top_k)
        dense = self.retrieve_dense(query, top_k)
        # Union while preserving order
        seen = set()
        combined = []
        for c in sparse + dense:
            cid = id(c)
            if cid not in seen:
                seen.add(cid)
                combined.append(c)
        return combined
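A quick sketch of the hybrid retrieval on its own, over a few hand-written chunks (illustrative data only; the constructor also opens a lazy Redis connection, though only the BM25 and in-memory HNSW indexes are queried here):

    from src.retriever import Retriever, RetrieverConfig

    chunks = [
        {'narration': 'Q2 revenue grew 12% year over year.'},
        {'narration': 'The board approved a new buyback program.'},
        {'narration': 'Operating expenses were flat in Q2.'},
    ]
    retriever = Retriever(chunks, RetrieverConfig())
    for c in retriever.retrieve('What happened to revenue in Q2?', top_k=2):
        print(c['narration'])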
src/utils.py ADDED
@@ -0,0 +1,55 @@
"""
Utilities module: LLM client wrapper and shared helpers.
"""
import os
import openai
import logging
import sys
import structlog


def configure_logging():
    structlog.configure(
        processors=[
            structlog.processors.TimeStamper(fmt="iso"),
            structlog.processors.JSONRenderer()
        ],
        context_class=dict,
        logger_factory=structlog.stdlib.LoggerFactory(),
        wrapper_class=structlog.stdlib.BoundLogger,
        cache_logger_on_first_use=True,
    )
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)


configure_logging()
logger = structlog.get_logger()

class LLMClient:
    """
    Simple wrapper around the OpenAI (or any other) LLM API.
    Reads the API key from the environment and exposes `generate(prompt)`.
    Uses the legacy ChatCompletion interface, so it requires openai < 1.0.
    """
    @staticmethod
    def generate(prompt: str, model: str = None, max_tokens: int = 512, **kwargs) -> str:
        api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            logger.error('OPENAI_API_KEY is not set')
            raise EnvironmentError('Missing OPENAI_API_KEY')
        openai.api_key = api_key
        model_name = model or os.getenv('OPENAI_MODEL', 'gpt-4o')
        try:
            resp = openai.ChatCompletion.create(
                model=model_name,
                messages=[{"role": "system", "content": "You are a helpful assistant."},
                          {"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=0.0,
                **kwargs
            )
            text = resp.choices[0].message.content.strip()
            return text
        except Exception:
            logger.exception('LLM generation failed')
            raise
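A minimal call sketch, assuming OPENAI_API_KEY (and optionally OPENAI_MODEL) is set in the environment; the prompt text is made up:

    from src.utils import LLMClient

    text = LLMClient.generate("Summarize in one sentence: revenue grew 12% in Q2.", max_tokens=64)
    print(text)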
tests/test.py ADDED
@@ -0,0 +1,117 @@
import os
import json
import pytest
import torch
import numpy as np

from src.gpp import parse_markdown_table, GPP, GPPConfig
from src.qa import Retriever, RetrieverConfig, Reranker, RerankerConfig, AnswerGenerator
from src.utils import LLMClient

# --- Tests for parse_markdown_table ---
def test_parse_markdown_table_valid():
    md = """
    |h1|h2|
    |--|--|
    |a|b|
    |c|d|
    """
    res = parse_markdown_table(md)
    assert res['headers'] == ['h1', 'h2']
    assert res['rows'] == [['a', 'b'], ['c', 'd']]


def test_parse_markdown_table_invalid():
    md = "not a table"
    assert parse_markdown_table(md) is None

# --- Tests for GPP.chunk_blocks ---
class DummyGPPConfig(GPPConfig):
    CHUNK_TOKEN_SIZE = 4  # small threshold for testing

@pytest.fixture
def gpp():
    return GPP(DummyGPPConfig())

@pytest.fixture
def blocks():
    return [
        {'type': 'text', 'text': 'one two three four'},
        {'type': 'table', 'text': '|h|\n|-|\n|v|'},
        {'type': 'text', 'text': 'five six'}
    ]

def test_chunk_blocks_table_isolation(gpp, blocks):
    chunks = gpp.chunk_blocks(blocks)
    # Expect 3 chunks: one text (4 tokens), one table, one text (2 tokens)
    assert len(chunks) == 3
    assert chunks[1]['type'] == 'table'
    assert 'table_structure' in chunks[1]

# --- Tests for Retriever.retrieve combining sparse & dense ---
def test_retriever_combine_unique(monkeypatch):
    chunks = [{'narration': 'a'}, {'narration': 'b'}, {'narration': 'c'}]
    config = RetrieverConfig()
    retr = Retriever(chunks, config)
    # Monkey-patch methods
    monkeypatch.setattr(Retriever, 'retrieve_sparse', lambda self, q, top_k: [chunks[0], chunks[1]])
    monkeypatch.setattr(Retriever, 'retrieve_dense', lambda self, q, top_k: [chunks[1], chunks[2]])
    combined = retr.retrieve('query', top_k=2)
    assert combined == [chunks[0], chunks[1], chunks[2]]

# --- Tests for Reranker.rerank with dummy model and tokenizer ---
class DummyBatch(dict):
    """Dict-like stand-in for a transformers BatchEncoding, so `.to(device)` works."""
    def to(self, device):
        return self

class DummyTokenizer:
    def __call__(self, queries, contexts, padding, truncation, return_tensors):
        batch = len(queries)
        return DummyBatch({
            'input_ids': torch.ones((batch, 1), dtype=torch.long),
            'attention_mask': torch.ones((batch, 1), dtype=torch.long)
        })

class DummyModel:
    def __init__(self): pass
    def to(self, device): return self
    def __call__(self, **kwargs):
        # Generate logits: second candidate more relevant
        batch = kwargs['input_ids'].shape[0]
        logits = torch.tensor([[0.1], [0.9]]) if batch == 2 else torch.rand((batch, 1))
        return type('Out', (), {'logits': logits})

@pytest.fixture
def dummy_pretrained(monkeypatch):
    # Requested only by the reranker test; making this autouse would also break the
    # real SentenceTransformer models loaded by the other fixtures.
    import transformers
    monkeypatch.setattr(transformers.AutoTokenizer, 'from_pretrained', lambda name: DummyTokenizer())
    monkeypatch.setattr(transformers.AutoModelForSequenceClassification, 'from_pretrained', lambda name: DummyModel())
    return

def test_reranker_order(dummy_pretrained):
    config = RerankerConfig()
    rer = Reranker(config)
    candidates = [{'narration': 'A'}, {'narration': 'B'}]
    ranked = rer.rerank('q', candidates, top_k=2)
    # B should be ranked higher than A
    assert ranked[0]['narration'] == 'B'
    assert ranked[1]['narration'] == 'A'

# --- Tests for AnswerGenerator end-to-end logic ---
def test_answer_generator(monkeypatch):
    # Dummy chunks
    chunks = [{'narration': 'hello world'}]
    # Dummy Retriever and Reranker
    class DummyRetriever:
        def __init__(self, chunks, config): pass
        def retrieve(self, q, top_k=10): return chunks
    class DummyReranker:
        def __init__(self, config): pass
        def rerank(self, q, cands, top_k): return chunks

    # Patch in dummy classes
    monkeypatch.setattr('src.qa.Retriever', DummyRetriever)
    monkeypatch.setattr('src.qa.Reranker', DummyReranker)
    # Patch LLMClient.generate
    monkeypatch.setattr(LLMClient, 'generate', staticmethod(lambda prompt: 'TEST_ANSWER'))

    ag = AnswerGenerator()
    ans, sup = ag.answer(chunks, 'What?')
    assert ans == 'TEST_ANSWER'
    assert sup == chunks