Abhinav Gavireddi committed
Commit 33f4e34 · 1 Parent(s): 3301b3c

fix: removed Redis; embeddings are now stored in memory

Files changed (10)
  1. .github/workflows/ci.yaml +30 -13
  2. Dockerfile +8 -6
  3. src/app.py → app.py +73 -42
  4. requirements.txt +8 -2
  5. src/__init__.py +2 -1
  6. src/config.py +3 -1
  7. src/gpp.py +39 -69
  8. src/qa.py +45 -35
  9. src/retriever.py +82 -39
  10. src/utils.py +22 -27
.github/workflows/ci.yaml CHANGED
@@ -8,29 +8,46 @@ on:

 jobs:
   build-and-test:
-    # … your existing test setup …
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+      - name: Run tests
+        run: |
+          if [ -f tests/test.py ]; then python -m unittest discover -s tests; fi

   deploy-to-hf:
     needs: build-and-test
     runs-on: ubuntu-latest
-    if: github.ref == 'refs/heads/main' # only on main branch
+    if: github.ref == 'refs/heads/main'
     steps:
       - name: Checkout repo
         uses: actions/checkout@v3
         with:
-          fetch-depth: 0 # needed to push full history
+          fetch-depth: 0

-      - name: Set up Docker credentials
-        run: echo "${{ secrets.HF_TOKEN }}" | docker login --username ${{ secrets.HF_USERNAME }} --password-stdin docker.pkg.github.com
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3

-      - name: Install Hugging Face CLI
-        run: pip install huggingface_hub
+      - name: Log in to Hugging Face Docker
+        run: echo "${{ secrets.HF_TOKEN }}" | docker login --username ${{ secrets.HF_USERNAME }} --password-stdin docker.io

-      - name: Log in to Hugging Face
-        run: |
-          huggingface-cli login --token ${{ secrets.HF_TOKEN }}
+      - name: Build Docker image
+        run: docker build -t docker.io/${{ secrets.HF_USERNAME }}/${{ secrets.HF_SPACE_NAME }}:latest .
+
+      - name: Push Docker image to Hugging Face
+        run: docker push docker.io/${{ secrets.HF_USERNAME }}/${{ secrets.HF_SPACE_NAME }}:latest

-      - name: Push to Hugging Face Space
+      # Optionally, trigger a Space restart via the API (not strictly required)
+      - name: Restart Hugging Face Space
         run: |
-          git remote add hf https://huggingface.co/spaces/${{ secrets.HF_USERNAME }}/${{ secrets.HF_SPACE_NAME }}.git
-          git push hf main --force
+          pip install huggingface_hub
+          python -c "from huggingface_hub import HfApi; HfApi(token='${{ secrets.HF_TOKEN }}').restart_space('${{ secrets.HF_USERNAME }}/${{ secrets.HF_SPACE_NAME }}')"
Dockerfile CHANGED
@@ -1,6 +1,9 @@
 # Base image
 FROM python:3.10-slim

+RUN useradd -m -u 1000 user
+USER user
+
 # Set working directory
 WORKDIR /app

@@ -9,7 +12,9 @@ RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     build-essential \
     ffmpeg \
-    libgomp1 \ # for hnswlib (needed for OpenMP)
+    # for hnswlib (needed for OpenMP)
+    libgomp1 \
     && rm -rf /var/lib/apt/lists/*

 # Copy and install Python dependencies
@@ -21,15 +26,12 @@ COPY src/ ./src/
 COPY tests/ ./tests/
 COPY app.py .

-# Copy env file if you want local dev (optional)
-# COPY .env .env
-
 # Expose Streamlit port
-EXPOSE 8501
+EXPOSE 7860

 # Set environment variables
 ENV PYTHONUNBUFFERED=1
 ENV TOKENIZERS_PARALLELISM=false

 # Start Streamlit
-ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
src/app.py → app.py RENAMED
@@ -1,6 +1,8 @@
 import os
 import streamlit as st
 from datetime import datetime
+import re
+from werkzeug.utils import secure_filename

 from src.gpp import GPP, GPPConfig
 from src.qa import AnswerGenerator
@@ -60,42 +62,65 @@ with col1:
     st.header("1. Upload & Layout")
     uploaded_file = st.file_uploader("Select a PDF document", type=["pdf"], help="Supported: PDF files")
     if uploaded_file:
-        if st.button("Parse Document"):
-            output_dir = os.path.join("./parsed", uploaded_file.name)
-            os.makedirs(output_dir, exist_ok=True)
-            pdf_path = os.path.join(output_dir, uploaded_file.name)
-            with open(pdf_path, "wb") as f:
-                f.write(uploaded_file.getbuffer())
-            with st.spinner("Parsing document with MinerU and LLM...⏳"):
-                gpp = GPP(GPPConfig())
-                parsed = gpp.run(pdf_path, output_dir)
-                st.success("✅ Parsing complete!")
-                st.session_state.parsed = parsed
+        try:
+            filename = secure_filename(uploaded_file.name)
+            if not re.match(r'^[\w\-. ]+$', filename):
+                st.error("Invalid file name.")
+            elif st.button("Parse Document"):
+                output_dir = os.path.join("./parsed", filename)
+                os.makedirs(output_dir, exist_ok=True)
+                pdf_path = os.path.join(output_dir, filename)
+                with open(pdf_path, "wb") as f:
+                    f.write(uploaded_file.getbuffer())
+                with st.spinner("Parsing document with MinerU and LLM...⏳"):
+                    try:
+                        gpp = GPP(GPPConfig())
+                        parsed = gpp.run(pdf_path, output_dir)
+                        st.success("✅ Parsing complete!")
+                        st.session_state.parsed = parsed
+                    except Exception as e:
+                        st.error(f"Parsing failed: {e}")
+                        st.session_state.parsed = None
+        except Exception as e:
+            st.error(f"File upload failed: {e}")
     parsed = st.session_state.parsed
     if parsed:
-        st.subheader("Layout Preview")
-        layout_pdf = parsed.get("layout_pdf")
-        if layout_pdf and os.path.exists(layout_pdf):
-            st.markdown(f"[Open Layout PDF]({layout_pdf})")
-        st.subheader("Extracted Content (Preview)")
-        md_path = parsed.get("md_path")
-        if md_path and os.path.exists(md_path):
-            md_text = open(md_path, 'r', encoding='utf-8').read()
-            st.markdown(f"<div class='card'><pre>{md_text[:2000]}{'...' if len(md_text)>2000 else ''}</pre></div>", unsafe_allow_html=True)
+        try:
+            st.subheader("Layout Preview")
+            layout_pdf = parsed.get("layout_pdf")
+            if layout_pdf and os.path.exists(layout_pdf):
+                st.markdown(f"[Open Layout PDF]({layout_pdf})")
+            st.subheader("Extracted Content (Preview)")
+            md_path = parsed.get("md_path")
+            if md_path and os.path.exists(md_path):
+                try:
+                    with open(md_path, 'r', encoding='utf-8') as md_file:
+                        md_text = md_file.read()
+                    st.markdown(f"<div class='card'><pre>{md_text[:2000]}{'...' if len(md_text)>2000 else ''}</pre></div>", unsafe_allow_html=True)
+                except Exception as e:
+                    st.error(f"Error reading markdown: {e}")
+        except Exception as e:
+            st.error(f"Error displaying preview: {e}")

 # --- Center Column: Q&A ---
 with col2:
     st.header("2. Ask a Question")
     if parsed:
-        question = st.text_input("Type your question here:", placeholder="E.g., 'What was the Q2 revenue?'")
-        if st.button("Get Answer") and question:
-            with st.spinner("Retrieving answer...🤖"):
-                generator = AnswerGenerator()
-                answer, supporting_chunks = generator.answer(parsed['chunks'], question)
-                st.markdown(f"<div class='card'><h3>Answer</h3><p>{answer}</p></div>", unsafe_allow_html=True)
-                st.markdown("<div class='card'><h4>Supporting Context</h4></div>", unsafe_allow_html=True)
-                for sc in supporting_chunks:
-                    st.write(f"- {sc['narration']}")
+        try:
+            question = st.text_input("Type your question here:", placeholder="E.g., 'What was the Q2 revenue?'")
+            if st.button("Get Answer") and question:
+                with st.spinner("Retrieving answer...🤖"):
+                    try:
+                        generator = AnswerGenerator()
+                        answer, supporting_chunks = generator.answer(parsed['chunks'], question)
+                        st.markdown(f"<div class='card'><h3>Answer</h3><p>{answer}</p></div>", unsafe_allow_html=True)
+                        st.markdown("<div class='card'><h4>Supporting Context</h4></div>", unsafe_allow_html=True)
+                        for sc in supporting_chunks:
+                            st.write(f"- {sc['narration']}")
+                    except Exception as e:
+                        st.error(f"Failed to generate answer: {e}")
+        except Exception as e:
+            st.error(f"Error in Q&A section: {e}")
     else:
         st.info("Upload and parse a document to ask questions.")

@@ -103,18 +128,24 @@ with col2:
 with col3:
     st.header("3. Relevant Chunks")
     if parsed:
-        chunks = parsed.get('chunks', [])
-        for idx, chunk in enumerate(chunks):
-            with st.expander(f"Chunk {idx} - {chunk['type'].capitalize()}"):
-                st.write(chunk.get('narration', ''))
-                if 'table_structure' in chunk:
-                    st.write("**Parsed Table:**")
-                    st.table(chunk['table_structure'])
-                for blk in chunk.get('blocks', []):
-                    if blk.get('type') == 'img_path':
-                        img_path = os.path.join(parsed['images_dir'], blk.get('img_path',''))
-                        if os.path.exists(img_path):
-                            st.image(img_path, caption=os.path.basename(img_path))
-        st.info(f"Total chunks: {len(chunks)}")
+        try:
+            chunks = parsed.get('chunks', [])
+            for idx, chunk in enumerate(chunks):
+                with st.expander(f"Chunk {idx} - {chunk['type'].capitalize()}"):
+                    try:
+                        st.write(chunk.get('narration', ''))
+                        if 'table_structure' in chunk:
+                            st.write("**Parsed Table:**")
+                            st.table(chunk['table_structure'])
+                        for blk in chunk.get('blocks', []):
+                            if blk.get('type') == 'img_path':
+                                img_path = os.path.join(parsed['images_dir'], blk.get('img_path',''))
+                                if os.path.exists(img_path):
+                                    st.image(img_path, caption=os.path.basename(img_path))
+                    except Exception as e:
+                        st.error(f"Error displaying chunk: {e}")
+            st.info(f"Total chunks: {len(chunks)}")
+        except Exception as e:
+            st.error(f"Error displaying chunks: {e}")
     else:
         st.info("No chunks to display. Parse a document first.")
requirements.txt CHANGED
@@ -4,9 +4,15 @@ mineru>=0.1.0
 sentence-transformers>=2.2.2
 rank-bm25>=0.2.2
 redis>=4.5.1
+hnswlib>=0.7.0
 transformers>=4.29.2
 torch>=2.0.0
 openai>=0.27.0
 huggingface-hub>=0.16.4
-# For testing
-pytest>=7.0
+langchain>=0.1.9
+python-dotenv>=1.0.0
+structlog>=23.1.0
+bleach>=6.0.0
+
+# Testing
+pytest>=7.0
src/__init__.py CHANGED
@@ -19,7 +19,8 @@ def configure_logging():
         wrapper_class=structlog.stdlib.BoundLogger,
         cache_logger_on_first_use=True,
     )
-    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+    if not logging.getLogger().handlers:
+        logging.basicConfig(stream=sys.stdout, level=logging.INFO)

 def get_env(name):
     val = os.getenv(name)
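A quick sketch of what the guard buys: configure_logging() can now be called repeatedly (for example on re-import) without stacking handlers:

import logging, sys

def configure_logging():
    if not logging.getLogger().handlers:  # skip if the root logger already has a handler
        logging.basicConfig(stream=sys.stdout, level=logging.INFO)

configure_logging()
configure_logging()  # second call is a no-op
assert len(logging.getLogger().handlers) == 1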
src/config.py CHANGED
@@ -15,7 +15,9 @@ class EmbeddingConfig:
     META_MODEL = os.getenv('META_EMBED_MODEL', 'sentence-transformers/all-MiniLM-L6-v2')

 class RetrieverConfig:
-    TOP_K = int(os.getenv('RETRIEVER_TOP_K', 10))
+    TOP_K = int(os.getenv('RETRIEVER_TOP_K', 10))  # number of candidates per retrieval path
+    DENSE_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
+    ANN_TOP = int(os.getenv('ANN_TOP', 50))

 class RerankerConfig:
     MODEL_NAME = os.getenv('RERANKER_MODEL', 'BAAI/bge-reranker-v2-Gemma')
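One invariant worth noting, echoing the old retriever comment that ef should exceed top_k for accuracy: ANN_TOP feeds hnswlib's set_ef() while TOP_K is the k per retrieval path. A sketch with the defaults above (assuming no env overrides):

TOP_K = 10    # k passed to each retrieval path
ANN_TOP = 50  # ef for the HNSW search
assert ANN_TOP >= TOP_K, 'keep ef >= k or recall suffers'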
src/gpp.py CHANGED
@@ -8,7 +8,7 @@ This module handles:
 4. Parsing markdown tables into JSON 2D structures for dense tables
 5. Narration of tables/images via LLM
 6. Semantic enhancements (deduplication, coreference, metadata summarization)
-7. Embedding computation and storage in Redis & BM25
+7. Embedding computation for in-memory use

 Each step is modular to support swapping components (e.g. different parsers or stores).
 """
@@ -27,7 +27,6 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from sentence_transformers import SentenceTransformer
 from rank_bm25 import BM25Okapi
 import numpy as np
-import redis

 # LLM client abstraction
 from src.utils import LLMClient
@@ -71,22 +70,12 @@ class GPPConfig:
     TEXT_EMBED_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
     META_EMBED_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'

-    # Redis settings
-    REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
-    REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
-    REDIS_DB = int(os.getenv('REDIS_DB', 0))
-    REDIS_VECTOR_INDEX = 'gpp_vectors'
-
 class GPP:
     def __init__(self, config: GPPConfig):
         self.config = config
         # Embedding models
         self.text_embedder = SentenceTransformer(config.TEXT_EMBED_MODEL)
         self.meta_embedder = SentenceTransformer(config.META_EMBED_MODEL)
-        # Redis for vectors + metadata
-        self.redis = redis.Redis(host=config.REDIS_HOST,
-                                 port=config.REDIS_PORT,
-                                 db=config.REDIS_DB)
         self.bm25 = None

     def parse_pdf(self, pdf_path: str, output_dir: str) -> Dict[str, Any]:
@@ -179,42 +168,43 @@ class GPP:
             c['narration'] = c['text']

     def deduplicate(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """
-        Drops near-duplicate narrations via cosine sim > threshold.
-        """
-        embs = self.text_embedder.encode([c['narration'] for c in chunks], convert_to_tensor=True)
-        keep = []
-        for i, emb in enumerate(embs):
-            if not any((emb @ embs[j]).item() / (np.linalg.norm(emb) * np.linalg.norm(embs[j]))
-                       > self.config.DEDUP_SIM_THRESHOLD for j in keep):
-                keep.append(i)
-        deduped = [chunks[i] for i in keep]
-        logger.info(f"Deduplicated: {len(chunks)}→{len(deduped)}")
-        return deduped
+        try:
+            embs = self.text_embedder.encode([c.get('narration', '') for c in chunks], convert_to_tensor=True)
+            keep = []
+            for i, emb in enumerate(embs):
+                if not any((emb @ embs[j]).item() / (np.linalg.norm(emb) * np.linalg.norm(embs[j]) + 1e-8)
+                           > self.config.DEDUP_SIM_THRESHOLD for j in keep):
+                    keep.append(i)
+            deduped = [chunks[i] for i in keep]
+            logger.info(f"Deduplicated: {len(chunks)}→{len(deduped)}")
+            return deduped
+        except Exception as e:
+            logger.error(f"Deduplication failed: {e}")
+            return chunks

     def coref_resolution(self, chunks: List[Dict[str, Any]]) -> None:
-        """
-        Resolve pronouns using preceding context via LLM.
-        """
         for idx, c in enumerate(chunks):
             start = max(0, idx-self.config.COREF_CONTEXT_SIZE)
-            ctx = "\n".join(chunks[i]['narration'] for i in range(start, idx))
-            prompt = f"Context:\n{ctx}\nRewrite pronouns in:\n{c['narration']}"
-            c['narration'] = LLMClient.generate(prompt)
+            ctx = "\n".join(chunks[i].get('narration', '') for i in range(start, idx))
+            prompt = f"Context:\n{ctx}\nRewrite pronouns in:\n{c.get('narration', '')}"
+            try:
+                c['narration'] = LLMClient.generate(prompt)
+            except Exception as e:
+                logger.error(f"Coref resolution failed for chunk {idx}: {e}")

     def metadata_summarization(self, chunks: List[Dict[str, Any]]) -> None:
-        """
-        Summarize sections and attach to metadata for self-contained context.
-        """
         sections: Dict[str, List[Dict[str, Any]]] = {}
         for c in chunks:
             sec = c.get('section', 'default')
             sections.setdefault(sec, []).append(c)
         for sec, items in sections.items():
-            blob = "\n".join(i['narration'] for i in items)
-            summ = LLMClient.generate(f"Summarize this section:\n{blob}")
-            for i in items:
-                i.setdefault('metadata', {})['section_summary'] = summ
+            blob = "\n".join(i.get('narration', '') for i in items)
+            try:
+                summ = LLMClient.generate(f"Summarize this section:\n{blob}")
+                for i in items:
+                    i.setdefault('metadata', {})['section_summary'] = summ
+            except Exception as e:
+                logger.error(f"Metadata summarization failed for section {sec}: {e}")

     def build_bm25(self, chunks: List[Dict[str, Any]]) -> None:
         """
@@ -223,31 +213,20 @@ class GPP:
         tokenized = [c['narration'].split() for c in chunks]
         self.bm25 = BM25Okapi(tokenized)

-    def compute_and_store(self, chunks: List[Dict[str, Any]]) -> None:
-        """
-        Encode narrations & metadata, store vectors and chunk metadata in Redis.
-        """
-        txts = [c['narration'] for c in chunks]
-        metas = [c.get('metadata', {}).get('section_summary', '') for c in chunks]
-        txt_embs = self.text_embedder.encode(txts)
-        meta_embs = self.meta_embedder.encode(metas)
-
-        pipe = self.redis.pipeline()
-        for i, (c, te) in enumerate(zip(chunks, txt_embs)):
-            key = f"chunk:{i}"
-            # store metadata
-            store = {'narration': c['narration'], 'type': c['type']}
-            if 'table_structure' in c:
-                store['table_structure'] = json.dumps(c['table_structure'])
-            pipe.hset(key, mapping=store)
-            # store dense vector
-            pipe.hset(self.config.REDIS_VECTOR_INDEX, key, te.tobytes())
-        pipe.execute()
-        logger.info("Stored embeddings and metadata in Redis.")
+    # def compute_and_store(self, chunks: List[Dict[str, Any]]) -> None:
+    #     try:
+    #         txts = [c.get('narration', '') for c in chunks]
+    #         metas = [c.get('metadata', {}).get('section_summary', '') for c in chunks]
+    #         txt_embs = self.text_embedder.encode(txts)
+    #         meta_embs = self.meta_embedder.encode(metas)
+    #         # No Redis storage, just keep for in-memory use or return as needed
+    #         logger.info("Computed embeddings for chunks.")
+    #     except Exception as e:
+    #         logger.error(f"Failed to compute embeddings: {e}")

     def run(self, pdf_path: str, output_dir: str) -> Dict[str, Any]:
         """
-        Executes full GPP: parse → chunk → narrate → enhance → index.
+        Executes full GPP: parse -> chunk -> narrate -> enhance -> index.
         Returns parse output dict augmented with `chunks` for downstream processes.
         """
         parsed = self.parse_pdf(pdf_path, output_dir)
@@ -258,16 +237,7 @@ class GPP:
         self.coref_resolution(chunks)
         self.metadata_summarization(chunks)
         self.build_bm25(chunks)
-        self.compute_and_store(chunks)
+        # self.compute_and_store(chunks)
         parsed['chunks'] = chunks
         logger.info("GPP pipeline complete.")
         return parsed
-
-if __name__ == '__main__':
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument('pdf')
-    parser.add_argument('outdir')
-    args = parser.parse_args()
-    gpp = GPP(GPPConfig())
-    gpp.run(args.pdf, args.outdir)
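With compute_and_store commented out, vectors now live only in process memory. A hedged sketch of an in-memory equivalent (the helper name and the 'embedding' key are assumptions, not part of this commit):

import numpy as np
from sentence_transformers import SentenceTransformer

def attach_embeddings(chunks, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    # Encode narrations and keep the vectors on the chunk dicts instead of in Redis.
    embedder = SentenceTransformer(model_name)
    embs = embedder.encode([c.get('narration', '') for c in chunks])
    for c, e in zip(chunks, embs):
        c['embedding'] = np.asarray(e)  # in-memory only; recomputed on each run
    return chunks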
src/qa.py CHANGED
@@ -25,7 +25,7 @@ from src.retriever import Retriever, RetrieverConfig


 class RerankerConfig:
-    MODEL_NAME = 'BAAI/bge-reranker-v2-Gemma'
+    MODEL_NAME = os.getenv('RERANKER_MODEL', 'BAAI/bge-reranker-v2-Gemma')
     DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

 class Reranker:
@@ -33,26 +33,36 @@ class Reranker:
     Cross-encoder re-ranker using a transformer-based sequence classification model.
     """
     def __init__(self, config: RerankerConfig):
-        self.tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
-        self.model = AutoModelForSequenceClassification.from_pretrained(config.MODEL_NAME)
-        self.model.to(config.DEVICE)
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
+            self.model = AutoModelForSequenceClassification.from_pretrained(config.MODEL_NAME)
+            self.model.to(config.DEVICE)
+        except Exception as e:
+            logger.error(f'Failed to load reranker model: {e}')
+            raise

     def rerank(self, query: str, candidates: List[Dict[str, Any]], top_k: int) -> List[Dict[str, Any]]:
         """Score each candidate and return top_k sorted by relevance."""
-        inputs = self.tokenizer(
-            [query] * len(candidates),
-            [c['narration'] for c in candidates],
-            padding=True,
-            truncation=True,
-            return_tensors='pt'
-        ).to(RerankerConfig.DEVICE)
-        with torch.no_grad():
-            logits = self.model(**inputs).logits.squeeze(-1)
-        scores = torch.sigmoid(logits).cpu().numpy()
-        # pair and sort
-        paired = list(zip(candidates, scores))
-        ranked = sorted(paired, key=lambda x: x[1], reverse=True)
-        return [c for c, _ in ranked[:top_k]]
+        if not candidates:
+            logger.warning('No candidates provided to rerank.')
+            return []
+        try:
+            inputs = self.tokenizer(
+                [query] * len(candidates),
+                [c.get('narration', '') for c in candidates],
+                padding=True,
+                truncation=True,
+                return_tensors='pt'
+            ).to(RerankerConfig.DEVICE)
+            with torch.no_grad():
+                logits = self.model(**inputs).logits.squeeze(-1)
+            scores = torch.sigmoid(logits).cpu().numpy()
+            paired = list(zip(candidates, scores))
+            ranked = sorted(paired, key=lambda x: x[1], reverse=True)
+            return [c for c, _ in ranked[:top_k]]
+        except Exception as e:
+            logger.error(f'Reranking failed: {e}')
+            return candidates[:top_k]


 class AnswerGenerator:
@@ -66,23 +76,23 @@ class AnswerGenerator:
     def answer(self, chunks: List[Dict[str, Any]], question: str) -> Tuple[str, List[Dict[str, Any]]]:
         logger.info('Answering question', question=question)
         question = sanitize_html(question)
-        # 1. Retrieval
-        retriever = Retriever(chunks, self.ret_config)
-        candidates = retriever.retrieve(question)
-        # 2. Re-ranking
-        reranker = Reranker(self.rerank_config)
-        top_chunks = reranker.rerank(question, candidates, top_k=5)
-        # 3. Assemble prompt
-        context = "\n\n".join([f"- {c['narration']}" for c in top_chunks])
-        prompt = (
-            f"You are a knowledgeable assistant. "
-            f"Use the following extracted document snippets to answer the question."
-            f"\n\nContext:\n{context}"
-            f"\n\nQuestion: {question}\nAnswer:"
-        )
-        # 4. Generate answer
-        answer = LLMClient.generate(prompt)
-        return answer, top_chunks
+        try:
+            retriever = Retriever(chunks, self.ret_config)
+            candidates = retriever.retrieve(question)
+            reranker = Reranker(self.rerank_config)
+            top_chunks = reranker.rerank(question, candidates, top_k=5)
+            context = "\n\n".join([f"- {c.get('narration', '')}" for c in top_chunks])
+            prompt = (
+                f"You are a knowledgeable assistant. "
+                f"Use the following extracted document snippets to answer the question."
+                f"\n\nContext:\n{context}"
+                f"\n\nQuestion: {question}\nAnswer:"
+            )
+            answer = LLMClient.generate(prompt)
+            return answer, top_chunks
+        except Exception as e:
+            logger.error(f'Failed to answer question: {e}')
+            return "Failed to generate answer due to error.", []

 # Example usage:
 # generator = AnswerGenerator()
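A hedged usage sketch of the hardened answer path (the sample chunk is invented; the Azure credentials used by src/utils must be set, otherwise the except branch returns the fallback string):

from src.qa import AnswerGenerator

chunks = [{'narration': 'Q2 revenue was $12M, up 8% QoQ.', 'type': 'text'}]
generator = AnswerGenerator()
answer, support = generator.answer(chunks, "What was the Q2 revenue?")
print(answer)  # on any internal failure: "Failed to generate answer due to error."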
src/retriever.py CHANGED
@@ -1,64 +1,107 @@
 import os
 import numpy as np
-import redis
 import hnswlib
 from typing import List, Dict, Any

 from sentence_transformers import SentenceTransformer
 from rank_bm25 import BM25Okapi

-class RetrieverConfig:
-    TOP_K = 10  # number of candidates per retrieval path
-    DENSE_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
-    REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
-    REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
-    REDIS_DB = int(os.getenv('REDIS_DB', 0))
-    REDIS_VECTOR_INDEX = 'gpp_vectors'
+from src.config import RetrieverConfig
+from src import logger
+

 class Retriever:
     """
-    Hybrid retriever combining BM25 sparse and Redis-based dense retrieval.
+    Hybrid retriever combining BM25 sparse and dense retrieval (no Redis).
     """
     def __init__(self, chunks: List[Dict[str, Any]], config: RetrieverConfig):
-        self.chunks = chunks
-        # Build BM25 index over chunk narrations
-        corpus = [c['narration'].split() for c in chunks]
-        self.bm25 = BM25Okapi(corpus)
-        # Load dense embedder
-        self.embedder = SentenceTransformer(config.DENSE_MODEL)
-        # Connect to Redis for vector store
-        self.redis = redis.Redis(host=config.REDIS_HOST,
-                                 port=config.REDIS_PORT,
-                                 db=config.REDIS_DB)
-        self.vector_index = config.REDIS_VECTOR_INDEX
-
-        # Build HNSW index
-        dim = len(self.embedder.encode(["test"])[0])
-        self.ann = hnswlib.Index(space='cosine', dim=dim)
-        self.ann.init_index(max_elements=len(chunks), ef_construction=200, M=16)
-        embeddings = self.embedder.encode([c['narration'] for c in chunks])
-        self.ann.add_items(embeddings, ids=list(range(len(chunks))))
-        self.ann.set_ef(50)  # ef should be > top_k for accuracy
+        """
+        Initialize the retriever with chunks and configuration.
+
+        Args:
+            chunks (List[Dict[str, Any]]): List of chunks, where each chunk is a dictionary.
+            config (RetrieverConfig): Configuration for the retriever.
+        """
+        self.chunks = chunks
+        try:
+            if not isinstance(chunks, list) or not all(isinstance(c, dict) for c in chunks):
+                logger.error("Chunks must be a list of dicts.")
+                raise ValueError("Chunks must be a list of dicts.")
+            corpus = [c.get('narration', '').split() for c in chunks]
+            self.bm25 = BM25Okapi(corpus)
+            self.embedder = SentenceTransformer(config.DENSE_MODEL)
+            dim = len(self.embedder.encode(["test"])[0])
+            self.ann = hnswlib.Index(space='cosine', dim=dim)
+            self.ann.init_index(max_elements=len(chunks))
+            embeddings = self.embedder.encode([c.get('narration', '') for c in chunks])
+            self.ann.add_items(embeddings, ids=list(range(len(chunks))))
+            self.ann.set_ef(config.ANN_TOP)
+        except Exception as e:
+            logger.error(f"Retriever init failed: {e}")
+            self.bm25 = None
+            self.embedder = None
+            self.ann = None

     def retrieve_sparse(self, query: str, top_k: int) -> List[Dict[str, Any]]:
-        """Return top_k chunks by BM25 score."""
+        """
+        Retrieve chunks using BM25 sparse retrieval.
+
+        Args:
+            query (str): Query string.
+            top_k (int): Number of top chunks to return.
+
+        Returns:
+            List[Dict[str, Any]]: List of top chunks.
+        """
+        if not self.bm25:
+            logger.error("BM25 not initialized.")
+            return []
         tokenized = query.split()
-        scores = self.bm25.get_scores(tokenized)
-        top_indices = np.argsort(scores)[::-1][:top_k]
-        return [self.chunks[i] for i in top_indices]
+        try:
+            scores = self.bm25.get_scores(tokenized)
+            top_indices = np.argsort(scores)[::-1][:top_k]
+            return [self.chunks[i] for i in top_indices]
+        except Exception as e:
+            logger.error(f"Sparse retrieval failed: {e}")
+            return []

     def retrieve_dense(self, query: str, top_k: int) -> List[Dict[str, Any]]:
-        """Return top_k chunks by dense cosine similarity via Redis vectors."""
-        # Embed query
-        q_emb = self.embedder.encode([query])[0]
-        labels, distances = self.ann.knn_query(q_emb, k=top_k)
-        return [self.chunks[i] for i in labels[0]]
+        """
+        Retrieve chunks using dense retrieval.
+
+        Args:
+            query (str): Query string.
+            top_k (int): Number of top chunks to return.
+
+        Returns:
+            List[Dict[str, Any]]: List of top chunks.
+        """
+        if not self.ann or not self.embedder:
+            logger.error("Dense retriever not initialized.")
+            return []
+        try:
+            q_emb = self.embedder.encode([query])[0]
+            labels, distances = self.ann.knn_query(q_emb, k=top_k)
+            return [self.chunks[i] for i in labels[0]]
+        except Exception as e:
+            logger.error(f"Dense retrieval failed: {e}")
+            return []

-    def retrieve(self, query: str, top_k: int = RetrieverConfig.TOP_K) -> List[Dict[str, Any]]:
-        """Combine sparse + dense results (unique) into candidate pool."""
+    def retrieve(self, query: str, top_k: int = None) -> List[Dict[str, Any]]:
+        """
+        Retrieve chunks using hybrid retrieval.
+
+        Args:
+            query (str): Query string.
+            top_k (int, optional): Number of top chunks to return. Defaults to RetrieverConfig.TOP_K.
+
+        Returns:
+            List[Dict[str, Any]]: List of top chunks.
+        """
+        if top_k is None:
+            top_k = RetrieverConfig.TOP_K
         sparse = self.retrieve_sparse(query, top_k)
         dense = self.retrieve_dense(query, top_k)
-        # Union while preserving order
         seen = set()
         combined = []
         for c in sparse + dense:
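A minimal sketch of driving the no-Redis hybrid retriever directly (sample chunks invented; top_k kept small because hnswlib raises when k exceeds the index's element count):

from src.config import RetrieverConfig
from src.retriever import Retriever

chunks = [
    {'narration': 'Q2 revenue grew 8 percent quarter over quarter.'},
    {'narration': 'Headcount was flat in Q2.'},
]
retriever = Retriever(chunks, RetrieverConfig())
hits = retriever.retrieve('Q2 revenue', top_k=2)  # BM25 + HNSW union, order preserved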
src/utils.py CHANGED
@@ -3,27 +3,13 @@ Utilities module: LLM client wrapper and shared helpers.
 """
 import os
 import openai
-import logging
-import sys
-import structlog
+from openai import AzureOpenAI, OpenAIError

-
-def configure_logging():
-    structlog.configure(
-        processors=[
-            structlog.processors.TimeStamper(fmt="iso"),
-            structlog.processors.JSONRenderer()
-        ],
-        context_class=dict,
-        logger_factory=structlog.stdlib.LoggerFactory(),
-        wrapper_class=structlog.stdlib.BoundLogger,
-        cache_logger_on_first_use=True,
-    )
-    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
-
-
-configure_logging()
-logger = structlog.get_logger()
+try:
+    from src.utils import logger
+except ImportError:
+    import structlog
+    logger = structlog.get_logger()

 class LLMClient:
     """
@@ -32,15 +18,22 @@ class LLMClient:
     """
     @staticmethod
     def generate(prompt: str, model: str = None, max_tokens: int = 512, **kwargs) -> str:
-        api_key = os.getenv('OPENAI_API_KEY')
-        if not api_key:
-            logger.error('OPENAI_API_KEY is not set')
-            raise EnvironmentError('Missing OPENAI_API_KEY')
-        openai.api_key = api_key
-        model_name = model or os.getenv('OPENAI_MODEL', 'gpt-4o')
+        azure_api_key = os.getenv('AZURE_API_KEY')
+        azure_endpoint = os.getenv('AZURE_ENDPOINT')
+        azure_api_version = os.getenv('AZURE_API_VERSION')
+        openai_model_name = model or os.getenv('OPENAI_MODEL', 'gpt-4o')
+
+        if not (azure_api_key and azure_endpoint and azure_api_version):
+            logger.error('Azure OpenAI configuration is not set')
+            raise EnvironmentError('Missing AZURE_API_KEY / AZURE_ENDPOINT / AZURE_API_VERSION')
+        client = AzureOpenAI(
+            api_key=azure_api_key,
+            azure_endpoint=azure_endpoint,
+            api_version=azure_api_version
+        )
         try:
-            resp = openai.ChatCompletion.create(
-                model=model_name,
+            resp = client.chat.completions.create(
+                model=openai_model_name,
                 messages=[{"role": "system", "content": "You are a helpful assistant."},
                           {"role": "user", "content": prompt}],
                 max_tokens=max_tokens,
@@ -49,7 +42,9 @@ class LLMClient:
             )
             text = resp.choices[0].message.content.strip()
             return text
+        except OpenAIError as oe:
+            logger.error(f'OpenAI API error: {oe}')
+            raise
         except Exception as e:
             logger.exception('LLM generation failed')
             raise
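For context, the openai>=1.0 Azure client that LLMClient now wraps is invoked like this; a standalone sketch with placeholder environment values, not project settings:

import os
from openai import AzureOpenAI

client = AzureOpenAI(
    api_key=os.environ['AZURE_API_KEY'],
    azure_endpoint=os.environ['AZURE_ENDPOINT'],  # e.g. https://<resource>.openai.azure.com
    api_version=os.environ['AZURE_API_VERSION'],  # e.g. 2024-02-01
)
resp = client.chat.completions.create(
    model='gpt-4o',  # on Azure this is the deployment name
    messages=[{'role': 'user', 'content': 'ping'}],
    max_tokens=16,
)
print(resp.choices[0].message.content)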