Spaces: Build error

Commit 33f4e34 · Abhinav Gavireddi committed
Parent(s): 3301b3c

fix: removed redis to store embeddings in memory

Files changed:
- .github/workflows/ci.yaml  +30 -13
- Dockerfile  +8 -6
- src/app.py → app.py  +73 -42
- requirements.txt  +8 -2
- src/__init__.py  +2 -1
- src/config.py  +3 -1
- src/gpp.py  +39 -69
- src/qa.py  +45 -35
- src/retriever.py  +82 -39
- src/utils.py  +22 -27
.github/workflows/ci.yaml  CHANGED

@@ -8,29 +8,46 @@ on:
 
 jobs:
   build-and-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+      - name: Run tests
+        run: |
+          if [ -f tests/test.py ]; then python -m unittest discover -s tests; fi
 
   deploy-to-hf:
     needs: build-and-test
     runs-on: ubuntu-latest
+    if: github.ref == 'refs/heads/main'
     steps:
       - name: Checkout repo
        uses: actions/checkout@v3
        with:
+          fetch-depth: 0
 
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
 
+      - name: Log in to Hugging Face Docker
+        run: echo "${{ secrets.HF_TOKEN }}" | docker login --username ${{ secrets.HF_USERNAME }} --password-stdin docker.io
 
+      - name: Build Docker image
+        run: docker build -t docker.io/${{ secrets.HF_USERNAME }}/${{ secrets.HF_SPACE_NAME }}:latest .
+
+      - name: Push Docker image to Hugging Face
+        run: docker push docker.io/${{ secrets.HF_USERNAME }}/${{ secrets.HF_SPACE_NAME }}:latest
 
+      # Optionally, trigger a Space restart via the API (not strictly required)
+      - name: Restart Hugging Face Space
        run: |
+          pip install huggingface_hub
+          python -c "from huggingface_hub import HfApi; HfApi(token='${{ secrets.HF_TOKEN }}').restart_space('${{ secrets.HF_USERNAME }}', '${{ secrets.HF_SPACE_NAME }}')"
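One note on the restart step: huggingface_hub's HfApi.restart_space identifies the Space by a single repo_id string (for example "username/space_name"), so passing the username and Space name as two separate positional arguments may not do what is intended. A minimal sketch of the equivalent call, assuming the same secrets are exported as environment variables:

    # Hypothetical standalone restart script; HF_TOKEN, HF_USERNAME and HF_SPACE_NAME
    # are assumed to be available as environment variables.
    import os
    from huggingface_hub import HfApi

    api = HfApi(token=os.environ["HF_TOKEN"])
    api.restart_space(repo_id=f"{os.environ['HF_USERNAME']}/{os.environ['HF_SPACE_NAME']}")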
Dockerfile  CHANGED

@@ -1,6 +1,9 @@
 # Base image
 FROM python:3.10-slim
 
+RUN useradd -m -u 1000 user
+USER user
+
 # Set working directory
 WORKDIR /app
 
@@ -9,7 +12,9 @@ RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     build-essential \
     ffmpeg \
+    # for hnswlib (needed for OpenMP)
+    libgomp1 \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy and install Python dependencies
@@ -21,15 +26,12 @@ COPY src/ ./src/
 COPY tests/ ./tests/
 COPY app.py .
 
-# Copy env file if you want local dev (optional)
-# COPY .env .env
-
 # Expose Streamlit port
-EXPOSE
+EXPOSE 7860
 
 # Set environment variables
 ENV PYTHONUNBUFFERED=1
 ENV TOKENIZERS_PARALLELISM=false
 
 # Start Streamlit
-ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=
+ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
src/app.py → app.py  RENAMED

@@ -1,6 +1,8 @@
 import os
 import streamlit as st
 from datetime import datetime
+import re
+from werkzeug.utils import secure_filename
 
 from src.gpp import GPP, GPPConfig
 from src.qa import AnswerGenerator
@@ -60,42 +62,65 @@ with col1:
     st.header("1. Upload & Layout")
     uploaded_file = st.file_uploader("Select a PDF document", type=["pdf"], help="Supported: PDF files")
     if uploaded_file:
+        try:
+            filename = secure_filename(uploaded_file.name)
+            if not re.match(r'^[\w\-. ]+$', filename):
+                st.error("Invalid file name.")
+            elif st.button("Parse Document"):
+                output_dir = os.path.join("./parsed", filename)
+                os.makedirs(output_dir, exist_ok=True)
+                pdf_path = os.path.join(output_dir, filename)
+                with open(pdf_path, "wb") as f:
+                    f.write(uploaded_file.getbuffer())
+                with st.spinner("Parsing document with MinerU and LLM...⏳"):
+                    try:
+                        gpp = GPP(GPPConfig())
+                        parsed = gpp.run(pdf_path, output_dir)
+                        st.success("✅ Parsing complete!")
+                        st.session_state.parsed = parsed
+                    except Exception as e:
+                        st.error(f"Parsing failed: {e}")
+                        st.session_state.parsed = None
+        except Exception as e:
+            st.error(f"File upload failed: {e}")
     parsed = st.session_state.parsed
     if parsed:
+        try:
+            st.subheader("Layout Preview")
+            layout_pdf = parsed.get("layout_pdf")
+            if layout_pdf and os.path.exists(layout_pdf):
+                st.markdown(f"[Open Layout PDF]({layout_pdf})")
+            st.subheader("Extracted Content (Preview)")
+            md_path = parsed.get("md_path")
+            if md_path and os.path.exists(md_path):
+                try:
+                    with open(md_path, 'r', encoding='utf-8') as md_file:
+                        md_text = md_file.read()
+                    st.markdown(f"<div class='card'><pre>{md_text[:2000]}{'...' if len(md_text)>2000 else ''}</pre></div>", unsafe_allow_html=True)
+                except Exception as e:
+                    st.error(f"Error reading markdown: {e}")
+        except Exception as e:
+            st.error(f"Error displaying preview: {e}")
 
 # --- Center Column: Q&A ---
 with col2:
     st.header("2. Ask a Question")
     if parsed:
+        try:
+            question = st.text_input("Type your question here:", placeholder="E.g., 'What was the Q2 revenue?'" )
+            if st.button("Get Answer") and question:
+                with st.spinner("Retrieving answer...🤖"):
+                    try:
+                        generator = AnswerGenerator()
+                        answer, supporting_chunks = generator.answer(parsed['chunks'], question)
+                        st.markdown(f"<div class='card'><h3>Answer</h3><p>{answer}</p></div>", unsafe_allow_html=True)
+                        st.markdown("<div class='card'><h4>Supporting Context</h4></div>", unsafe_allow_html=True)
+                        for sc in supporting_chunks:
+                            st.write(f"- {sc['narration']}")
+                    except Exception as e:
+                        st.error(f"Failed to generate answer: {e}")
+        except Exception as e:
+            st.error(f"Error in Q&A section: {e}")
     else:
         st.info("Upload and parse a document to ask questions.")
 
@@ -103,18 +128,24 @@ with col2:
 with col3:
     st.header("3. Relevant Chunks")
     if parsed:
+        try:
+            chunks = parsed.get('chunks', [])
+            for idx, chunk in enumerate(chunks):
+                with st.expander(f"Chunk {idx} - {chunk['type'].capitalize()}"):
+                    try:
+                        st.write(chunk.get('narration', ''))
+                        if 'table_structure' in chunk:
+                            st.write("**Parsed Table:**")
+                            st.table(chunk['table_structure'])
+                        for blk in chunk.get('blocks', []):
+                            if blk.get('type') == 'img_path':
+                                img_path = os.path.join(parsed['images_dir'], blk.get('img_path',''))
+                                if os.path.exists(img_path):
+                                    st.image(img_path, caption=os.path.basename(img_path))
+                    except Exception as e:
+                        st.error(f"Error displaying chunk: {e}")
+            st.info(f"Total chunks: {len(chunks)}")
+        except Exception as e:
+            st.error(f"Error displaying chunks: {e}")
     else:
         st.info("No chunks to display. Parse a document first.")
requirements.txt  CHANGED

@@ -4,9 +4,15 @@ mineru>=0.1.0
 sentence-transformers>=2.2.2
 rank-bm25>=0.2.2
 redis>=4.5.1
+hnswlib>=0.7.0
 transformers>=4.29.2
 torch>=2.0.0
 openai>=0.27.0
 huggingface-hub>=0.16.4
+langchain>=0.1.9
+python-dotenv>=1.0.0
+structlog>=23.1.0
+bleach>=6.0.0
+
+# Testing
+pytest>=7.0
src/__init__.py  CHANGED

@@ -19,7 +19,8 @@ def configure_logging():
         wrapper_class=structlog.stdlib.BoundLogger,
         cache_logger_on_first_use=True,
     )
-    logging.
+    if not logging.getLogger().handlers:
+        logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 
 def get_env(name):
     val = os.getenv(name)
src/config.py  CHANGED

@@ -15,7 +15,9 @@ class EmbeddingConfig:
     META_MODEL = os.getenv('META_EMBED_MODEL', 'sentence-transformers/all-MiniLM-L6-v2')
 
 class RetrieverConfig:
-    TOP_K = int(os.getenv('RETRIEVER_TOP_K', 10))
+    TOP_K = int(os.getenv('RETRIEVER_TOP_K', 10))  # number of candidates per retrieval path
+    DENSE_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
+    ANN_TOP = int(os.getenv('ANN_TOP', 50))
 
 class RerankerConfig:
     MODEL_NAME = os.getenv('RERANKER_MODEL', 'BAAI/bge-reranker-v2-Gemma')
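As a quick illustration, a minimal sketch (with placeholder values) of overriding these retriever settings through the environment variables the config reads, before the module is imported:

    # Hypothetical override of the retriever settings; the variable names match the
    # os.getenv calls above, the values are only examples.
    import os

    os.environ["RETRIEVER_TOP_K"] = "20"   # candidates per retrieval path
    os.environ["ANN_TOP"] = "100"          # hnswlib ef search parameter

    from src.config import RetrieverConfig  # values are read at import time

    print(RetrieverConfig.TOP_K, RetrieverConfig.ANN_TOP)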
src/gpp.py  CHANGED

@@ -8,7 +8,7 @@ This module handles:
 4. Parsing markdown tables into JSON 2D structures for dense tables
 5. Narration of tables/images via LLM
 6. Semantic enhancements (deduplication, coreference, metadata summarization)
-7. Embedding computation
+7. Embedding computation for in-memory use
 
 Each step is modular to support swapping components (e.g. different parsers or stores).
 """
@@ -27,7 +27,6 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from sentence_transformers import SentenceTransformer
 from rank_bm25 import BM25Okapi
 import numpy as np
-import redis
 
 # LLM client abstraction
 from src.utils import LLMClient
@@ -71,22 +70,12 @@ class GPPConfig:
     TEXT_EMBED_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
     META_EMBED_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
 
-    # Redis settings
-    REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
-    REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
-    REDIS_DB = int(os.getenv('REDIS_DB', 0))
-    REDIS_VECTOR_INDEX = 'gpp_vectors'
-
 class GPP:
     def __init__(self, config: GPPConfig):
         self.config = config
         # Embedding models
         self.text_embedder = SentenceTransformer(config.TEXT_EMBED_MODEL)
         self.meta_embedder = SentenceTransformer(config.META_EMBED_MODEL)
-        # Redis for vectors + metadata
-        self.redis = redis.Redis(host=config.REDIS_HOST,
-                                 port=config.REDIS_PORT,
-                                 db=config.REDIS_DB)
         self.bm25 = None
 
     def parse_pdf(self, pdf_path: str, output_dir: str) -> Dict[str, Any]:
@@ -179,42 +168,43 @@ class GPP:
             c['narration'] = c['text']
 
     def deduplicate(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        try:
+            embs = self.text_embedder.encode([c.get('narration', '') for c in chunks], convert_to_tensor=True)
+            keep = []
+            for i, emb in enumerate(embs):
+                if not any((emb @ embs[j]).item() / (np.linalg.norm(emb) * np.linalg.norm(embs[j]) + 1e-8)
+                           > self.config.DEDUP_SIM_THRESHOLD for j in keep):
+                    keep.append(i)
+            deduped = [chunks[i] for i in keep]
+            logger.info(f"Deduplicated: {len(chunks)}→{len(deduped)}")
+            return deduped
+        except Exception as e:
+            logger.error(f"Deduplication failed: {e}")
+            return chunks
 
     def coref_resolution(self, chunks: List[Dict[str, Any]]) -> None:
-        """
-        Resolve pronouns using preceding context via LLM.
-        """
         for idx, c in enumerate(chunks):
             start = max(0, idx-self.config.COREF_CONTEXT_SIZE)
-            ctx = "\n".join(chunks[i]
-            prompt = f"Context:\n{ctx}\nRewrite pronouns in:\n{c
+            ctx = "\n".join(chunks[i].get('narration', '') for i in range(start, idx))
+            prompt = f"Context:\n{ctx}\nRewrite pronouns in:\n{c.get('narration', '')}"
+            try:
+                c['narration'] = LLMClient.generate(prompt)
+            except Exception as e:
+                logger.error(f"Coref resolution failed for chunk {idx}: {e}")
 
     def metadata_summarization(self, chunks: List[Dict[str, Any]]) -> None:
-        """
-        Summarize sections and attach to metadata for self-contained context.
-        """
         sections: Dict[str, List[Dict[str, Any]]] = {}
         for c in chunks:
             sec = c.get('section', 'default')
             sections.setdefault(sec, []).append(c)
         for sec, items in sections.items():
-            blob = "\n".join(i
+            blob = "\n".join(i.get('narration', '') for i in items)
+            try:
+                summ = LLMClient.generate(f"Summarize this section:\n{blob}")
+                for i in items:
+                    i.setdefault('metadata', {})['section_summary'] = summ
+            except Exception as e:
+                logger.error(f"Metadata summarization failed for section {sec}: {e}")
 
     def build_bm25(self, chunks: List[Dict[str, Any]]) -> None:
         """
@@ -223,31 +213,20 @@ class GPP:
         tokenized = [c['narration'].split() for c in chunks]
         self.bm25 = BM25Okapi(tokenized)
 
-    def compute_and_store(self, chunks: List[Dict[str, Any]]) -> None:
-        for i, (c, te) in enumerate(zip(chunks, txt_embs)):
-            key = f"chunk:{i}"
-            # store metadata
-            store = {'narration': c['narration'], 'type': c['type']}
-            if 'table_structure' in c:
-                store['table_structure'] = json.dumps(c['table_structure'])
-            pipe.hset(key, mapping=store)
-            # store dense vector
-            pipe.hset(self.config.REDIS_VECTOR_INDEX, key, te.tobytes())
-        pipe.execute()
-        logger.info("Stored embeddings and metadata in Redis.")
+    # def compute_and_store(self, chunks: List[Dict[str, Any]]) -> None:
+    #     try:
+    #         txts = [c.get('narration', '') for c in chunks]
+    #         metas = [c.get('metadata', {}).get('section_summary', '') for c in chunks]
+    #         txt_embs = self.text_embedder.encode(txts)
+    #         meta_embs = self.meta_embedder.encode(metas)
+    #         # No Redis storage, just keep for in-memory use or return as needed
+    #         logger.info("Computed embeddings for chunks.")
+    #     except Exception as e:
+    #         logger.error(f"Failed to compute embeddings: {e}")
 
     def run(self, pdf_path: str, output_dir: str) -> Dict[str, Any]:
         """
-        Executes full GPP: parse
+        Executes full GPP: parse -> chunk -> narrate -> enhance -> index.
         Returns parse output dict augmented with `chunks` for downstream processes.
         """
         parsed = self.parse_pdf(pdf_path, output_dir)
@@ -258,16 +237,7 @@ class GPP:
         self.coref_resolution(chunks)
         self.metadata_summarization(chunks)
         self.build_bm25(chunks)
-        self.compute_and_store(chunks)
+        # self.compute_and_store(chunks)
         parsed['chunks'] = chunks
         logger.info("GPP pipeline complete.")
         return parsed
-
-if __name__ == '__main__':
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument('pdf')
-    parser.add_argument('outdir')
-    args = parser.parse_args()
-    gpp = GPP(GPPConfig())
-    gpp.run(args.pdf, args.outdir)
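With the `if __name__ == '__main__'` CLI block removed, the pipeline is now driven programmatically; a minimal sketch of the direct call, with placeholder paths:

    # Hypothetical driver for the GPP pipeline; the paths are placeholders.
    from src.gpp import GPP, GPPConfig

    gpp = GPP(GPPConfig())
    parsed = gpp.run("document.pdf", "./parsed/document")  # parse -> chunk -> narrate -> enhance -> index
    print(f"{len(parsed['chunks'])} chunks ready for retrieval")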
src/qa.py  CHANGED

@@ -25,7 +25,7 @@ from src.retriever import Retriever, RetrieverConfig
 
 
 class RerankerConfig:
-    MODEL_NAME = 'BAAI/bge-reranker-v2-Gemma'
+    MODEL_NAME = os.getenv('RERANKER_MODEL', 'BAAI/bge-reranker-v2-Gemma')
     DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 
 class Reranker:
@@ -33,26 +33,36 @@ class Reranker:
     Cross-encoder re-ranker using a transformer-based sequence classification model.
     """
     def __init__(self, config: RerankerConfig):
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
+            self.model = AutoModelForSequenceClassification.from_pretrained(config.MODEL_NAME)
+            self.model.to(config.DEVICE)
+        except Exception as e:
+            logger.error(f'Failed to load reranker model: {e}')
+            raise
 
     def rerank(self, query: str, candidates: List[Dict[str, Any]], top_k: int) -> List[Dict[str, Any]]:
         """Score each candidate and return top_k sorted by relevance."""
+        if not candidates:
+            logger.warning('No candidates provided to rerank.')
+            return []
+        try:
+            inputs = self.tokenizer(
+                [query] * len(candidates),
+                [c.get('narration', '') for c in candidates],
+                padding=True,
+                truncation=True,
+                return_tensors='pt'
+            ).to(RerankerConfig.DEVICE)
+            with torch.no_grad():
+                logits = self.model(**inputs).logits.squeeze(-1)
+            scores = torch.sigmoid(logits).cpu().numpy()
+            paired = list(zip(candidates, scores))
+            ranked = sorted(paired, key=lambda x: x[1], reverse=True)
+            return [c for c, _ in ranked[:top_k]]
+        except Exception as e:
+            logger.error(f'Reranking failed: {e}')
+            return candidates[:top_k]
 
 
 class AnswerGenerator:
@@ -66,23 +76,23 @@ class AnswerGenerator:
     def answer(self, chunks: List[Dict[str, Any]], question: str) -> Tuple[str, List[Dict[str, Any]]]:
         logger.info('Answering question', question=question)
         question = sanitize_html(question)
+        try:
+            retriever = Retriever(chunks, self.ret_config)
+            candidates = retriever.retrieve(question)
+            reranker = Reranker(self.rerank_config)
+            top_chunks = reranker.rerank(question, candidates, top_k=5)
+            context = "\n\n".join([f"- {c.get('narration', '')}" for c in top_chunks])
+            prompt = (
+                f"You are a knowledgeable assistant. "
+                f"Use the following extracted document snippets to answer the question."
+                f"\n\nContext:\n{context}"
+                f"\n\nQuestion: {question}\nAnswer:"
+            )
+            answer = LLMClient.generate(prompt)
+            return answer, top_chunks
+        except Exception as e:
+            logger.error(f'Failed to answer question: {e}')
+            return "Failed to generate answer due to error.", []
 
 # Example usage:
 # generator = AnswerGenerator()
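The trailing example-usage comment is cut off in the diff view; a minimal end-to-end sketch of the Q&A path, assuming the chunks come from GPP.run (paths and question text are placeholders):

    # Hypothetical end-to-end question answering over chunks produced by GPP.run().
    from src.gpp import GPP, GPPConfig
    from src.qa import AnswerGenerator

    parsed = GPP(GPPConfig()).run("document.pdf", "./parsed/document")  # placeholder paths
    generator = AnswerGenerator()
    answer, supporting_chunks = generator.answer(parsed["chunks"], "What was the Q2 revenue?")
    print(answer)
    for chunk in supporting_chunks:
        print("-", chunk["narration"])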
src/retriever.py  CHANGED

@@ -1,64 +1,107 @@
 import os
 import numpy as np
-import redis
 import hnswlib
 from typing import List, Dict, Any
 
 from sentence_transformers import SentenceTransformer
 from rank_bm25 import BM25Okapi
 
-REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
-REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
-REDIS_DB = int(os.getenv('REDIS_DB', 0))
-REDIS_VECTOR_INDEX = 'gpp_vectors'
+from src.config import RetrieverConfig
+from src import logger
+
 
 class Retriever:
     """
-    Hybrid retriever combining BM25 sparse and
+    Hybrid retriever combining BM25 sparse and dense retrieval (no Redis).
     """
     def __init__(self, chunks: List[Dict[str, Any]], config: RetrieverConfig):
-        corpus = [c['narration'].split() for c in chunks]
-        self.bm25 = BM25Okapi(corpus)
-        # Load dense embedder
-        self.embedder = SentenceTransformer(config.DENSE_MODEL)
-        # Connect to Redis for vector store
-        self.redis = redis.Redis(host=config.REDIS_HOST,
-                                 port=config.REDIS_PORT,
-                                 db=config.REDIS_DB)
-        self.vector_index = config.REDIS_VECTOR_INDEX
+        """
+        Initialize the retriever with chunks and configuration.
+
+        Args:
+            chunks (List[Dict[str, Any]]): List of chunks, where each chunk is a dictionary.
+            config (RetrieverConfig): Configuration for the retriever.
+        """
+        self.chunks = chunks
+        try:
+            if not isinstance(chunks, list) or not all(isinstance(c, dict) for c in chunks):
+                logger.error("Chunks must be a list of dicts.")
+                raise ValueError("Chunks must be a list of dicts.")
+            corpus = [c.get('narration', '').split() for c in chunks]
+            self.bm25 = BM25Okapi(corpus)
+            self.embedder = SentenceTransformer(config.DENSE_MODEL)
+            dim = len(self.embedder.encode(["test"])[0])
+            self.ann = hnswlib.Index(space='cosine', dim=dim)
+            self.ann.init_index(max_elements=len(chunks))
+            embeddings = self.embedder.encode([c.get('narration', '') for c in chunks])
+            self.ann.add_items(embeddings, ids=list(range(len(chunks))))
+            self.ann.set_ef(config.ANN_TOP)
+        except Exception as e:
+            logger.error(f"Retriever init failed: {e}")
+            self.bm25 = None
+            self.embedder = None
+            self.ann = None
 
     def retrieve_sparse(self, query: str, top_k: int) -> List[Dict[str, Any]]:
+        """
+        Retrieve chunks using BM25 sparse retrieval.
+
+        Args:
+            query (str): Query string.
+            top_k (int): Number of top chunks to return.
+
+        Returns:
+            List[Dict[str, Any]]: List of top chunks.
+        """
+        if not self.bm25:
+            logger.error("BM25 not initialized.")
+            return []
         tokenized = query.split()
+        try:
+            scores = self.bm25.get_scores(tokenized)
+            top_indices = np.argsort(scores)[::-1][:top_k]
+            return [self.chunks[i] for i in top_indices]
+        except Exception as e:
+            logger.error(f"Sparse retrieval failed: {e}")
+            return []
 
     def retrieve_dense(self, query: str, top_k: int) -> List[Dict[str, Any]]:
+        """
+        Retrieve chunks using dense retrieval.
+
+        Args:
+            query (str): Query string.
+            top_k (int): Number of top chunks to return.
+
+        Returns:
+            List[Dict[str, Any]]: List of top chunks.
+        """
+        if not self.ann or not self.embedder:
+            logger.error("Dense retriever not initialized.")
+            return []
+        try:
+            q_emb = self.embedder.encode([query])[0]
+            labels, distances = self.ann.knn_query(q_emb, k=top_k)
+            return [self.chunks[i] for i in labels[0]]
+        except Exception as e:
+            logger.error(f"Dense retrieval failed: {e}")
+            return []
+
+    def retrieve(self, query: str, top_k: int = None) -> List[Dict[str, Any]]:
+        """
+        Retrieve chunks using hybrid retrieval.
+
+        Args:
+            query (str): Query string.
+            top_k (int, optional): Number of top chunks to return. Defaults to None.
 
+        Returns:
+            List[Dict[str, Any]]: List of top chunks.
+        """
+        if top_k is None:
+            top_k = RetrieverConfig.TOP_K
         sparse = self.retrieve_sparse(query, top_k)
         dense = self.retrieve_dense(query, top_k)
-        # Union while preserving order
         seen = set()
         combined = []
         for c in sparse + dense:
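For context, a minimal sketch of driving the in-memory hybrid retriever on its own, with made-up chunk contents:

    # Hypothetical standalone use of the hybrid retriever; chunk contents are placeholders.
    from src.config import RetrieverConfig
    from src.retriever import Retriever

    chunks = [
        {"narration": "Q2 revenue grew 12% year over year.", "type": "text"},
        {"narration": "Operating margin was 21% in Q2.", "type": "text"},
    ]
    retriever = Retriever(chunks, RetrieverConfig())
    results = retriever.retrieve("What was the Q2 revenue?")  # union of BM25 and hnswlib hits
    for c in results:
        print(c["narration"])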
src/utils.py  CHANGED

@@ -3,27 +3,13 @@ Utilities module: LLM client wrapper and shared helpers.
 """
 import os
 import openai
-import
-import sys
-import structlog
+from openai import AzureOpenAI, error
 
-            structlog.processors.JSONRenderer()
-        ],
-        context_class=dict,
-        logger_factory=structlog.stdlib.LoggerFactory(),
-        wrapper_class=structlog.stdlib.BoundLogger,
-        cache_logger_on_first_use=True,
-    )
-    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
-
-
-configure_logging()
-logger = structlog.get_logger()
+try:
+    from src.utils import logger
+except ImportError:
+    import structlog
+    logger = structlog.get_logger()
 
 class LLMClient:
     """
@@ -32,15 +18,22 @@ class LLMClient:
     """
     @staticmethod
    def generate(prompt: str, model: str = None, max_tokens: int = 512, **kwargs) -> str:
+        azure_api_key = os.getenv('AZURE_API_KEY')
+        azure_endpoint = os.getenv('AZURE_ENDPOINT')
+        azure_api_version = os.getenv('AZURE_API_VERSION')
+        openai_model_name = model or os.getenv('OPENAI_MODEL', 'gpt-4o')
+
+        if not (azure_api_key or azure_endpoint or azure_api_version or openai_model_name):
            logger.error('OPENAI_API_KEY is not set')
            raise EnvironmentError('Missing OPENAI_API_KEY')
+        client = AzureOpenAI(
+            api_key=azure_api_key,
+            azure_endpoint=azure_endpoint,
+            api_version=azure_api_version
+        )
        try:
-            resp =
-                model=
+            resp = client.ChatCompletion.create(
+                model=openai_model_name,
                messages=[{"role": "system", "content": "You are a helpful assistant."},
                          {"role": "user", "content": prompt}],
                max_tokens=max_tokens,
@@ -49,7 +42,9 @@ class LLMClient:
            )
            text = resp.choices[0].message.content.strip()
            return text
+        except openai.error.OpenAIError as oe:
+            logger.error(f'OpenAI API error: {oe}')
+            raise
        except Exception as e:
            logger.exception('LLM generation failed')
            raise
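For reference, in the openai>=1.x SDK (the version that provides the AzureOpenAI class) chat completions are exposed as client.chat.completions.create and the base exception as openai.OpenAIError; the pre-1.0 openai.error module is no longer importable. A minimal sketch of that call shape, assuming the same AZURE_* environment variables are set:

    # Minimal sketch of the openai>=1.x Azure call shape; AZURE_* env vars are assumed to be set.
    import os
    from openai import AzureOpenAI, OpenAIError

    client = AzureOpenAI(
        api_key=os.environ["AZURE_API_KEY"],
        azure_endpoint=os.environ["AZURE_ENDPOINT"],
        api_version=os.environ["AZURE_API_VERSION"],
    )
    try:
        resp = client.chat.completions.create(
            model=os.getenv("OPENAI_MODEL", "gpt-4o"),
            messages=[{"role": "user", "content": "Hello"}],
            max_tokens=64,
        )
        print(resp.choices[0].message.content)
    except OpenAIError as e:
        print(f"OpenAI API error: {e}")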