Abhinav Gavireddi committed
Commit · c613bb1
Parent(s): f5d3669
[fix]: Fixed code issues
Browse files
- .github/workflows/{ci.yaml → pipeline.yaml} +0 -0
- src/gpp.py +2 -1
- src/qa.py +11 -5
- src/retriever.py +9 -4
- src/utils.py +1 -1
.github/workflows/{ci.yaml → pipeline.yaml}
RENAMED
File without changes
src/gpp.py
CHANGED
@@ -19,7 +19,8 @@ from typing import List, Dict, Any, Optional
 import re
 
 from src import EmbeddingConfig, GPPConfig
-from src.utils import OpenAIEmbedder, LLMClient
+from src.utils import OpenAIEmbedder, LLMClient
+from src import logger
 
 def parse_markdown_table(md: str) -> Optional[Dict[str, Any]]:
     """
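The body of parse_markdown_table sits outside this hunk; for orientation only, here is a minimal sketch of what a parser with that signature could do, assuming pipe-delimited GitHub-style tables and a {'headers': [...], 'rows': [...]} return shape (the actual return structure in gpp.py may differ).

```python
import re
from typing import Any, Dict, List, Optional


def parse_markdown_table(md: str) -> Optional[Dict[str, Any]]:
    """Parse a pipe-delimited markdown table; the return shape here is an assumption."""

    def cells(line: str) -> List[str]:
        return [c.strip() for c in line.strip().strip('|').split('|')]

    lines = [ln for ln in md.strip().splitlines() if ln.strip()]
    if len(lines) < 2 or '|' not in lines[0]:
        return None
    # The second line must be the header/body separator, e.g. | --- | :---: |
    if not re.fullmatch(r'[\s|:-]+', lines[1]):
        return None
    headers = cells(lines[0])
    rows = [cells(ln) for ln in lines[2:]]
    return {'headers': headers, 'rows': rows}
```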
src/qa.py
CHANGED
@@ -10,6 +10,7 @@ Each component is modular and can be swapped or extended (e.g., add HyDE retriev
 """
 import os
 from typing import List, Dict, Any, Tuple
+import streamlit as st
 
 from src import RerankerConfig, logger
 from src.utils import LLMClient
@@ -19,13 +20,18 @@ class Reranker:
     """
     Cross-encoder re-ranker using a transformer-based sequence classification model.
     """
+    @staticmethod
+    @st.cache_resource(show_spinner="Loading reranker model...")
+    def load_model_and_tokenizer(model_name, device):
+        from transformers import AutoTokenizer, AutoModelForSequenceClassification
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name)
+        model.to(device)
+        return tokenizer, model
+
     def __init__(self, config: RerankerConfig):
         try:
-
-            import torch
-            self.tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
-            self.model = AutoModelForSequenceClassification.from_pretrained(config.MODEL_NAME)
-            self.model.to(config.DEVICE)
+            self.tokenizer, self.model = self.load_model_and_tokenizer(config.MODEL_NAME, config.DEVICE)
         except Exception as e:
             logger.error(f'Failed to load reranker model: {e}')
             raise
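Only the loading path changes here; the Reranker's scoring path is not shown in this commit. As a rough sketch of how the cached cross-encoder would typically be applied, assuming a hypothetical rerank method and that passages carry the 'narration' field used in retriever.py (neither is taken from qa.py):

```python
import torch  # already a transitive dependency of the transformers model
from typing import Any, Dict, List


def rerank(self, query: str, passages: List[Dict[str, Any]], top_k: int = 5) -> List[Dict[str, Any]]:
    """Score (query, passage) pairs with the cross-encoder and keep the best top_k (hypothetical)."""
    texts = [p.get('narration', '') for p in passages]
    inputs = self.tokenizer(
        [query] * len(texts), texts,
        padding=True, truncation=True, return_tensors='pt',
    ).to(self.model.device)
    with torch.no_grad():
        # Single-label cross-encoders emit one relevance logit per pair.
        scores = self.model(**inputs).logits.squeeze(-1).tolist()
    ranked = sorted(zip(passages, scores), key=lambda pair: pair[1], reverse=True)
    return [passage for passage, _ in ranked[:top_k]]
```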
src/retriever.py
CHANGED
@@ -1,18 +1,23 @@
 import os
 from typing import List, Dict, Any
+import streamlit as st
 
-from src
-from src import logger  # Use logger from src/__init__.py
+from src import RetrieverConfig, logger
 
 class Retriever:
     """
     Hybrid retriever combining BM25 sparse and dense retrieval (no Redis).
     """
+    @staticmethod
+    @st.cache_resource(show_spinner="Loading embedding model...")
+    def load_embedder(model_name):
+        from sentence_transformers import SentenceTransformer
+        return SentenceTransformer(model_name)
+
     def __init__(self, chunks: List[Dict[str, Any]], config: RetrieverConfig):
         # Lazy import heavy libraries
         import numpy as np
         import hnswlib
-        from sentence_transformers import SentenceTransformer
         from rank_bm25 import BM25Okapi
         self.chunks = chunks
         try:
@@ -21,7 +26,7 @@ class Retriever:
                 raise ValueError("Chunks must be a list of dicts.")
             corpus = [c.get('narration', '').split() for c in chunks]
             self.bm25 = BM25Okapi(corpus)
-            self.embedder =
+            self.embedder = self.load_embedder(config.DENSE_MODEL)
             dim = len(self.embedder.encode(["test"])[0])
             self.ann = hnswlib.Index(space='cosine', dim=dim)
             self.ann.init_index(max_elements=len(chunks))
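The query path is likewise outside this diff. A sketch of how the BM25 scores and the HNSW index built in __init__ are commonly fused at query time; the retrieve name, the alpha weight, and the assumption that chunk order matches the ANN ids are illustrative, not taken from retriever.py:

```python
import numpy as np
from typing import Any, Dict, List


def retrieve(self, query: str, top_k: int = 5, alpha: float = 0.5) -> List[Dict[str, Any]]:
    """Blend normalized BM25 and dense cosine scores (hypothetical, not part of this commit)."""
    # Sparse scores over the whole corpus.
    sparse = np.asarray(self.bm25.get_scores(query.split()), dtype=float)
    if sparse.max() > 0:
        sparse = sparse / sparse.max()

    # Dense scores: hnswlib returns cosine distances, so convert to similarities.
    # Assumes chunks were added to the index with ids 0..len(chunks)-1.
    query_vec = self.embedder.encode([query])
    labels, distances = self.ann.knn_query(query_vec, k=len(self.chunks))
    dense = np.zeros(len(self.chunks))
    dense[labels[0]] = 1.0 - distances[0]

    combined = alpha * dense + (1.0 - alpha) * sparse
    best = np.argsort(combined)[::-1][:top_k]
    return [self.chunks[int(i)] for i in best]
```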
src/utils.py
CHANGED
@@ -6,7 +6,7 @@ import openai
 from typing import List
 from openai import AzureOpenAI
 from langchain_openai import AzureOpenAIEmbeddings
-from src import logger
+from src import logger
 
 
 class LLMClient:
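This hunk only touches the logger import, but utils.py is where gpp.py and qa.py source OpenAIEmbedder and LLMClient. The wrappers themselves are not in the diff; the following is a minimal sketch of what they could plausibly look like, built on the standard AzureOpenAI and AzureOpenAIEmbeddings APIs (all constructor arguments, env var names, deployment names, and method names below are assumptions, not the repo's actual code):

```python
import os
from typing import List

from openai import AzureOpenAI
from langchain_openai import AzureOpenAIEmbeddings


class LLMClient:
    """Thin wrapper over Azure OpenAI chat completions (hypothetical sketch)."""

    def __init__(self, deployment: str = "gpt-4o"):
        # Endpoint/key/env-var names are assumptions, not the repo's actual config.
        self.client = AzureOpenAI(
            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
            api_key=os.environ["AZURE_OPENAI_API_KEY"],
            api_version="2024-02-01",
        )
        self.deployment = deployment

    def complete(self, prompt: str) -> str:
        resp = self.client.chat.completions.create(
            model=self.deployment,
            messages=[{"role": "user", "content": prompt}],
        )
        return resp.choices[0].message.content


class OpenAIEmbedder:
    """Wrapper over AzureOpenAIEmbeddings returning plain float lists (hypothetical sketch)."""

    def __init__(self, deployment: str = "text-embedding-3-small"):
        # Reads Azure credentials from the environment by default.
        self.embeddings = AzureOpenAIEmbeddings(azure_deployment=deployment)

    def encode(self, texts: List[str]) -> List[List[float]]:
        return self.embeddings.embed_documents(texts)
```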