Abhinav Gavireddi committed on
Commit c613bb1 · 1 Parent(s): f5d3669

[fix]: Fixed code issues

.github/workflows/{ci.yaml → pipeline.yaml} RENAMED
File without changes
src/gpp.py CHANGED
@@ -19,7 +19,8 @@ from typing import List, Dict, Any, Optional
 import re
 
 from src import EmbeddingConfig, GPPConfig
-from src.utils import OpenAIEmbedder, LLMClient, logger
+from src.utils import OpenAIEmbedder, LLMClient
+from src import logger
 
 def parse_markdown_table(md: str) -> Optional[Dict[str, Any]]:
     """
src/qa.py CHANGED
@@ -10,6 +10,7 @@ Each component is modular and can be swapped or extended (e.g., add HyDE retriev
 """
 import os
 from typing import List, Dict, Any, Tuple
+import streamlit as st
 
 from src import RerankerConfig, logger
 from src.utils import LLMClient
@@ -19,13 +20,18 @@ class Reranker:
     """
     Cross-encoder re-ranker using a transformer-based sequence classification model.
     """
+    @staticmethod
+    @st.cache_resource(show_spinner="Loading reranker model...")
+    def load_model_and_tokenizer(model_name, device):
+        from transformers import AutoTokenizer, AutoModelForSequenceClassification
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name)
+        model.to(device)
+        return tokenizer, model
+
     def __init__(self, config: RerankerConfig):
         try:
-            from transformers import AutoTokenizer, AutoModelForSequenceClassification
-            import torch
-            self.tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
-            self.model = AutoModelForSequenceClassification.from_pretrained(config.MODEL_NAME)
-            self.model.to(config.DEVICE)
+            self.tokenizer, self.model = self.load_model_and_tokenizer(config.MODEL_NAME, config.DEVICE)
         except Exception as e:
             logger.error(f'Failed to load reranker model: {e}')
             raise
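
The qa.py change moves the heavy transformers load out of __init__ and into a @st.cache_resource static method, so Streamlit reruns reuse one tokenizer/model pair instead of reloading it on every script execution. The commit does not show the scoring path; the snippet below is only a hedged sketch of how a cross-encoder re-ranker typically uses such a cached pair, and the rerank helper, the single-logit head, and the 512-token limit are illustrative assumptions rather than code from this repo.

from typing import List

import torch

def rerank(tokenizer, model, query: str, passages: List[str], top_k: int = 5) -> List[str]:
    """Hypothetical sketch: score (query, passage) pairs with the cached cross-encoder."""
    inputs = tokenizer(
        [query] * len(passages),
        passages,
        padding=True,
        truncation=True,
        max_length=512,  # assumed limit; depends on the chosen model
        return_tensors="pt",
    ).to(model.device)
    with torch.no_grad():
        # Assumes a single-logit relevance head (num_labels=1), common for re-rankers.
        scores = model(**inputs).logits.squeeze(-1)
    order = torch.argsort(scores, descending=True)[:top_k].tolist()
    return [passages[i] for i in order]

With the cached loader this would be called roughly as: tokenizer, model = Reranker.load_model_and_tokenizer(cfg.MODEL_NAME, cfg.DEVICE), then rerank(tokenizer, model, query, candidate_passages).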
src/retriever.py CHANGED
@@ -1,18 +1,23 @@
 import os
 from typing import List, Dict, Any
+import streamlit as st
 
-from src.config import RetrieverConfig
-from src import logger # Use logger from src/__init__.py
+from src import RetrieverConfig, logger
 
 class Retriever:
     """
     Hybrid retriever combining BM25 sparse and dense retrieval (no Redis).
     """
+    @staticmethod
+    @st.cache_resource(show_spinner="Loading embedding model...")
+    def load_embedder(model_name):
+        from sentence_transformers import SentenceTransformer
+        return SentenceTransformer(model_name)
+
     def __init__(self, chunks: List[Dict[str, Any]], config: RetrieverConfig):
         # Lazy import heavy libraries
         import numpy as np
         import hnswlib
-        from sentence_transformers import SentenceTransformer
         from rank_bm25 import BM25Okapi
         self.chunks = chunks
         try:
@@ -21,7 +26,7 @@ class Retriever:
                 raise ValueError("Chunks must be a list of dicts.")
             corpus = [c.get('narration', '').split() for c in chunks]
             self.bm25 = BM25Okapi(corpus)
-            self.embedder = SentenceTransformer(config.DENSE_MODEL)
+            self.embedder = self.load_embedder(config.DENSE_MODEL)
            dim = len(self.embedder.encode(["test"])[0])
            self.ann = hnswlib.Index(space='cosine', dim=dim)
            self.ann.init_index(max_elements=len(chunks))
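
retriever.py gets the same treatment: the SentenceTransformer load becomes a cached resource, the config import now comes from the src package, and the BM25 and hnswlib indexes are still built per Retriever instance. The fusion logic itself is not in this hunk; the sketch below shows one plausible way to combine the two signals, where the ef setting, the min-max normalization, and the 0.5/0.5 weighting are assumptions rather than the repository's method.

import numpy as np

def hybrid_search(retriever, query: str, top_k: int = 5, alpha: float = 0.5):
    """Hypothetical sketch: fuse the BM25 and dense scores built in Retriever.__init__."""
    n = len(retriever.chunks)

    # Sparse signal: BM25 over the whitespace-tokenized 'narration' corpus.
    sparse = np.asarray(retriever.bm25.get_scores(query.split()), dtype=float)

    # Dense signal: hnswlib cosine index (distance = 1 - cosine similarity).
    # Assumes all chunk embeddings were added to the index after init_index.
    retriever.ann.set_ef(max(64, n))  # ef must be >= k for knn_query
    q_vec = retriever.embedder.encode([query])
    labels, distances = retriever.ann.knn_query(q_vec, k=n)
    dense = np.zeros(n)
    dense[labels[0]] = 1.0 - distances[0]

    def minmax(x: np.ndarray) -> np.ndarray:  # assumed normalization, not the repo's
        rng = x.max() - x.min()
        return (x - x.min()) / rng if rng > 0 else np.zeros_like(x)

    fused = alpha * minmax(sparse) + (1 - alpha) * minmax(dense)
    best = np.argsort(fused)[::-1][:top_k]
    return [retriever.chunks[i] for i in best]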
src/utils.py CHANGED
@@ -6,7 +6,7 @@ import openai
 from typing import List
 from openai import AzureOpenAI
 from langchain_openai import AzureOpenAIEmbeddings
-from src import logger # Import logger from src/__init__.py
+from src import logger
 
 
 class LLMClient: