Spaces:

ankanghosh
/

anveshak

Sleeping

App Files Files Community

ankanghosh committed on Mar 20

Commit

6aa479a

verified ·

1 Parent(s): 1ab74f5

Update rag_engine.py

Browse files

Files changed (1) hide show

rag_engine.py +66 -63

rag_engine.py CHANGED Viewed

@@ -11,23 +11,22 @@ import textwrap
 import unicodedata
 import streamlit as st
 from utils import setup_gcp_auth, setup_openai_auth
-import gc  # Added for explicit garbage collection
 # Force model to CPU for stability
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
-# Create a function to initialize session state
-def initialize_session_state():
-    if 'model_initialized' not in st.session_state:
-        st.session_state.model_initialized = False
-        st.session_state.model = None
-        st.session_state.tokenizer = None
-        st.session_state.device = torch.device("cpu")
-        st.session_state.data_loaded = False
-        print("Initialized session state variables")
-# Call the initialization function right away
-initialize_session_state()
 # Load GCP authentication from utility function
 def setup_gcp_client():
@@ -52,59 +51,49 @@ def setup_openai_client():
         print(f"❌ OpenAI client initialization error: {str(e)}")
         return False
-# GCS Paths
-metadata_file_gcs = "metadata/metadata.jsonl"
-embeddings_file_gcs = "processed/embeddings/all_embeddings.npy"
-faiss_index_file_gcs = "processed/indices/faiss_index.faiss"
-text_chunks_file_gcs = "processed/chunks/text_chunks.txt"
-# Local Paths
-local_embeddings_file = "all_embeddings.npy"
-local_faiss_index_file = "faiss_index.faiss"
-local_text_chunks_file = "text_chunks.txt"
-local_metadata_file = "metadata.jsonl"
 def load_model():
     try:
-        # Check if model is already loaded
-        if st.session_state.model is not None and st.session_state.tokenizer is not None:
-            print("Model already loaded, reusing existing instance")
             return st.session_state.tokenizer, st.session_state.model
-        # Force model to CPU - more stable than GPU for this use case
-        os.environ["CUDA_VISIBLE_DEVICES"] = ""
-        print("Loading tokenizer...")
-        tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-small-v2")
-        print("Loading model...")
         model = AutoModel.from_pretrained(
             "intfloat/e5-small-v2",
-            torch_dtype=torch.float16  # Use half precision
         )
-        # Move model to CPU explicitly
-        model = model.to('cpu')
         model.eval()
         torch.set_grad_enabled(False)
         # Store in session state
         st.session_state.tokenizer = tokenizer
         st.session_state.model = model
-        st.session_state.model_initialized = True
         print("✅ Model loaded successfully")
         return tokenizer, model
     except Exception as e:
         print(f"❌ Error loading model: {str(e)}")
-        # Return None values instead of raising to avoid crashing
         return None, None
 def download_file_from_gcs(bucket, gcs_path, local_path):
     """Download a file from GCS to local storage."""
     try:
-        # Check if file already exists locally
         if os.path.exists(local_path):
             print(f"File already exists locally: {local_path}")
             return True
@@ -118,12 +107,13 @@ def download_file_from_gcs(bucket, gcs_path, local_path):
         return False
 def load_data_files():
     # Check if already loaded in session state
-    if hasattr(st.session_state, 'faiss_index') and st.session_state.faiss_index is not None:
         print("Using cached data files from session state")
         return st.session_state.faiss_index, st.session_state.text_chunks, st.session_state.metadata_dict
-    # Initialize GCP and OpenAI clients
     bucket = setup_gcp_client()
     openai_initialized = setup_openai_client()
@@ -160,24 +150,23 @@ def load_data_files():
         print(f"❌ Error loading text chunks: {str(e)}")
         return None, None, None
-    # Load metadata.jsonl for publisher information
     try:
         metadata_dict = {}
         with open(local_metadata_file, "r", encoding="utf-8") as f:
             for line in f:
                 item = json.loads(line)
-                metadata_dict[item["Title"]] = item  # Store for easy lookup
     except Exception as e:
         print(f"❌ Error loading metadata: {str(e)}")
         return None, None, None
-    print(f"✅ FAISS index and text chunks loaded. {len(text_chunks)} passages available.")
     # Store in session state
     st.session_state.faiss_index = faiss_index
     st.session_state.text_chunks = text_chunks
     st.session_state.metadata_dict = metadata_dict
-    st.session_state.data_loaded = True
     return faiss_index, text_chunks, metadata_dict
@@ -186,25 +175,31 @@ def average_pool(last_hidden_states, attention_mask):
     last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
 query_embedding_cache = {}
 def get_embedding(text):
     if text in query_embedding_cache:
         return query_embedding_cache[text]
     try:
-        # Ensure model initialization
-        if not hasattr(st.session_state, 'model') or st.session_state.model is None:
             tokenizer, model = load_model()
-            if model is None:
-                return np.zeros((1, 384), dtype=np.float32)
         else:
             tokenizer, model = st.session_state.tokenizer, st.session_state.model
         # Prepare text
         input_text = f"query: {text}" if len(text) < 512 else f"passage: {text}"
-        # Explicitly specify truncation parameters
         inputs = tokenizer(
             input_text,
             padding=True,
@@ -214,20 +209,18 @@ def get_embedding(text):
             return_attention_mask=True
         )
-        # Move to CPU explicitly
-        inputs = {k: v.to('cpu') for k, v in inputs.items()}
         with torch.no_grad():
             outputs = model(**inputs)
             embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
             embeddings = nn.functional.normalize(embeddings, p=2, dim=1)
             embeddings = embeddings.detach().cpu().numpy()
-        # Explicitly clean up
         del outputs, inputs
         gc.collect()
-        torch.cuda.empty_cache() if torch.cuda.is_available() else None
         query_embedding_cache[text] = embeddings
         return embeddings
     except Exception as e:
@@ -238,7 +231,11 @@ def retrieve_passages(query, faiss_index, text_chunks, metadata_dict, top_k=5, s
     """Retrieve top-k most relevant passages using FAISS with metadata."""
     try:
         print(f"\n🔍 Retrieving passages for query: {query}")
         query_embedding = get_embedding(query)
         distances, indices = faiss_index.search(query_embedding, top_k * 2)
         print(f"Found {len(distances[0])} potential matches")
@@ -246,29 +243,31 @@ def retrieve_passages(query, faiss_index, text_chunks, metadata_dict, top_k=5, s
         retrieved_sources = []
         cited_titles = set()
         for dist, idx in zip(distances[0], indices[0]):
             print(f"Distance: {dist:.4f}, Index: {idx}")
             if idx in text_chunks and dist >= similarity_threshold:
                 title_with_txt, author, text = text_chunks[idx]
-                # Normalize title and remove .txt
                 clean_title = title_with_txt.replace(".txt", "") if title_with_txt.endswith(".txt") else title_with_txt
                 clean_title = unicodedata.normalize("NFC", clean_title)
-                # Ensure unique citations
                 if clean_title in cited_titles:
                     continue
-                # Get metadata safely
                 metadata_entry = metadata_dict.get(clean_title, {})
                 author = metadata_entry.get("Author", "Unknown")
                 publisher = metadata_entry.get("Publisher", "Unknown")
                 cited_titles.add(clean_title)
                 retrieved_passages.append(text)
                 retrieved_sources.append((clean_title, author, publisher))
                 if len(retrieved_passages) == top_k:
                     break
@@ -279,10 +278,9 @@ def retrieve_passages(query, faiss_index, text_chunks, metadata_dict, top_k=5, s
         return [], []
 def answer_with_llm(query, context=None, word_limit=100):
-    """
-    Generate an answer using OpenAI GPT model with formatted citations.
-    """
     try:
         if context:
             formatted_contexts = []
             total_chars = 0
@@ -312,6 +310,7 @@ def answer_with_llm(query, context=None, word_limit=100):
             "Ensure proper citation and do not include direct excerpts."
         )
         user_message = f"""
         Context:
         {formatted_context}
@@ -319,6 +318,7 @@ def answer_with_llm(query, context=None, word_limit=100):
         {query}
         """
         response = openai.chat.completions.create(
             model="gpt-3.5-turbo",
             messages=[
@@ -371,6 +371,7 @@ def process_query(query, top_k=5, word_limit=100):
             "citations": "No citations available."
         }
     retrieved_context, retrieved_sources = retrieve_passages(
         query,
         faiss_index,
@@ -379,8 +380,10 @@ def process_query(query, top_k=5, word_limit=100):
         top_k=top_k
     )
     sources = format_citations(retrieved_sources) if retrieved_sources else "No citation available."
     if retrieved_context:
         context_with_sources = list(zip(retrieved_sources, retrieved_context))
         llm_answer_with_rag = answer_with_llm(query, context_with_sources, word_limit=word_limit)

 import unicodedata
 import streamlit as st
 from utils import setup_gcp_auth, setup_openai_auth
+import gc
 # Force model to CPU for stability
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
+# GCS Paths
+metadata_file_gcs = "metadata/metadata.jsonl"
+embeddings_file_gcs = "processed/embeddings/all_embeddings.npy"
+faiss_index_file_gcs = "processed/indices/faiss_index.faiss"
+text_chunks_file_gcs = "processed/chunks/text_chunks.txt"
+# Local Paths
+local_embeddings_file = "all_embeddings.npy"
+local_faiss_index_file = "faiss_index.faiss"
+local_text_chunks_file = "text_chunks.txt"
+local_metadata_file = "metadata.jsonl"
 # Load GCP authentication from utility function
 def setup_gcp_client():
         print(f"❌ OpenAI client initialization error: {str(e)}")
         return False
 def load_model():
+    """Load the embedding model and store in session state"""
     try:
+        # Check if model already loaded
+        if 'model' in st.session_state and st.session_state.model is not None:
+            print("Model already loaded in session state")
             return st.session_state.tokenizer, st.session_state.model
+        print("Loading new model instance...")
+        # Force model to CPU
+        device = torch.device("cpu")
+        # Load tokenizer and model
+        tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-small-v2")
         model = AutoModel.from_pretrained(
             "intfloat/e5-small-v2",
+            torch_dtype=torch.float16
         )
+        # Move to CPU and set to eval mode
+        model = model.to(device)
         model.eval()
+        # Disable gradient computation
         torch.set_grad_enabled(False)
         # Store in session state
         st.session_state.tokenizer = tokenizer
         st.session_state.model = model
         print("✅ Model loaded successfully")
         return tokenizer, model
     except Exception as e:
         print(f"❌ Error loading model: {str(e)}")
+        # Return None values - don't raise exception
         return None, None
 def download_file_from_gcs(bucket, gcs_path, local_path):
     """Download a file from GCS to local storage."""
     try:
+        # Check if file already exists
         if os.path.exists(local_path):
             print(f"File already exists locally: {local_path}")
             return True
         return False
 def load_data_files():
+    """Load FAISS index, text chunks, and metadata"""
     # Check if already loaded in session state
+    if 'faiss_index' in st.session_state and st.session_state.faiss_index is not None:
         print("Using cached data files from session state")
         return st.session_state.faiss_index, st.session_state.text_chunks, st.session_state.metadata_dict
+    # Initialize clients
     bucket = setup_gcp_client()
     openai_initialized = setup_openai_client()
         print(f"❌ Error loading text chunks: {str(e)}")
         return None, None, None
+    # Load metadata
     try:
         metadata_dict = {}
         with open(local_metadata_file, "r", encoding="utf-8") as f:
             for line in f:
                 item = json.loads(line)
+                metadata_dict[item["Title"]] = item
     except Exception as e:
         print(f"❌ Error loading metadata: {str(e)}")
         return None, None, None
+    print(f"✅ Data loaded successfully: {len(text_chunks)} passages available")
     # Store in session state
     st.session_state.faiss_index = faiss_index
     st.session_state.text_chunks = text_chunks
     st.session_state.metadata_dict = metadata_dict
     return faiss_index, text_chunks, metadata_dict
     last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+# Cache for query embeddings
 query_embedding_cache = {}
 def get_embedding(text):
+    """Generate embeddings for a text query"""
+    # Check cache first
     if text in query_embedding_cache:
         return query_embedding_cache[text]
     try:
+        # Get model
+        if 'model' not in st.session_state or st.session_state.model is None:
             tokenizer, model = load_model()
         else:
             tokenizer, model = st.session_state.tokenizer, st.session_state.model
+        # Handle model load failure
+        if model is None:
+            print("Model is None, returning zero embedding")
+            return np.zeros((1, 384), dtype=np.float32)
         # Prepare text
         input_text = f"query: {text}" if len(text) < 512 else f"passage: {text}"
+        # Tokenize
         inputs = tokenizer(
             input_text,
             padding=True,
             return_attention_mask=True
         )
+        # Generate embeddings
         with torch.no_grad():
             outputs = model(**inputs)
             embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
             embeddings = nn.functional.normalize(embeddings, p=2, dim=1)
             embeddings = embeddings.detach().cpu().numpy()
+        # Clean up
         del outputs, inputs
         gc.collect()
+        # Cache and return
         query_embedding_cache[text] = embeddings
         return embeddings
     except Exception as e:
     """Retrieve top-k most relevant passages using FAISS with metadata."""
     try:
         print(f"\n🔍 Retrieving passages for query: {query}")
+        # Get query embedding
         query_embedding = get_embedding(query)
+        # Search in FAISS index
         distances, indices = faiss_index.search(query_embedding, top_k * 2)
         print(f"Found {len(distances[0])} potential matches")
         retrieved_sources = []
         cited_titles = set()
+        # Process results
         for dist, idx in zip(distances[0], indices[0]):
             print(f"Distance: {dist:.4f}, Index: {idx}")
             if idx in text_chunks and dist >= similarity_threshold:
                 title_with_txt, author, text = text_chunks[idx]
+                # Clean title
                 clean_title = title_with_txt.replace(".txt", "") if title_with_txt.endswith(".txt") else title_with_txt
                 clean_title = unicodedata.normalize("NFC", clean_title)
+                # Skip duplicates
                 if clean_title in cited_titles:
                     continue
+                # Get metadata
                 metadata_entry = metadata_dict.get(clean_title, {})
                 author = metadata_entry.get("Author", "Unknown")
                 publisher = metadata_entry.get("Publisher", "Unknown")
+                # Add to results
                 cited_titles.add(clean_title)
                 retrieved_passages.append(text)
                 retrieved_sources.append((clean_title, author, publisher))
+                # Stop if we have enough
                 if len(retrieved_passages) == top_k:
                     break
         return [], []
 def answer_with_llm(query, context=None, word_limit=100):
+    """Generate an answer using OpenAI GPT model with formatted citations."""
     try:
+        # Format context
         if context:
             formatted_contexts = []
             total_chars = 0
             "Ensure proper citation and do not include direct excerpts."
         )
+        # User message
         user_message = f"""
         Context:
         {formatted_context}
         {query}
         """
+        # Call OpenAI API
         response = openai.chat.completions.create(
             model="gpt-3.5-turbo",
             messages=[
             "citations": "No citations available."
         }
+    # Get relevant passages
     retrieved_context, retrieved_sources = retrieve_passages(
         query,
         faiss_index,
         top_k=top_k
     )
+    # Format citations
     sources = format_citations(retrieved_sources) if retrieved_sources else "No citation available."
+    # Generate answer
     if retrieved_context:
         context_with_sources = list(zip(retrieved_sources, retrieved_context))
         llm_answer_with_rag = answer_with_llm(query, context_with_sources, word_limit=word_limit)