Spaces:

ankanghosh
/

anveshak

Running

App Files Files Community

ankanghosh committed on Mar 19

Commit

62d1e75

verified ·

1 Parent(s): b7d14dc

Upload application files.

Browse files

Files changed (3) hide show

rag_engine.py +287 -0
requirements.txt +6 -0
utils.py +95 -0

rag_engine.py ADDED Viewed

	@@ -0,0 +1,287 @@

+import os
+import json
+import numpy as np
+import faiss
+import torch
+import torch.nn as nn
+from google.cloud import storage
+from transformers import AutoTokenizer, AutoModel
+import openai
+import textwrap
+import unicodedata
+import streamlit as st
+from utils import setup_gcp_auth, setup_openai_auth
+# Initialize session state for model and tokenizer
+if 'model' not in st.session_state:
+    st.session_state.model = None
+if 'tokenizer' not in st.session_state:
+    st.session_state.tokenizer = None
+if 'device' not in st.session_state:
+    st.session_state.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"Using device: {st.session_state.device}")
+# Load GCP authentication from utility function
+try:
+    credentials = setup_gcp_auth()
+    storage_client = storage.Client(credentials=credentials)
+    bucket_name = "indian_spiritual-1"
+    bucket = storage_client.bucket(bucket_name)
+    print("✅ GCP client initialized successfully")
+except Exception as e:
+    print(f"❌ GCP client initialization error: {str(e)}")
+    raise
+# Setup OpenAI authentication
+try:
+    setup_openai_auth()
+    print("✅ OpenAI client initialized successfully")
+except Exception as e:
+    print(f"❌ OpenAI client initialization error: {str(e)}")
+    raise
+# GCS Paths
+metadata_file_gcs = "metadata/metadata.jsonl"
+embeddings_file_gcs = "processed/embeddings/all_embeddings.npy"
+faiss_index_file_gcs = "processed/indices/faiss_index.faiss"
+text_chunks_file_gcs = "processed/chunks/text_chunks.txt"
+# Local Paths
+local_embeddings_file = "all_embeddings.npy"
+local_faiss_index_file = "faiss_index.faiss"
+local_text_chunks_file = "text_chunks.txt"
+local_metadata_file = "metadata.jsonl"
+def load_model():
+    try:
+        if st.session_state.model is None:
+            # Force model to CPU - more stable than GPU for this use case
+            os.environ["CUDA_VISIBLE_DEVICES"] = ""
+            print("Loading tokenizer...")
+            tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-small-v2")
+            print("Loading model...")
+            model = AutoModel.from_pretrained(
+                "intfloat/e5-small-v2",
+                torch_dtype=torch.float16,  # Use half precision
+                low_cpu_mem_usage=True,
+                device_map="auto"  # Let transformers decide
+            )
+            model.eval()
+            torch.set_grad_enabled(False)
+            st.session_state.tokenizer = tokenizer
+            st.session_state.model = model
+            print("✅ Model loaded successfully")
+        return st.session_state.tokenizer, st.session_state.model
+    except Exception as e:
+        print(f"❌ Error loading model: {str(e)}")
+        raise
+def download_file_from_gcs(gcs_path, local_path):
+    """Download a file from GCS to local storage."""
+    blob = bucket.blob(gcs_path)
+    blob.download_to_filename(local_path)
+    print(f"✅ Downloaded {gcs_path} → {local_path}")
+# Download necessary files
+download_file_from_gcs(faiss_index_file_gcs, local_faiss_index_file)
+download_file_from_gcs(text_chunks_file_gcs, local_text_chunks_file)
+download_file_from_gcs(metadata_file_gcs, local_metadata_file)
+# Load FAISS index
+faiss_index = faiss.read_index(local_faiss_index_file)
+# Load text chunks
+text_chunks = {}  # {ID -> (Title, Author, Text)}
+with open(local_text_chunks_file, "r", encoding="utf-8") as f:
+    for line in f:
+        parts = line.strip().split("\t")
+        if len(parts) == 4:
+            text_chunks[int(parts[0])] = (parts[1], parts[2], parts[3])
+# Load metadata.jsonl for publisher information
+metadata_dict = {}
+with open(local_metadata_file, "r", encoding="utf-8") as f:
+    for line in f:
+        item = json.loads(line)
+        metadata_dict[item["Title"]] = item  # Store for easy lookup
+print(f"✅ FAISS index and text chunks loaded. {len(text_chunks)} passages available.")
+def average_pool(last_hidden_states, attention_mask):
+    """Average pooling for sentence embeddings."""
+    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
+    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+query_embedding_cache = {}
+def get_embedding(text):
+    if text in query_embedding_cache:
+        return query_embedding_cache[text]
+    try:
+        tokenizer, model = load_model()
+        input_text = f"query: {text}" if len(text) < 512 else f"passage: {text}"
+        inputs = tokenizer(
+            input_text,
+            padding=True,
+            truncation=True,
+            return_tensors="pt",
+            max_length=512,
+            return_attention_mask=True
+        )
+        # Move to CPU explicitly before processing
+        inputs = {k: v.to('cpu') for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = model(**inputs)
+            embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
+            embeddings = nn.functional.normalize(embeddings, p=2, dim=1)
+            # Ensure we detach and move to numpy on CPU
+            embeddings = embeddings.detach().cpu().numpy()
+        # Explicitly clean up
+        del outputs
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
+        query_embedding_cache[text] = embeddings
+        return embeddings
+    except Exception as e:
+        print(f"❌ Embedding error: {str(e)}")
+        return np.zeros((1, 1024), dtype=np.float32)
+def retrieve_passages(query, top_k=5, similarity_threshold=0.5):
+    """Retrieve top-k most relevant passages using FAISS with metadata."""
+    try:
+        print(f"\n🔍 Retrieving passages for query: {query}")
+        query_embedding = get_embedding(query)
+        distances, indices = faiss_index.search(query_embedding, top_k * 2)
+        print(f"Found {len(distances[0])} potential matches")
+        retrieved_passages = []
+        retrieved_sources = []
+        cited_titles = set()
+        for dist, idx in zip(distances[0], indices[0]):
+            print(f"Distance: {dist:.4f}, Index: {idx}")
+            if idx in text_chunks and dist >= similarity_threshold:
+                title_with_txt, author, text = text_chunks[idx]
+                # Normalize title and remove .txt
+                clean_title = title_with_txt.replace(".txt", "") if title_with_txt.endswith(".txt") else title_with_txt
+                clean_title = unicodedata.normalize("NFC", clean_title)
+                # Ensure unique citations
+                if clean_title in cited_titles:
+                    continue
+                metadata_entry = metadata_dict.get(clean_title, {})
+                author = metadata_entry.get("Author", "Unknown")
+                publisher = metadata_entry.get("Publisher", "Unknown")
+                cited_titles.add(clean_title)
+                retrieved_passages.append(text)
+                retrieved_sources.append((clean_title, author, publisher))
+                if len(retrieved_passages) == top_k:
+                    break
+        print(f"Retrieved {len(retrieved_passages)} passages")
+        return retrieved_passages, retrieved_sources
+    except Exception as e:
+        print(f"❌ Error in retrieve_passages: {str(e)}")
+        return [], []
+def answer_with_llm(query, context=None, word_limit=100):
+    """
+    Generate an answer using OpenAI GPT model with formatted citations.
+    """
+    try:
+        if context:
+            formatted_contexts = []
+            total_chars = 0
+            max_context_chars = 4000
+            for (title, author, publisher), text in context:
+                remaining_space = max(0, max_context_chars - total_chars)
+                excerpt_len = min(150, remaining_space)
+                if excerpt_len > 50:
+                    excerpt = text[:excerpt_len].strip() + "..." if len(text) > excerpt_len else text
+                    formatted_context = f"[{title} by {author}, Published by {publisher}] {excerpt}"
+                    formatted_contexts.append(formatted_context)
+                    total_chars += len(formatted_context)
+                if total_chars >= max_context_chars:
+                    break
+            formatted_context = "\n".join(formatted_contexts)
+        else:
+            formatted_context = "No relevant information available."
+        # System message
+        system_message = (
+            "You are an AI specialized in Indian spiritual texts. "
+            "Answer based on context, summarizing ideas rather than quoting verbatim. "
+            "Ensure proper citation and do not include direct excerpts."
+        )
+        user_message = f"""
+        Context:
+        {formatted_context}
+        Question:
+        {query}
+        """
+        response = openai.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "system", "content": system_message},
+                {"role": "user", "content": user_message}
+            ],
+            max_tokens=200,
+            temperature=0.7
+        )
+        answer = response.choices[0].message.content.strip()
+        # Enforce word limit
+        words = answer.split()
+        if len(words) > word_limit:
+            answer = " ".join(words[:word_limit])
+            if not answer.endswith((".", "!", "?")):
+                answer += "."
+        return answer
+    except Exception as e:
+        print(f"❌ LLM API error: {str(e)}")
+        return "I apologize, but I'm unable to answer at the moment."
+def process_query(query, top_k=5, word_limit=100):
+    """Process a query through the RAG pipeline with proper formatting."""
+    print(f"\n🔍 Processing query: {query}")
+    retrieved_context, retrieved_sources = retrieve_passages(query, top_k=top_k)
+    sources = format_citations(retrieved_sources) if retrieved_sources else "No citation available."
+    if retrieved_context:
+        context_with_sources = list(zip(retrieved_sources, retrieved_context))
+        llm_answer_with_rag = answer_with_llm(query, context_with_sources, word_limit=word_limit)
+    else:
+        llm_answer_with_rag = "⚠️ No relevant context found."
+    return {"query": query, "answer_with_rag": llm_answer_with_rag, "citations": sources}
+def format_citations(sources):
+    """Format citations to display each one on a new line."""
+    return "\n".join([f"📚 {title} by {author}, Published by {publisher}" for title, author, publisher in sources])

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+faiss-cpu==1.10.0
+transformers==4.38.2
+openai==1.14.1
+google-cloud-storage==2.14.0
+google-auth>=2.28.1
+streamlit>=1.32.0

utils.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import os
+import json
+from google.oauth2 import service_account
+import streamlit as st
+import openai
+def setup_gcp_auth():
+    """Setup GCP authentication from HF Spaces, environment variables, or Streamlit secrets."""
+    try:
+        # Option 1: HF Spaces environment variable
+        if "GCP_CREDENTIALS" in os.environ:
+            gcp_credentials = json.loads(os.getenv("GCP_CREDENTIALS"))
+            print("✅ Using GCP credentials from HF Spaces environment variable")
+            credentials = service_account.Credentials.from_service_account_info(gcp_credentials)
+            return credentials
+        # Option 2: Local environment variable pointing to file
+        elif "GOOGLE_APPLICATION_CREDENTIALS" in os.environ:
+            credentials_path = os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
+            print(f"✅ Using GCP credentials from file at {credentials_path}")
+            credentials = service_account.Credentials.from_service_account_file(credentials_path)
+            return credentials
+        # Option 3: Streamlit secrets
+        elif "gcp_credentials" in st.secrets:
+            gcp_credentials = st.secrets["gcp_credentials"]
+            # Handle different secret formats
+            if isinstance(gcp_credentials, dict) or hasattr(gcp_credentials, 'to_dict'):
+                # Convert AttrDict to dict if needed
+                if hasattr(gcp_credentials, 'to_dict'):
+                    gcp_credentials = gcp_credentials.to_dict()
+                print("✅ Using GCP credentials from Streamlit secrets (dict format)")
+                credentials = service_account.Credentials.from_service_account_info(gcp_credentials)
+                return credentials
+            else:
+                # Assume it's a JSON string
+                try:
+                    gcp_credentials_dict = json.loads(gcp_credentials)
+                    print("✅ Using GCP credentials from Streamlit secrets (JSON string)")
+                    credentials = service_account.Credentials.from_service_account_info(gcp_credentials_dict)
+                    return credentials
+                except json.JSONDecodeError:
+                    print("⚠️ GCP credentials in Streamlit secrets is not valid JSON, trying as file path")
+                    if os.path.exists(gcp_credentials):
+                        credentials = service_account.Credentials.from_service_account_file(gcp_credentials)
+                        return credentials
+                    else:
+                        raise ValueError("GCP credentials format not recognized")
+        else:
+            raise ValueError("No GCP credentials found in environment or Streamlit secrets")
+    except Exception as e:
+        error_msg = f"❌ Authentication error: {str(e)}"
+        print(error_msg)
+        st.error(error_msg)
+        raise
+def setup_openai_auth():
+    """Setup OpenAI API authentication from environment variables or Streamlit secrets."""
+    try:
+        # Option 1: Standard environment variable
+        if "OPENAI_API_KEY" in os.environ:
+            openai.api_key = os.getenv("OPENAI_API_KEY")
+            print("✅ Using OpenAI API key from environment variable")
+            return
+        # Option 2: HF Spaces environment variable with different name
+        elif "OPENAI_KEY" in os.environ:
+            openai.api_key = os.getenv("OPENAI_KEY")
+            print("✅ Using OpenAI API key from HF Spaces environment variable")
+            return
+        # Option 3: Streamlit secrets
+        elif "openai_api_key" in st.secrets:
+            openai.api_key = st.secrets["openai_api_key"]
+            print("✅ Using OpenAI API key from Streamlit secrets")
+            return
+        else:
+            raise ValueError("No OpenAI API key found in environment or Streamlit secrets")
+    except Exception as e:
+        error_msg = f"❌ OpenAI authentication error: {str(e)}"
+        print(error_msg)
+        st.error(error_msg)
+        raise
+def setup_all_auth():
+    """Setup all authentication in one call"""
+    gcp_creds = setup_gcp_auth()
+    setup_openai_auth()
+    return gcp_creds