import os

# === CRITICAL: Set cache directories BEFORE any other imports ===
os.environ['HF_HOME'] = '/tmp/huggingface_cache'
os.environ['TRANSFORMERS_CACHE'] = '/tmp/transformers_cache'
os.environ['HF_DATASETS_CACHE'] = '/tmp/datasets_cache'

# Now import everything else
import json
import datetime
import requests
import gspread
from dotenv import load_dotenv
from huggingface_hub import login as hf_login
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings
from sentence_transformers import SentenceTransformer
from langchain_tavily import TavilySearch
from google.adk.tools import FunctionTool

# === LOAD ENV ===
load_dotenv()
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
SERVICE_ACCOUNT_JSON = os.getenv("GOOGLE_SERVICE_ACCOUNT_JSON")
SHEET_KEY = os.getenv("SHEET_KEY")
PREDICTOR_API_URL = os.getenv("PREDICTOR_API_URL")
PREDICTOR_API_KEY = os.getenv("PREDICTOR_API_KEY")

hf_login(token=HF_TOKEN)

# === GOOGLE SHEET LOGGING ===
service_account_dict = (
    json.loads(SERVICE_ACCOUNT_JSON)
    if isinstance(SERVICE_ACCOUNT_JSON, str)
    else SERVICE_ACCOUNT_JSON
)


def add_query_to_sheet(user_id: str, query: str, response: str):
    """Append a (user_id, timestamp, query, response) row to the logging sheet."""
    gc = gspread.service_account_from_dict(service_account_dict)
    sh = gc.open_by_key(SHEET_KEY)
    ws = sh.worksheet("Sheet1")
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    ws.append_row([user_id, timestamp, query, response])


# === VECTOR STORE ===
def load_vector_store(data_dir: str):
    """Read every .md file in data_dir and build a FAISS index over the texts."""
    texts = []
    for fname in os.listdir(data_dir):
        if fname.lower().endswith(".md"):
            path = os.path.join(data_dir, fname)
            try:
                with open(path, "r", encoding="utf-8") as f:
                    texts.append(f.read())
            except UnicodeDecodeError:
                # Fall back to latin-1 for files that are not valid UTF-8
                with open(path, "r", encoding="latin-1") as f:
                    texts.append(f.read())

    st_model = SentenceTransformer("all-MiniLM-L6-v2")

    class LocalEmbeddings(Embeddings):
        """Expose a local SentenceTransformer via LangChain's Embeddings interface."""

        def embed_documents(self, docs):
            return st_model.encode(docs).tolist()

        def embed_query(self, q):
            return st_model.encode([q])[0].tolist()

    return FAISS.from_texts(texts, LocalEmbeddings())


vector_store = load_vector_store("College_markdown")


# === TOOL DEFINITIONS ===
def db_search(query: str) -> dict:
    """Return the top-6 most similar chunks from the local vector store."""
    docs = vector_store.similarity_search(query, k=6)
    if not docs:
        return {"results": []}
    return {"results": [d.page_content for d in docs]}


def tavily_search(query: str) -> dict:
    """Run a Tavily web search and return the result snippets."""
    tool = TavilySearch(max_results=6, topic="general", include_raw_content=True)
    result = tool.invoke({"query": query})
    snippets = [item.get('content') for item in result.get('results', [])]
    return {"results": snippets or []}


def college_predictor(
    userCrl: int,
    userCategory: str,
    userGender: str,
    userHomeState: str,
    limit: int = 6,
    counsellingName: str = "josaa",
    collegeName: str = "national institute of technology",
    branchName: str = "computer science and engineering",
) -> str:
    """Query the predictor API and format matching colleges as numbered lines."""
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {PREDICTOR_API_KEY}",
    }
    params = {
        "userCrl": userCrl,
        "userCategory": userCategory,
        "userGender": userGender,
        "userHomeState": userHomeState,
        "limit": limit,
        "counsellingName": counsellingName,
    }
    if collegeName:
        params["collegeQuery"] = collegeName
    if branchName:
        params["branchQuery"] = branchName
    try:
        response = requests.post(PREDICTOR_API_URL, json=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        if not data or 'data' not in data or 'colleges' not in data['data']:
            return "No college predictions found with the given criteria."
        colleges = data['data']['colleges']
        if not colleges:
            return "No college predictions found with the given criteria."
        results = []
        for i, college in enumerate(colleges[:limit], start=1):
            parts = [f"{i}. College: {college.get('Institute', 'N/A')}"]
            if college.get('Academic_Program_Name'):
                parts.append(f"Branch: {college['Academic_Program_Name']}")
            if college.get('Seat_Type'):
                parts.append(f"Category: {college['Seat_Type']}")
            if college.get('Max_ClosingRank'):
                parts.append(f"Closing Rank: {college['Max_ClosingRank']}")
            results.append(", ".join(parts))
        return f"Based on your rank {userCrl}, here are college predictions:\n\n" + "\n".join(results)
    except requests.exceptions.RequestException as e:
        return f"Error fetching college predictions: {str(e)}"


def mentor_search(college_query: str) -> str:
    """Search mentors by college name and return formatted links."""
    url = "https://test.api.precollege.in/api/v1/mentor/search"
    try:
        # Pass the query via params so requests URL-encodes it safely
        response = requests.get(url, params={"q": college_query}, timeout=10)
        response.raise_for_status()
        data = response.json()
        if not data or "data" not in data or not data["data"]:
            return f"No mentors found for '{college_query}'."
        mentors = data["data"]
        lines = []
        for mentor in mentors:
            name = mentor.get("name", "Unknown")
            username = mentor.get("username", "")
            profile_url = f"https://www.precollege.in/mentor/{username}" if username else "No profile link"
            lines.append(f"{name}: {profile_url}")
        return f"Mentors for '{college_query}':\n\n" + "\n".join(lines)
    except requests.exceptions.RequestException as e:
        return f"Failed to fetch mentors: {str(e)}"


# === FUNCTION TOOL WRAPPERS ===
db_tool = FunctionTool(db_search)
tavily_tool = FunctionTool(tavily_search)
predictor_tool = FunctionTool(college_predictor)
mentor_tool = FunctionTool(mentor_search)
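

# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal example of how these FunctionTool wrappers could be registered on
# an ADK agent, plus direct calls to the underlying functions as a smoke test.
# The agent name, model string, instruction text, and all sample arguments
# below are placeholder assumptions, not values from the original code.
if __name__ == "__main__":
    from google.adk.agents import Agent

    root_agent = Agent(
        name="college_assistant",            # hypothetical agent name
        model="gemini-2.0-flash",            # hypothetical model choice
        instruction="Answer college-admission queries with the given tools.",
        tools=[db_tool, tavily_tool, predictor_tool, mentor_tool],
    )

    # Exercise the plain functions behind the wrappers directly:
    print(db_search("hostel fees")["results"][:1])
    print(mentor_search("IIT Bombay"))       # hypothetical college query
    print(college_predictor(
        userCrl=15000,                        # hypothetical sample rank
        userCategory="OPEN",
        userGender="Gender-Neutral",
        userHomeState="Maharashtra",
    ))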