# EvoPlatformV3 / inference.py
# HemanM's picture
# Update inference.py
# 8ed51aa verified
# raw
# history blame
# 3.04 kB
import os
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from evo_model import EvoTransformerV22
from search_utils import web_search
import openai
import time
# 🔐 Load the OpenAI API key from the environment instead of hard-coding it.
# os.getenv returns None when OPENAI_API_KEY is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")
# 🔁 Track model changes: load_model() compares the checkpoint's mtime with
# last_mod_time and caches the loaded network in `model`.
MODEL_PATH = "evo_hellaswag.pt"
last_mod_time = 0  # mtime recorded by the last load; 0 means "never loaded"
model = None  # populated lazily by load_model()
# Tokenizer is created once at import time and reused by get_evo_response().
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# 📦 Load model with auto-reload if file is updated
def load_model():
    """Return the Evo model, (re)loading it when the checkpoint file changes.

    The checkpoint at ``MODEL_PATH`` is loaded on the first call and again
    whenever its modification time is newer than the one recorded at the
    previous successful load. Otherwise the cached module-level ``model`` is
    returned as is.

    Returns:
        The cached ``EvoTransformerV22`` instance, switched to eval mode.

    Raises:
        OSError: If ``MODEL_PATH`` cannot be stat'ed (e.g. it does not exist).
        Exception: Whatever ``torch.load`` / ``load_state_dict`` raise for an
            unreadable or incompatible checkpoint. The previously cached
            model, if any, is left untouched in that case.
    """
    global model, last_mod_time
    current_mod_time = os.path.getmtime(MODEL_PATH)
    if model is None or current_mod_time > last_mod_time:
        # Build into a local first: if reading the checkpoint fails (for
        # example because the file is still being written), the module-level
        # cache must not be left pointing at a network with random weights.
        fresh_model = EvoTransformerV22()
        # NOTE(security): depending on the installed torch version,
        # torch.load may unpickle arbitrary objects. Only point MODEL_PATH
        # at checkpoints from a trusted source.
        state_dict = torch.load(MODEL_PATH, map_location="cpu")
        fresh_model.load_state_dict(state_dict)
        fresh_model.eval()
        # Publish the new model and its timestamp together, only on success.
        model = fresh_model
        last_mod_time = current_mod_time
        print("🔁 Evo model reloaded.")
    return model
# 🧠 Evo decision logic with confidence scores
def get_evo_response(query, options, user_context=""):
    """Score every candidate answer with the Evo model and pick the best one.

    Each option is combined with the query and the retrieved context into a
    single string, tokenized (truncated/padded to 128 tokens) and passed to
    the model; the sigmoid of the model output is used as that option's
    score.

    Args:
        query: The question text.
        options: Non-empty sequence of candidate answer strings. Any number
            of options is supported; on ties the earliest option wins.
        user_context: Optional extra context appended after the web search
            results.

    Returns:
        A 4-tuple ``(answer, confidence, trace, context)`` where ``answer``
        is the highest-scoring option, ``confidence`` is its score,
        ``trace`` is a human-readable ``"opt: score vs opt: score ..."``
        string and ``context`` is the context text that was fed to the model.

    Raises:
        ValueError: If ``options`` is empty.
    """
    if not options:
        raise ValueError("options must contain at least one candidate answer")

    evo_model = load_model()

    # Retrieve RAG context + optional user input.
    # NOTE(review): assumes web_search returns a list of strings - confirm.
    context_texts = web_search(query) + ([user_context] if user_context else [])
    context_str = "\n".join(context_texts)

    # Encode each option and compute its score.
    scores = []
    with torch.no_grad():
        for opt in options:
            pair = f"{query} [SEP] {opt} [CTX] {context_str}"
            encoded = tokenizer(
                pair,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=128,
            )
            logits = evo_model(encoded["input_ids"])
            # .item() requires the model to emit a single value per input.
            scores.append(torch.sigmoid(logits).item())

    # Argmax over all options. max() keeps the first maximum, so for two
    # options this matches the previous `int(scores[1] > scores[0])` rule.
    best_idx = max(range(len(scores)), key=scores.__getitem__)
    trace = " vs ".join(
        f"{opt}: {score:.3f}" for opt, score in zip(options, scores)
    )
    return (
        options[best_idx],  # 🔹 Selected answer
        scores[best_idx],   # 🔹 Confidence score
        trace,              # 🔹 Reasoning trace
        context_str,        # 🔹 Context used
    )
# 🤖 GPT-3.5 backup or comparison
def get_gpt_response(query, user_context=""):
    """Ask GPT-3.5 the same question, as a backup or for comparison with Evo.

    Args:
        query: The user's question.
        user_context: Optional context; when non-empty it is appended to the
            prompt under a ``Context:`` heading.

    Returns:
        The stripped text of the first completion choice (an empty string if
        the API returned no text), or a string starting with
        ``"⚠️ GPT error:"`` describing the failure. This function never
        raises for ordinary errors: failures are reported in the return
        value so they can be shown directly to the user.
    """
    try:
        context_block = f"\n\nContext:\n{user_context}" if user_context else ""
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": query + context_block}],
            temperature=0.7,
        )
        # The message content is optional in the API response; treat a
        # missing body as empty text instead of failing on None.strip().
        content = response.choices[0].message.content
        return (content or "").strip()
    except Exception as e:
        # Deliberate best-effort boundary: report the failure as text.
        return f"⚠️ GPT error:\n\n{e}"
# ✅ Final callable interface
def infer(query, options, user_context=""):
    """Public entry point: forward the arguments to ``get_evo_response``.

    Args:
        query: The question text.
        options: Candidate answers to choose between.
        user_context: Optional extra context for the model.

    Returns:
        Whatever ``get_evo_response`` returns for the same arguments.
    """
    evo_result = get_evo_response(query, options, user_context)
    return evo_result
# 🧠 Unified chat-style interface for EvoRAG
def evo_chat_predict(history, query, options):
    """Answer ``query`` using recent chat history as additional context.

    Args:
        history: Sequence of earlier chat messages; may be empty or ``None``.
            The entries are joined with newlines, so they must be strings.
        query: The current question.
        options: Candidate answers to choose between.

    Returns:
        A dict with the keys ``answer``, ``confidence`` (rounded to three
        decimals), ``reasoning`` and ``context_used``.
    """
    # Only the six most recent entries (up to 3 exchange pairs) are used.
    if history:
        recent_turns = history[-6:]
        context = "\n".join(recent_turns)
    else:
        context = ""

    answer, score, reasoning, used_context = get_evo_response(
        query, options, context
    )

    result = {}
    result["answer"] = answer
    result["confidence"] = round(score, 3)
    result["reasoning"] = reasoning
    result["context_used"] = used_context
    return result