Update inference.py
inference.py  CHANGED  (+33 -47)
@@ -5,11 +5,10 @@ from transformers import AutoTokenizer
 from evo_model import EvoTransformerV22
 from search_utils import web_search
 import openai
-import time
 import psutil
 import platform

-# Load OpenAI API Key
+# Load OpenAI API Key
 openai.api_key = os.getenv("OPENAI_API_KEY")

 # Constants
@@ -18,7 +17,7 @@ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
 model = None
 last_mod_time = 0

-#
+# Load or reload Evo model
 def load_model():
     global model, last_mod_time
     try:
@@ -28,86 +27,73 @@ def load_model():
         model.load_state_dict(torch.load(MODEL_PATH, map_location="cpu"))
         model.eval()
         last_mod_time = current_mod_time
-        print("
+        print("✅ Evo model loaded.")
     except Exception as e:
-        print(f"❌ Error loading
+        print(f"❌ Error loading model: {e}")
         model = None
     return model

-# Evo
-def
+# Evo response engine
+def evo_infer(question, option1, option2, user_context=""):
     model = load_model()
     if model is None:
-        return "
+        return "Evo failed", 0.0, "Model not loaded", ""

-    #
-
-
-    input_pairs = [f"{query} [SEP] {opt} [CTX] {context_str}" for opt in options]
+    # Retrieve context
+    context_blobs = web_search(question) + ([user_context] if user_context else [])
+    context = "\n".join(context_blobs)

-    #
+    # Format input pairs
+    inputs = [f"{question} [SEP] {opt} [CTX] {context}" for opt in [option1, option2]]
     scores = []
-
+
+    for pair in inputs:
         encoded = tokenizer(pair, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
         with torch.no_grad():
             logits = model(encoded["input_ids"])
         score = torch.sigmoid(logits).item()
         scores.append(score)

-
-
-
-
-        f"{options[0]}: {scores[0]:.3f} vs {options[1]}: {scores[1]:.3f}",  # Reasoning trace
-        context_str  # Used context
-    )
+    best = option2 if scores[1] > scores[0] else option1
+    trace = f"{option1}: {scores[0]:.3f} vs {option2}: {scores[1]:.3f}"
+
+    return best, max(scores), trace, context

-# GPT
-def
+# GPT backup
+def gpt_infer(question, user_context=""):
     try:
-
+        block = f"\n\nContext:\n{user_context}" if user_context else ""
         response = openai.chat.completions.create(
             model="gpt-3.5-turbo",
-            messages=[{"role": "user", "content":
+            messages=[{"role": "user", "content": question + block}],
             temperature=0.7,
         )
         return response.choices[0].message.content.strip()
     except Exception as e:
-        return f"⚠️ GPT error
-
-# For EvoRAG app UI
-def evo_chat_predict(history, query, options):
-    context = "\n".join(history[-6:]) if history else ""
-    evo_ans, evo_score, evo_reason, evo_ctx = get_evo_response(query, options, context)
-    return {
-        "answer": evo_ans,
-        "confidence": round(evo_score, 3),
-        "reasoning": evo_reason,
-        "context_used": evo_ctx
-    }
+        return f"⚠️ GPT error: {str(e)}"

-# Evo architecture
+# Evo architecture info
 def get_model_config():
     return {
         "num_layers": 6,
         "num_heads": 8,
         "ffn_dim": 1024,
         "memory_enabled": True,
-        "
+        "total_params": sum(p.numel() for p in model.parameters()) if model else "N/A"
     }

-#
+# System stats
 def get_system_stats():
-
-
+    gpu = torch.cuda.get_device_properties(0) if torch.cuda.is_available() else None
+    mem = psutil.virtual_memory()

     return {
         "device": "GPU" if torch.cuda.is_available() else "CPU",
         "cpu_usage_percent": psutil.cpu_percent(),
-        "memory_used_gb": round(
-        "memory_total_gb": round(
-        "gpu_name":
-        "gpu_memory_total_gb": round(
-        "gpu_memory_used_gb": round(torch.cuda.memory_allocated() / (1024 ** 3), 2) if
+        "memory_used_gb": round(mem.used / (1024 ** 3), 2),
+        "memory_total_gb": round(mem.total / (1024 ** 3), 2),
+        "gpu_name": gpu.name if gpu else "N/A",
+        "gpu_memory_total_gb": round(gpu.total_memory / (1024 ** 3), 2) if gpu else "N/A",
+        "gpu_memory_used_gb": round(torch.cuda.memory_allocated() / (1024 ** 3), 2) if gpu else "N/A",
         "platform": platform.platform()
     }