Update app.py
app.py CHANGED
@@ -7,6 +7,20 @@ import psutil
 import multiprocessing
 import time
 import tiktoken # For estimating token count
+import logging # Import the logging module
+
+# === Configure Logging ===
+# Get the root logger
+logger = logging.getLogger(__name__)
+# Set the logging level (e.g., INFO, DEBUG, WARNING, ERROR, CRITICAL)
+logger.setLevel(logging.INFO)
+# Create a console handler and set its format
+handler = logging.StreamHandler()
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+handler.setFormatter(formatter)
+# Add the handler to the logger if it's not already added
+if not logger.handlers:
+    logger.addHandler(handler)
 
 app = FastAPI()
 
@@ -18,7 +32,7 @@ MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
 
 # === Download if model not available ===
 if not os.path.exists(MODEL_PATH):
-
+    logger.info(f"⬇️ Downloading {FILENAME} from Hugging Face...")
     try:
         model_path = hf_hub_download(
             repo_id=REPO_ID,
@@ -27,13 +41,13 @@ if not os.path.exists(MODEL_PATH):
             local_dir=MODEL_DIR,
             local_dir_use_symlinks=False
         )
-
+        logger.info(f"✅ Model downloaded to: {model_path}")
     except Exception as e:
-
+        logger.error(f"❌ Error downloading model: {e}")
         # Exit or handle error appropriately if model download fails
         exit(1)
 else:
-
+    logger.info(f"✅ Model already available at: {MODEL_PATH}")
     model_path = MODEL_PATH
 
 # === Optimal thread usage ===
@@ -41,8 +55,8 @@ logical_cores = psutil.cpu_count(logical=True)
 physical_cores = psutil.cpu_count(logical=False)
 recommended_threads = max(1, physical_cores) # Ensure at least 1 thread
 
-
-
+logger.info(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
+logger.info(f"Using n_threads: {recommended_threads}")
 
 # === Load the model ===
 try:
@@ -53,18 +67,18 @@ try:
         use_mlock=True, # Lock model in RAM for faster access
         n_gpu_layers=0, # CPU only
         chat_format="chatml", # TinyLlama Chat uses ChatML format
-        verbose=False
+        verbose=False # Keep llama.cpp's internal verbose logging off
     )
-
+    logger.info("✅ Llama model loaded successfully!")
 except Exception as e:
-
+    logger.error(f"❌ Error loading Llama model: {e}")
     exit(1)
 
 # Initialize tiktoken encoder for token counting
 try:
     encoding = tiktoken.get_encoding("cl100k_base")
 except Exception:
-
+    logger.warning("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
     encoding = None
 
 def count_tokens_in_text(text):
@@ -75,14 +89,15 @@ def count_tokens_in_text(text):
     # Fallback for when tiktoken isn't available or for simple estimation
     return len(text) // 4 # Rough estimate: 1 token ~ 4 characters
 
-
 @app.get("/")
 def root():
+    logger.info("Root endpoint accessed.")
     return {"message": "✅ Data Analysis AI API is live and optimized for speed (no context retention)!"}
 
 @app.get("/get_sys")
 def get_sys_specs():
     """Returns system specifications including CPU, RAM, and OS details."""
+    logger.info("System specs endpoint accessed.")
     memory = psutil.virtual_memory()
     return {
         "CPU": {
@@ -112,6 +127,7 @@ def get_sys_specs():
 @app.get("/process_list")
 def process_list():
     """Returns a list of processes consuming significant CPU."""
+    logger.info("Process list endpoint accessed.")
     time.sleep(1) # Let CPU settle for accurate measurement
     processes = []
     for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
@@ -138,10 +154,12 @@ async def generate(request: Request):
     Generates a response from the LLM without retaining chat context.
     Expects a JSON body with 'prompt'.
     """
+    logger.info("➡️ /generate endpoint received a request.") # Log at the very beginning
    data = await request.json()
     prompt = data.get("prompt", "").strip()
 
     if not prompt:
+        logger.warning("Prompt cannot be empty in /generate request.")
         return {"error": "Prompt cannot be empty"}, 400
 
     # Define the system prompt - sent with every request
@@ -160,8 +178,8 @@ async def generate(request: Request):
     # Calculate tokens in the user's prompt
     prompt_tokens = count_tokens_in_text(prompt)
 
-
-
+    logger.info(f"🧾 Prompt received: {prompt}")
+    logger.info(f"Tokens in prompt: {prompt_tokens}")
 
     try:
         response = llm.create_chat_completion(
@@ -170,13 +188,12 @@ async def generate(request: Request):
             temperature=0.7, # Adjust temperature for creativity vs. coherence (0.0-1.0)
             stop=["</s>"] # Stop sequence for TinyLlama Chat
         )
-
         ai_response_content = response["choices"][0]["message"]["content"].strip()
-
+        logger.info("✅ Response generated successfully.")
        return {
             "response": ai_response_content,
             "prompt_tokens": prompt_tokens # Return tokens in the prompt
         }
     except Exception as e:
-
-        return {"error": f"Failed to generate response: {e}. Please try again."}, 500
+        logger.error(f"❌ Error during generation: {e}", exc_info=True) # Log exception details
+        return {"error": f"Failed to generate response: {e}. Please try again."}, 500
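The commit wires a StreamHandler and formatter onto the module logger by hand and guards the attachment with "if not logger.handlers:", so repeated imports (for example under uvicorn's auto-reload) do not add duplicate handlers and double-print every record. For comparison only, the same console output could be produced with the standard-library shortcut below; this is a sketch of an alternative, not what the commit does:

import logging

# Sketch of an equivalent, more compact setup using logging.basicConfig
# (the commit itself attaches a StreamHandler manually, as shown in the diff above).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
logger.info("Logging configured.")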
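A minimal sketch of exercising the updated endpoints once this app.py is running, to see the new log lines appear on the server console. The module name app, the port 7860, and the requests client library are assumptions for illustration, not part of the commit:

# Start the API first (assumed invocation):
#   uvicorn app:app --host 0.0.0.0 --port 7860
import requests  # assumed client-side dependency; not used by app.py itself

BASE_URL = "http://localhost:7860"  # assumption: local run on port 7860

# A normal request; per the diff, the server should log the request arrival,
# the prompt, its token count, and "✅ Response generated successfully."
resp = requests.post(
    f"{BASE_URL}/generate",
    json={"prompt": "Summarize the main trends in this quarter's sales data."},
)
print(resp.json())

# An empty prompt exercises the new logger.warning path and the error payload.
print(requests.post(f"{BASE_URL}/generate", json={"prompt": ""}).json())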