Priyanshukr-1 committed
Commit 8a98314 · verified · 1 Parent(s): 6c97f34

Update app.py

Files changed (1): app.py +34 -17
app.py CHANGED
@@ -7,6 +7,20 @@ import psutil
 import multiprocessing
 import time
 import tiktoken  # For estimating token count
+import logging   # Import the logging module
+
+# === Configure Logging ===
+# Get a module-level logger
+logger = logging.getLogger(__name__)
+# Set the logging level (e.g., INFO, DEBUG, WARNING, ERROR, CRITICAL)
+logger.setLevel(logging.INFO)
+# Create a console handler and set its format
+handler = logging.StreamHandler()
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+handler.setFormatter(formatter)
+# Add the handler to the logger if it's not already added
+if not logger.handlers:
+    logger.addHandler(handler)
 
 app = FastAPI()
 
@@ -18,7 +32,7 @@ MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
 
 # === Download if model not available ===
 if not os.path.exists(MODEL_PATH):
-    print(f"⬇️ Downloading {FILENAME} from Hugging Face...")
+    logger.info(f"⬇️ Downloading {FILENAME} from Hugging Face...")
     try:
         model_path = hf_hub_download(
             repo_id=REPO_ID,
@@ -27,13 +41,13 @@ if not os.path.exists(MODEL_PATH):
             local_dir=MODEL_DIR,
             local_dir_use_symlinks=False
         )
-        print(f"✅ Model downloaded to: {model_path}")
+        logger.info(f"✅ Model downloaded to: {model_path}")
     except Exception as e:
-        print(f"❌ Error downloading model: {e}")
+        logger.error(f"❌ Error downloading model: {e}")
         # Exit or handle error appropriately if model download fails
         exit(1)
 else:
-    print(f"✅ Model already available at: {MODEL_PATH}")
+    logger.info(f"✅ Model already available at: {MODEL_PATH}")
     model_path = MODEL_PATH
 
 # === Optimal thread usage ===
@@ -41,8 +55,8 @@ logical_cores = psutil.cpu_count(logical=True)
 physical_cores = psutil.cpu_count(logical=False)
 recommended_threads = max(1, physical_cores)  # Ensure at least 1 thread
 
-print(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
-print(f"Using n_threads: {recommended_threads}")
+logger.info(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
+logger.info(f"Using n_threads: {recommended_threads}")
 
 # === Load the model ===
 try:
@@ -53,18 +67,18 @@ try:
         use_mlock=True,        # Lock model in RAM for faster access
         n_gpu_layers=0,        # CPU only
         chat_format="chatml",  # TinyLlama Chat uses ChatML format
-        verbose=False
+        verbose=False          # Keep llama.cpp's internal verbose logging off
     )
-    print("🚀 Llama model loaded successfully!")
+    logger.info("🚀 Llama model loaded successfully!")
 except Exception as e:
-    print(f"❌ Error loading Llama model: {e}")
+    logger.error(f"❌ Error loading Llama model: {e}")
     exit(1)
 
 # Initialize tiktoken encoder for token counting
 try:
     encoding = tiktoken.get_encoding("cl100k_base")
 except Exception:
-    print("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
+    logger.warning("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
     encoding = None
 
 def count_tokens_in_text(text):
@@ -75,14 +89,15 @@ def count_tokens_in_text(text):
     # Fallback for when tiktoken isn't available or for simple estimation
     return len(text) // 4  # Rough estimate: 1 token ~ 4 characters
 
-
 @app.get("/")
 def root():
+    logger.info("Root endpoint accessed.")
     return {"message": "✅ Data Analysis AI API is live and optimized for speed (no context retention)!"}
 
 @app.get("/get_sys")
 def get_sys_specs():
     """Returns system specifications including CPU, RAM, and OS details."""
+    logger.info("System specs endpoint accessed.")
     memory = psutil.virtual_memory()
     return {
         "CPU": {
@@ -112,6 +127,7 @@ def get_sys_specs():
 @app.get("/process_list")
 def process_list():
     """Returns a list of processes consuming significant CPU."""
+    logger.info("Process list endpoint accessed.")
     time.sleep(1)  # Let CPU settle for accurate measurement
     processes = []
     for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
@@ -138,10 +154,12 @@ async def generate(request: Request):
     Generates a response from the LLM without retaining chat context.
     Expects a JSON body with 'prompt'.
     """
+    logger.info("➡️ /generate endpoint received a request.")  # Log at the very beginning
    data = await request.json()
     prompt = data.get("prompt", "").strip()
 
     if not prompt:
+        logger.warning("Prompt cannot be empty in /generate request.")
         return {"error": "Prompt cannot be empty"}, 400
 
     # Define the system prompt - sent with every request
@@ -160,8 +178,8 @@ async def generate(request: Request):
     # Calculate tokens in the user's prompt
     prompt_tokens = count_tokens_in_text(prompt)
 
-    print(f"🧾 Prompt received: {prompt}")
-    print(f"Tokens in prompt: {prompt_tokens}")
+    logger.info(f"🧾 Prompt received: {prompt}")
+    logger.info(f"Tokens in prompt: {prompt_tokens}")
 
     try:
         response = llm.create_chat_completion(
@@ -170,13 +188,12 @@
             temperature=0.7,  # Adjust temperature for creativity vs. coherence (0.0-1.0)
             stop=["</s>"]     # Stop sequence for TinyLlama Chat
         )
-
         ai_response_content = response["choices"][0]["message"]["content"].strip()
-
+        logger.info("✅ Response generated successfully.")
         return {
             "response": ai_response_content,
             "prompt_tokens": prompt_tokens  # Return tokens in the prompt
         }
     except Exception as e:
-        print(f"❌ Error during generation: {e}")
-        return {"error": f"Failed to generate response: {e}. Please try again."}, 500
+        logger.error(f"❌ Error during generation: {e}", exc_info=True)  # Log exception details
+        return {"error": f"Failed to generate response: {e}. Please try again."}, 500
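For reference, a minimal client sketch for exercising the updated /generate endpoint. It assumes the app is served locally (for example via uvicorn app:app on port 8000) and that the requests library is installed; neither the host/port nor the client library is part of this commit.

import requests  # third-party HTTP client (assumed available)

# Hypothetical local address; adjust to wherever the app is actually served.
url = "http://localhost:8000/generate"

payload = {"prompt": "Summarize the key trends in this quarter's sales data."}
resp = requests.post(url, json=payload)
data = resp.json()

print(data.get("response"))       # generated text from the model
print(data.get("prompt_tokens"))  # prompt token estimate returned by the API

With the new logging in place, such a request should also emit the logger.info lines added above (the "/generate endpoint received a request." message, the prompt, and its token count) on the server console.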