Priyanshukr-1 committed on
Commit a49d7b2 · verified · 1 Parent(s): 048628f

Update app.py

Files changed (1)
  1. app.py +48 -66
app.py CHANGED
@@ -2,25 +2,22 @@ from fastapi import FastAPI, Request
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
-
 import platform
 import psutil
 import multiprocessing
 import time

-
-
 app = FastAPI()

 # === Model Config ===
-REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
-FILENAME = "mistral-7b-instruct-v0.1.Q4_K_M.gguf"
+REPO_ID = "TheBloke/Hermes-2-Pro-Mistral-7B-GGUF"
+FILENAME = "hermes-2-pro-mistral-7b.Q4_K_M.gguf"
 MODEL_DIR = "models"
 MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

-# === Download only if not already present ===
+# === Download if model not available ===
 if not os.path.exists(MODEL_PATH):
-    print(f"Downloading model {FILENAME} from Hugging Face...")
+    print(f"⬇️ Downloading {FILENAME} from Hugging Face...")
     model_path = hf_hub_download(
         repo_id=REPO_ID,
         filename=FILENAME,
@@ -29,101 +26,86 @@ if not os.path.exists(MODEL_PATH):
         local_dir_use_symlinks=False
     )
 else:
-    print(f"Model already exists at: {MODEL_PATH}")
+    print(f"Model already available at: {MODEL_PATH}")
     model_path = MODEL_PATH

-# === Load LLM ===
+# === Optimal thread usage ===
+logical_cores = psutil.cpu_count(logical=True)
+physical_cores = psutil.cpu_count(logical=False)
+recommended_threads = min(physical_cores or 4, 8)
+
+# === Load the model ===
 llm = Llama(
     model_path=model_path,
-    n_ctx=1024,
-    n_threads=8,  # Adjust for your CPU
-    n_batch=32
+    n_ctx=8192,            # Can increase depending on memory
+    n_threads=recommended_threads,
+    n_batch=64,            # adjust depending on RAM
+    use_mlock=True,        # lock model in RAM for faster access
+    n_gpu_layers=0,        # CPU only, use >0 if GPU is present
+    chat_format="chatml",  # for Hermes 2
+    verbose=False
 )

 @app.get("/")
 def root():
-    return {"message": "Mistral API is live!"}
+    return {"message": "✅ Hermes 2 Mistral API is live and optimized!"}

 @app.get("/get_sys")
 def get_sys_specs():
-    cpu_info = {
-        "physical_cores": psutil.cpu_count(logical=False),
-        "logical_cores": psutil.cpu_count(logical=True),
-        "max_frequency_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else None,
-        "cpu_usage_percent": psutil.cpu_percent(interval=1)
-    }
-
     memory = psutil.virtual_memory()
-    ram_info = {
-        "total_gb": round(memory.total / (1024 ** 3), 2),
-        "available_gb": round(memory.available / (1024 ** 3), 2),
-        "used_percent": memory.percent
-    }
-
-    system_info = {
-        "system": platform.system(),
-        "machine": platform.machine(),
-        "platform": platform.platform(),
-        "processor": platform.processor(),
-        "python_version": platform.python_version(),
-    }
-
     return {
-        "cpu": cpu_info,
-        "ram": ram_info,
-        "system": system_info,
-        "recommended_threads": min(psutil.cpu_count(logical=False) or 2, 8)
+        "CPU": {
+            "physical_cores": physical_cores,
+            "logical_cores": logical_cores,
+            "max_freq_mhz": psutil.cpu_freq().max,
+            "cpu_usage": psutil.cpu_percent(interval=1)
+        },
+        "RAM": {
+            "total_GB": round(memory.total / (1024 ** 3), 2),
+            "available_GB": round(memory.available / (1024 ** 3), 2),
+            "usage_percent": memory.percent
+        },
+        "System": {
+            "platform": platform.platform(),
+            "architecture": platform.machine(),
+            "python": platform.python_version()
+        }
     }

 @app.get("/process_list")
 def process_list():
-    p_l = []
-
-    # Warm up CPU usage stats
-    for proc in psutil.process_iter():
-        try:
-            proc.cpu_percent(interval=None)  # Prime the value
-        except (psutil.NoSuchProcess, psutil.AccessDenied):
-            pass
-
-    time.sleep(1)  # Let CPU usage accumulate over time
-
+    time.sleep(1)  # Let CPU settle
+    processes = []
     for proc in psutil.process_iter(['pid', 'name']):
         try:
-            usage = proc.cpu_percent(interval=None)
-            if usage > 10:
-                p_l.append({
+            cpu = proc.cpu_percent()
+            if cpu > 10:
+                processes.append({
                     "pid": proc.pid,
                     "name": proc.name(),
-                    "cpu_percent": usage
+                    "cpu_percent": cpu
                 })
         except (psutil.NoSuchProcess, psutil.AccessDenied):
             pass
-
-    return {
-        "process list": p_l
-    }
-
+    return {"heavy_processes": processes}

 @app.post("/generate")
 async def generate(request: Request):
     data = await request.json()
-    prompt = data.get("prompt", "")
-
-    print("🧾 Received prompt:", prompt)
+    prompt = data.get("prompt", "").strip()
+    print("🧾 Prompt received:", prompt)

     response = llm.create_chat_completion(
         messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "system", "content": "You are a helpful AI assistant."},
             {"role": "user", "content": prompt}
         ],
-        max_tokens=1024,
+        max_tokens=2048,
         temperature=0.7,
+        stop=["</s>"]
     )

-    print("📤 Raw model response:", response)
-
-    llm.reset()
+    llm.reset()  # Free memory after response

     return {
         "response": response["choices"][0]["message"]["content"].strip()