Update app.py
Browse files
app.py
CHANGED
@@ -10,8 +10,8 @@ import time
|
|
10 |
app = FastAPI()
|
11 |
|
12 |
# === Model Config ===
|
13 |
-
REPO_ID = "TheBloke/
|
14 |
-
FILENAME = "
|
15 |
MODEL_DIR = "models"
|
16 |
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
|
17 |
|
@@ -39,7 +39,7 @@ llm = Llama(
|
|
39 |
model_path=model_path,
|
40 |
n_ctx=8192, # Can increase depending on memory
|
41 |
n_threads=recommended_threads,
|
42 |
-
n_batch=
|
43 |
use_mlock=True, # lock model in RAM for faster access
|
44 |
n_gpu_layers=0, # CPU only, use >0 if GPU is present
|
45 |
chat_format="chatml", # for Hermes 2
|
|
|
10 |
app = FastAPI()
|
11 |
|
12 |
# === Model Config ===
|
13 |
+
REPO_ID = "TheBloke/phi-2-GGUF"
|
14 |
+
FILENAME = "phi-2.Q4_K_M.gguf"
|
15 |
MODEL_DIR = "models"
|
16 |
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
|
17 |
|
|
|
39 |
model_path=model_path,
|
40 |
n_ctx=8192, # context window in tokens (bounded by memory); NOTE(review): phi-2 was trained with a 2048-token context, confirm 8192 is intended
|
41 |
n_threads=recommended_threads,
|
42 |
+
n_batch=32, # adjust depending on RAM
|
43 |
use_mlock=True, # lock model in RAM for faster access
|
44 |
n_gpu_layers=0, # CPU only, use >0 if GPU is present
|
45 |
chat_format="chatml", # ChatML template kept from the Hermes 2 setup; NOTE(review): confirm it suits phi-2
|