Spaces:
Paused
Paused
Commit
·
4b8d66c
1
Parent(s):
24eb0d4
Update app_v4.py
Browse files
app_v4.py
CHANGED
|
@@ -33,15 +33,20 @@ if device == "cuda:0":
|
|
| 33 |
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
|
| 34 |
|
| 35 |
# Attempt to load the model, catch any OOM errors
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
pretrained_model_dir,
|
| 40 |
model_basename="Jackson2-4bit-128g-GPTQ",
|
| 41 |
use_safetensors=True,
|
| 42 |
device=device
|
| 43 |
)
|
| 44 |
model.eval() # Set the model to inference mode
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
model_loaded = True
|
| 46 |
except RuntimeError as e:
|
| 47 |
if 'CUDA out of memory' in str(e):
|
|
|
|
| 33 |
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
|
| 34 |
|
| 35 |
# Attempt to load the model, catch any OOM errors
|
| 36 |
+
# st.cache_resource (not st.cache_data) is used because the return value is a
# model object: cache_data pickles and copies its return value on every call,
# whereas cache_resource keeps a single shared instance across reruns.
@st.cache_resource
def load_gptq_model():
    """Load the GPTQ-quantized model and put it in inference mode.

    Reads the module-level ``pretrained_model_dir`` and ``device`` names.

    Returns:
        The object returned by ``AutoGPTQForCausalLM.from_quantized`` after
        ``eval()`` has been called on it.

    Any exception raised while loading is not handled here and propagates
    to the caller.
    """
    # The loaded model must be bound to a local name; otherwise the
    # ``model.eval()`` / ``return model`` lines below would look up an
    # undefined ``model`` and fail with NameError.
    model = AutoGPTQForCausalLM.from_quantized(
        pretrained_model_dir,
        model_basename="Jackson2-4bit-128g-GPTQ",
        use_safetensors=True,
        device=device,
    )
    model.eval()  # Set the model to inference mode
    return model
|
| 46 |
+
|
| 47 |
+
model_loaded = False
|
| 48 |
+
try:
|
| 49 |
+
model = load_gptq_model()
|
| 50 |
model_loaded = True
|
| 51 |
except RuntimeError as e:
|
| 52 |
if 'CUDA out of memory' in str(e):
|