Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -16,13 +16,13 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import torch
 import gradio as gr

-torch.backends.cuda.matmul.allow_tf32 =
-torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction =
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction =
-torch.backends.cudnn.allow_tf32 =
-torch.backends.cudnn.deterministic =
-torch.backends.cudnn.benchmark =
-torch.set_float32_matmul_precision("
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
+torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
+torch.backends.cudnn.allow_tf32 = True
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = True
+torch.set_float32_matmul_precision("high")

 # --- Model and Tokenizer Configuration ---
 model_name = "FelixChao/vicuna-33b-coder"
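These flags opt into TF32 and reduced-precision matmul reductions, trading a little float32 accuracy for throughput on Ampere-class GPUs. One caveat worth noting: cudnn.deterministic = True and cudnn.benchmark = True are usually not combined, since benchmarking autotunes convolution algorithms per input shape while the deterministic flag restricts the choice to reproducible ones. A minimal standalone sketch with the same values as the added lines (the timing harness is illustrative, not from the app):

import time
import torch

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_float32_matmul_precision("high")  # allow TF32 for float32 matmuls

if torch.cuda.is_available():
    a = torch.randn(4096, 4096, device="cuda")
    b = torch.randn(4096, 4096, device="cuda")
    torch.cuda.synchronize()
    start = time.perf_counter()
    c = a @ b  # executed as TF32 under the settings above
    torch.cuda.synchronize()
    print(f"matmul took {time.perf_counter() - start:.4f}s")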
@@ -35,7 +35,7 @@ quantization_config_4bit = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
     bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.
+    bnb_4bit_compute_dtype=torch.float16
 )

 print(f"Loading model: {model_name} with quantization")
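The compute dtype now pinned to torch.float16 is the precision bitsandbytes dequantizes the NF4 weights into for each forward pass; float16 is the usual choice on GPUs without fast bfloat16. A hedged sketch of how this config plugs into model loading (device_map="auto" is an assumption here, not taken from the diff):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "FelixChao/vicuna-33b-coder"

quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,                     # store weights as 4-bit NF4
    bnb_4bit_use_double_quant=True,        # quantize the quantization constants too
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # dequantize to fp16 for matmuls
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config_4bit,
    device_map="auto",  # assumption: place layers across available devices
)
tokenizer = AutoTokenizer.from_pretrained(model_name)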
@@ -119,7 +119,7 @@ def generate_code(prompt: str) -> str:
     with torch.no_grad():
         generated_ids = model.generate(
             **model_inputs,  # Pass tokenized inputs
-            max_new_tokens=
+            max_new_tokens=768,
             min_new_tokens=256,
             do_sample=True,
             temperature=0.7,
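The new max_new_tokens=768 caps generation length, while min_new_tokens=256 keeps completions from stopping early; together with do_sample=True and temperature=0.7 this yields moderately varied answers of bounded size. A sketch of the surrounding call under those settings (the prompt and the pad_token_id handling are illustrative assumptions):

prompt = "Write a Python function that reverses a string."
model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=768,   # hard cap on newly generated tokens
        min_new_tokens=256,   # force at least this many before EOS counts
        do_sample=True,       # sample from the distribution
        temperature=0.7,      # soften logits for some diversity
        pad_token_id=tokenizer.eos_token_id,  # assumption: silence pad warnings
    )

# Decode only the newly generated portion, skipping the prompt tokens
print(tokenizer.decode(generated_ids[0][model_inputs["input_ids"].shape[1]:], skip_special_tokens=True))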