Update app.py
app.py CHANGED
@@ -16,13 +16,13 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import torch
 import gradio as gr
 
-torch.backends.cuda.matmul.allow_tf32 =
-torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction =
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction =
-torch.backends.cudnn.allow_tf32 =
-torch.backends.cudnn.deterministic =
-torch.backends.cudnn.benchmark =
-torch.set_float32_matmul_precision("
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
+torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
+torch.backends.cudnn.allow_tf32 = True
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = True
+torch.set_float32_matmul_precision("high")
 
 # --- Model and Tokenizer Configuration ---
 model_name = "FelixChao/vicuna-33b-coder"
@@ -35,7 +35,7 @@ quantization_config_4bit = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
     bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.
+    bnb_4bit_compute_dtype=torch.float16
 )
 
 print(f"Loading model: {model_name} with quantization")
@@ -119,7 +119,7 @@ def generate_code(prompt: str) -> str:
     with torch.no_grad():
         generated_ids = model.generate(
             **model_inputs,  # Pass tokenized inputs
-            max_new_tokens=
+            max_new_tokens=768,
             min_new_tokens=256,
             do_sample=True,
             temperature=0.7,