Commit 140e826 by 1inkusFace (verified) · 1 Parent(s): 72f8b83

Update app.py

Files changed (1): app.py (+9 -9)
app.py CHANGED

@@ -16,13 +16,13 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import torch
 import gradio as gr
 
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
-torch.backends.cudnn.allow_tf32 = False
-torch.backends.cudnn.deterministic = False
-torch.backends.cudnn.benchmark = False
-torch.set_float32_matmul_precision("highest")
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
+torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
+torch.backends.cudnn.allow_tf32 = True
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = True
+torch.set_float32_matmul_precision("high")
 
 # --- Model and Tokenizer Configuration ---
 model_name = "FelixChao/vicuna-33b-coder"
@@ -35,7 +35,7 @@ quantization_config_4bit = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
     bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16
+    bnb_4bit_compute_dtype=torch.float16
 )
 
 print(f"Loading model: {model_name} with quantization")
@@ -119,7 +119,7 @@ def generate_code(prompt: str) -> str:
     with torch.no_grad():
         generated_ids = model.generate(
             **model_inputs,  # Pass tokenized inputs
-            max_new_tokens=1024,
+            max_new_tokens=768,
             min_new_tokens=256,
             do_sample=True,
             temperature=0.7,
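
For reference, below is a minimal sketch of how the affected parts of app.py read after this commit. It is reconstructed from the "+" lines above; the model/tokenizer loading, the Gradio UI, and the parts of generate_code outside the generate() call are assumptions, not taken from the diff.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Backend/precision flags as set by this commit: TF32 and reduced-precision
# reductions enabled, cuDNN deterministic + benchmark mode, "high" matmul precision.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("high")

model_name = "FelixChao/vicuna-33b-coder"

# 4-bit NF4 quantization; compute dtype switched from bfloat16 to float16.
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Assumed loading code (not part of the diff shown above).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config_4bit,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def generate_code(prompt: str) -> str:
    # Tokenization and decoding here are assumed; only the generate() kwargs
    # reflect the diff (max_new_tokens lowered from 1024 to 768).
    model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=768,
            min_new_tokens=256,
            do_sample=True,
            temperature=0.7,
        )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)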