Update app.py
app.py CHANGED

@@ -33,14 +33,11 @@ huggingface_hub.login(token=LLama)
 MODEL_ID = "meta-llama/Llama-2-7b-hf"
 tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 
-#
-use_flash_attention = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
-attn_implementation = "flash_attention_2" if use_flash_attention else "eager"  # Default to eager if no compatible GPU
+# Load model with default attention mechanism (no Flash Attention)
 model = LlamaForCausalLM.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.bfloat16,
     device_map="auto",
-    attn_implementation=attn_implementation,
     load_in_8bit=True
 )
 
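For context, the removed lines gated FlashAttention-2 on GPU compute capability alone. A minimal sketch of how that gate could be restored with a safer fallback when the flash-attn package itself is missing; the importlib.util.find_spec check and the combined condition are assumptions added here, not part of this commit:

import importlib.util

import torch
from transformers import LlamaForCausalLM

MODEL_ID = "meta-llama/Llama-2-7b-hf"

# FlashAttention-2 needs an Ampere-or-newer GPU (compute capability >= 8)
# and the flash-attn package; otherwise fall back to eager attention.
has_capable_gpu = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
has_flash_attn = importlib.util.find_spec("flash_attn") is not None  # assumption: explicit package check
attn_implementation = "flash_attention_2" if (has_capable_gpu and has_flash_attn) else "eager"

model = LlamaForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation=attn_implementation,
    load_in_8bit=True,  # kept as in the original app; newer transformers versions prefer BitsAndBytesConfig
)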