added flash_attention

- app.py +3 -1
- requirements.txt +1 -0
app.py
@@ -18,7 +18,9 @@ processor = AutoProcessor.from_pretrained(model_id)
 model = LlavaForConditionalGeneration.from_pretrained(
     model_id,
     quantization_config=quantization_config,
-    device_map="auto"
+    device_map="auto",
+    use_flash_attention_2=True,
+    low_cpu_mem_usage=True
 )
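For context, the load call after this change reads roughly as in the minimal sketch below. The model_id and quantization_config values are assumptions (they are defined earlier in app.py and not shown in this diff). Note also that newer transformers releases deprecate use_flash_attention_2=True in favor of attn_implementation="flash_attention_2"; either way, the flash-attn package must be installed for the fast kernels to actually be used.

    from transformers import AutoProcessor, BitsAndBytesConfig, LlavaForConditionalGeneration

    model_id = "llava-hf/llava-1.5-7b-hf"  # assumption: stands in for the checkpoint used in app.py
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)  # assumption: illustrative quantization config

    processor = AutoProcessor.from_pretrained(model_id)
    model = LlavaForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        device_map="auto",           # place/offload weights across available devices
        use_flash_attention_2=True,  # switch attention layers to Flash Attention 2 kernels
        low_cpu_mem_usage=True,      # stream weights in instead of building a full CPU copy first
    )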
requirements.txt
@@ -150,3 +150,4 @@ webencodings==0.5.1
 websocket-client==1.7.0
 websockets==11.0.3
 widgetsnbextension==4.0.9
+sentence_transformers
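The new sentence_transformers dependency is not exercised anywhere in this diff; a minimal usage sketch, assuming the common all-MiniLM-L6-v2 checkpoint (an illustrative choice, not taken from app.py):

    from sentence_transformers import SentenceTransformer

    embedder = SentenceTransformer("all-MiniLM-L6-v2")  # assumption: illustrative model choice
    embeddings = embedder.encode(["a photo of a cat", "a photo of a dog"])
    print(embeddings.shape)  # (2, 384) for this checkpoint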