Upload README.md with huggingface_hub
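The commit title indicates the file was pushed with the `huggingface_hub` client. A minimal sketch of that call, assuming write access and that the target is the gptq model repo referenced below (the `repo_id` is an assumption, not stated by the commit):

```python
from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj='README.md',
    path_in_repo='README.md',
    repo_id='vita-group/vicuna-7b-v1.3_gptq',  # assumed target repo
    commit_message='Upload README.md with huggingface_hub',
)
```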
README.md CHANGED
````diff
@@ -14,6 +14,7 @@ pip install torch==2.0.0+cu117 torchvision==0.15.1+cu117 torchaudio==2.0.1 --ind
 pip install transformers==4.31.0
 pip install accelerate
 pip install auto-gptq # for gptq
+pip install sentencepiece
 ```
 
 How to use pruned models
````
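The new `sentencepiece` dependency backs the slow, SentencePiece-based `LlamaTokenizer` that the Vicuna and Llama checkpoints below rely on; without it, tokenizer loading can fail. A quick sanity check for the environment, assuming the installs above completed and the Hub is reachable (the `use_fast=False` flag and the expected class name are illustrative):

```python
# Quick environment check; assumes the pip installs above succeeded.
import sentencepiece  # noqa: F401 -- used implicitly by the slow tokenizer
import transformers
from transformers import AutoTokenizer

print(transformers.__version__)  # expect 4.31.0, matching the pin above

# use_fast=False selects the SentencePiece-backed slow tokenizer,
# which is the code path that needs the new dependency.
tok = AutoTokenizer.from_pretrained('lmsys/vicuna-7b-v1.3', use_fast=False)
print(type(tok).__name__)  # LlamaTokenizer
```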
````diff
@@ -60,16 +61,18 @@ How to use gptq models
 ```python
 from transformers import AutoTokenizer
 from auto_gptq import AutoGPTQForCausalLM
+# model_path = 'vita-group/llama-2-7b_wanda_2_4_gptq_4bit_128g'
+# tokenizer_path = 'meta-llama/Llama-2-7b-hf'
 model_path = 'vita-group/vicuna-7b-v1.3_gptq'
 tokenizer_path = 'lmsys/vicuna-7b-v1.3'
-revision = '2bit_128g'
 model = AutoGPTQForCausalLM.from_quantized(
     model_path,
     # inject_fused_attention=False, # or
     disable_exllama=True,
     device_map='auto',
-    revision=
+    revision='2bit_128g',
 )
+from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
 input_ids = tokenizer('Hello! I am a VITA-compressed-LLM chatbot!', return_tensors='pt').input_ids.to('cuda')
 outputs = model.generate(input_ids=input_ids, max_length=128)
````
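With the patch applied, the gptq usage snippet reads as below. This consolidated version is a sketch, not the verbatim README: it drops the duplicate `AutoTokenizer` import that the diff inserts mid-script, and the trailing `decode` call plus the inline comments on `disable_exllama` and `revision` are editorial additions. A CUDA device is assumed.

```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Alternative checkpoint pair from the patch (left commented, as in the README):
# model_path = 'vita-group/llama-2-7b_wanda_2_4_gptq_4bit_128g'
# tokenizer_path = 'meta-llama/Llama-2-7b-hf'
model_path = 'vita-group/vicuna-7b-v1.3_gptq'
tokenizer_path = 'lmsys/vicuna-7b-v1.3'

model = AutoGPTQForCausalLM.from_quantized(
    model_path,
    # inject_fused_attention=False, # or
    disable_exllama=True,   # ExLlama kernels support 4-bit GPTQ only; off for 2-bit
    device_map='auto',
    revision='2bit_128g',   # Hub branch holding the 2-bit, group-size-128 weights
)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
input_ids = tokenizer('Hello! I am a VITA-compressed-LLM chatbot!', return_tensors='pt').input_ids.to('cuda')
outputs = model.generate(input_ids=input_ids, max_length=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))  # editorial addition
```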