jyhong836 committed
Commit 17895d8 · Parent(s): 886ca35

Upload README.md with huggingface_hub

Files changed (1): README.md (+5 -2)
README.md CHANGED

@@ -14,6 +14,7 @@ pip install torch==2.0.0+cu117 torchvision==0.15.1+cu117 torchaudio==2.0.1 --ind
 pip install transformers==4.31.0
 pip install accelerate
 pip install auto-gptq # for gptq
+pip install sentencepiece
 ```
 
 How to use pruned models
@@ -60,16 +61,18 @@ How to use gptq models
 ```python
 from transformers import AutoTokenizer
 from auto_gptq import AutoGPTQForCausalLM
+# model_path = 'vita-group/llama-2-7b_wanda_2_4_gptq_4bit_128g'
+# tokenizer_path = 'meta-llama/Llama-2-7b-hf'
 model_path = 'vita-group/vicuna-7b-v1.3_gptq'
 tokenizer_path = 'lmsys/vicuna-7b-v1.3'
-revision = '2bit_128g'
 model = AutoGPTQForCausalLM.from_quantized(
     model_path,
     # inject_fused_attention=False, # or
     disable_exllama=True,
     device_map='auto',
-    revision=revision,
+    revision='2bit_128g',
 )
+from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
 input_ids = tokenizer('Hello! I am a VITA-compressed-LLM chatbot!', return_tensors='pt').input_ids.to('cuda')
 outputs = model.generate(input_ids=input_ids, max_length=128)
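
For convenience, here is the post-commit snippet assembled into one self-contained script. The model IDs, `from_quantized` arguments, and prompt are taken verbatim from the diff above; the explanatory comments, the final decoding `print`, and the removal of the commit's duplicated `AutoTokenizer` import are illustrative additions. It assumes a CUDA GPU and the packages installed in the first hunk.

```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# GPTQ-quantized Vicuna-7B v1.3; the tokenizer comes from the original
# (uncompressed) repo. Alternative checkpoint from the diff (a Wanda
# 2:4-pruned, 4-bit GPTQ Llama-2, judging by the repo name):
# model_path = 'vita-group/llama-2-7b_wanda_2_4_gptq_4bit_128g'
# tokenizer_path = 'meta-llama/Llama-2-7b-hf'
model_path = 'vita-group/vicuna-7b-v1.3_gptq'
tokenizer_path = 'lmsys/vicuna-7b-v1.3'

model = AutoGPTQForCausalLM.from_quantized(
    model_path,
    # inject_fused_attention=False,  # alternative workaround noted in the README
    disable_exllama=True,   # exllama kernels support only 4-bit weights
    device_map='auto',
    revision='2bit_128g',   # repo branch with the 2-bit, group-size-128 variant
)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
input_ids = tokenizer('Hello! I am a VITA-compressed-LLM chatbot!',
                      return_tensors='pt').input_ids.to('cuda')
outputs = model.generate(input_ids=input_ids, max_length=128)
# Illustrative addition: turn the generated ids back into text.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Note that the `2bit_128g` revision name belongs to the Vicuna repo; if you switch to the commented-out Llama-2 checkpoint, the matching branch (if any) may differ.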