"""Custom Inference Endpoints handler that loads a 4-bit GPTQ-quantized
Wizard-Vicuna-13B LLaMA checkpoint and serves text-generation requests."""

import os

# Pin the handler to the first GPU; set before any CUDA work is triggered.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from typing import Any, Dict

import torch
import transformers
from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM

import quant
from utils import find_layers, DEV


class EndpointHandler:
    def __init__(self,
                 path="",
                 model_name="Wizard-Vicuna-13B-Uncensored-GPTQ",
                 checkpoint_path="Wizard-Vicuna-13B-Uncensored-GPTQ/Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors",
                 wbits=4,
                 groupsize=128,
                 fused_mlp=True,
                 eval=True,
                 warmup_autotune=True):
        # Resolve the model directory and checkpoint file relative to the
        # repository path supplied by the Inference Endpoints runtime.
        model_name = os.path.join(path, model_name)
        checkpoint_path = os.path.join(path, checkpoint_path)

        self.model = self.load_quant(model_name, checkpoint_path, wbits, groupsize, fused_mlp, eval, warmup_autotune)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        self.model.to(DEV)

    def load_quant(self, model, checkpoint, wbits, groupsize, fused_mlp, eval, warmup_autotune):
        config = LlamaConfig.from_pretrained(model)

        # Skip random weight initialization: every parameter is overwritten by the
        # quantized checkpoint below, so initializing them would only waste time.
        def noop(*args, **kwargs):
            pass

        torch.nn.init.kaiming_uniform_ = noop
        torch.nn.init.uniform_ = noop
        torch.nn.init.normal_ = noop

        # Build the empty model skeleton in half precision.
        torch.set_default_dtype(torch.half)
        transformers.modeling_utils._init_weights = False
        model = LlamaForCausalLM(config)
        torch.set_default_dtype(torch.float)
        if eval:
            model = model.eval()

        # Swap every linear layer except the output head for a quantized linear.
        layers = find_layers(model)
        for name in ['lm_head']:
            if name in layers:
                del layers[name]
        quant.make_quant_linear(model, layers, wbits, groupsize)
        del layers

        print('Loading model ...')
        if checkpoint.endswith('.safetensors'):
            from safetensors.torch import load_file as safe_load
            model.load_state_dict(safe_load(checkpoint), strict=False)
        else:
            model.load_state_dict(torch.load(checkpoint), strict=False)

        # Optional fused kernels and autotuning warm-up for faster inference.
        if eval:
            quant.make_quant_attn(model)
            quant.make_quant_norm(model)
            if fused_mlp:
                quant.make_fused_mlp(model)
        if warmup_autotune:
            quant.autotune_warmup_linear(model, transpose=not (eval))
            if eval and fused_mlp:
                quant.autotune_warmup_fused(model)
        model.seqlen = 2048
        print('Done.')

        return model

    def __call__(self, data: Any) -> Dict[str, str]:
        # The Inference Endpoints runtime passes a JSON payload; the prompt is
        # expected under the "inputs" key.
        input_text = data.pop("inputs", data)
        input_ids = self.tokenizer.encode(input_text, return_tensors="pt").to(DEV)

        with torch.no_grad():
            generated_ids = self.model.generate(
                input_ids,
                do_sample=True,
                min_length=50,
                max_length=200,
                top_p=0.95,
                temperature=0.8,
            )
        generated_text = self.tokenizer.decode(generated_ids[0])

        return {'generated_text': generated_text}
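

# --- Illustrative local smoke test (a sketch, not part of the deployed handler) ---
# Shows roughly how the Inference Endpoints runtime uses this class: construct
# EndpointHandler once with the repository path, then call it per request with a
# {"inputs": ...} payload. Assumptions: the model directory and checkpoint sit in
# the current working directory and a CUDA GPU with the quant kernels is available;
# the prompt string below is hypothetical.
if __name__ == "__main__":
    handler = EndpointHandler(path=".")
    sample = {"inputs": "USER: Write a haiku about the sea.\nASSISTANT:"}
    print(handler(sample)["generated_text"])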