DesiredName committed
Commit f394a62 · verified · 1 parent: c79d601

Update app.py

Files changed (1): app.py +6 -57
app.py CHANGED
@@ -1,62 +1,11 @@
- from transformers import AutoTokenizer
- from exllamav2 import (
-     ExLlamaV2,
-     ExLlamaV2Config,
-     ExLlamaV2Cache_CPU,
-     ExLlamaV2Tokenizer
- )
- from exllamav2.generator import (
-     ExLlamaV2StreamingGenerator,
-     ExLlamaV2Sampler
- )
- import torch
- import os

- # disable CUDA
- os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disables GPU detection
- os.environ["EXLLAMA_NO_CUDA"] = "1"  # Forces CPU mode in ExLlamaV2
-
- # Configure model
- model_dir = "TheBloke_Wizard-Vicuna-13B-GPTQ"  # Path to downloaded model
- config = ExLlamaV2Config()
- config.model_dir = model_dir
- config.device_map = "cpu"
- config.no_flash_attn = True  # Disable flash attention
- config.prepare()
-
- # Load model
- model = ExLlamaV2(config)
- cache = ExLlamaV2Cache_CPU(model)
- model.load_autosplit(cache)
-
- # Load tokenizer (HF-compatible)
- tokenizer = AutoTokenizer.from_pretrained(model_dir)
-
- def generate_response(prompt, max_tokens=200, temperature=0.7):
-     # Initialize generator
-     generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
-     generator.set_stop_conditions([tokenizer.eos_token_id])
-
-     # Configure sampler
-     settings = ExLlamaV2Sampler.Settings()
-     settings.temperature = temperature
-     settings.top_k = 50
-     settings.top_p = 0.8

-     # Encode prompt
-     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
-
-     # Generate
-     output = generator.generate_simple(
-         input_ids,
-         settings,
-         max_tokens,
-         seed=42
-     )
-
-     return tokenizer.decode(output[0], skip_special_tokens=True)
-
-
+ from gptqmodel import GPTQModel
+ # Load the GPTQ-quantized model from the Hugging Face Hub
+ model = GPTQModel.load("TheBloke/Wizard-Vicuna-13B-Uncensored-SuperHOT-8K-GPTQ")

+ async def generate_response(input: str):
+     result = model.generate(input)[0]
+     return model.tokenizer.decode(result)


  ##############################################
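
For reference, here is a minimal sketch of how the rewritten handler could be exercised end to end. It assumes, following gptqmodel's README usage, that GPTQModel.load pulls the quantized weights from the Hugging Face Hub and that model.generate accepts a raw prompt string; the asyncio.to_thread wrapper, the max_new_tokens cap, and the __main__ driver are illustrative additions, not part of this commit.

import asyncio

from gptqmodel import GPTQModel

model = GPTQModel.load("TheBloke/Wizard-Vicuna-13B-Uncensored-SuperHOT-8K-GPTQ")

async def generate_response(input: str) -> str:
    # generate() is a blocking call; run it in a worker thread so the
    # serving event loop stays responsive during long completions.
    # Assumption: gptqmodel tokenizes a raw string prompt internally and
    # forwards max_new_tokens to the underlying transformers generate().
    result = await asyncio.to_thread(model.generate, input, max_new_tokens=200)
    return model.tokenizer.decode(result[0], skip_special_tokens=True)

if __name__ == "__main__":
    print(asyncio.run(generate_response("Tell me a joke about llamas.")))

Note that as committed, generate_response awaits nothing: model.generate runs synchronously inside the coroutine, so a long completion would block the host's event loop. Offloading to a worker thread, as above, is one conventional fix; the removed exllamav2 path also exposed temperature and top-p, which could be restored the same way if gptqmodel forwards sampling kwargs.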