DesiredName committed on
Commit e53d910 · verified · 1 Parent(s): 55c2416

Update app.py

Files changed (1)
1. app.py +49 -24
app.py CHANGED
@@ -1,35 +1,60 @@
-from fastapi import FastAPI
-import uvicorn
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-# Load model and tokenizer
-
-model_id = "DavidAU/Llama-3.2-4X3B-MOE-Hell-California-Uncensored-10B-GGUF"
-#filename = "Llama-3.2-4X3B-MOE-Hell-California-10B-D_AU-Q3_k_s.gguf"
-filename = "Llama-3.2-4X3B-MOE-Hell-California-10B-D_AU-Q2_k.gguf"
-
-tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
-tokenizer.pad_token = tokenizer.eos_token
-
-model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename)
-
-def llama2_chat(prompt):
-    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
-    output = model.generate(
-        input_ids=inputs["input_ids"],
-        attention_mask=inputs["attention_mask"],  # Pass attention_mask!
-        max_new_tokens=100,
-        max_length=30,
-        temperature=0.3
-    )
-    response = tokenizer.decode(output[0], skip_special_tokens=True)
-    return response
-
+from transformers import AutoTokenizer
+from exllamav2 import (
+    ExLlamaV2,
+    ExLlamaV2Config,
+    ExLlamaV2Cache,
+    ExLlamaV2Tokenizer
+)
+from exllamav2.generator import (
+    ExLlamaV2StreamingGenerator,
+    ExLlamaV2Sampler
+)
+import torch
+
+# Configure model
+model_dir = "TheBloke_Wizard-Vicuna-13B-GPTQ"  # Path to downloaded model
+config = ExLlamaV2Config()
+config.model_dir = model_dir
+config.prepare()
+
+# Load model
+model = ExLlamaV2(config)
+cache = ExLlamaV2Cache(model)
+model.load_autosplit(cache)
+
+# Load tokenizer (HF-compatible)
+tokenizer = AutoTokenizer.from_pretrained(model_dir)
+
+def generate_response(prompt, max_tokens=200, temperature=0.7):
+    # Initialize generator
+    generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
+    generator.set_stop_conditions([tokenizer.eos_token_id])
+
+    # Configure sampler
+    settings = ExLlamaV2Sampler.Settings()
+    settings.temperature = temperature
+    settings.top_k = 50
+    settings.top_p = 0.8
+
+    # Encode prompt
+    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
+
+    # Generate
+    output = generator.generate_simple(
+        input_ids,
+        settings,
+        max_tokens,
+        seed=42
+    )
+
+    return tokenizer.decode(output[0], skip_special_tokens=True)
+
+##############################################
+
+from fastapi import FastAPI
+import uvicorn
+
 app = FastAPI()
 
@@ -39,7 +64,7 @@ def greet_json():
 
 @app.get("/message")
 async def message(input: str):
-    return llama2_chat(input)
+    return generate_response(input)
 
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)
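
For reference, a minimal sketch of how the updated /message endpoint could be exercised once the app is running. It is not part of the commit; it assumes the host/port from the __main__ block above and uses the requests library, and the example prompt is purely illustrative.

# Hypothetical client-side check of the /message endpoint
# (assumes the app is reachable on localhost:7860, as configured in app.py).
import requests

resp = requests.get(
    "http://localhost:7860/message",
    params={"input": "Hello, who are you?"},  # example prompt, not from the commit
)
print(resp.json())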