DesiredName committed
Commit 90b161e · verified · 1 Parent(s): 0897499

Update app.py

Files changed (1)
  app.py  +13 -38
app.py CHANGED
@@ -1,54 +1,29 @@
  from fastapi import FastAPI
  import uvicorn

- model_name = "TheBloke/Luna-AI-Llama2-Uncensored-GPTQ"
+ model_name = "Llama-3.2-4X3B-MOE-Hell-California-Uncensored-10B-GGUF"

- from transformers import AutoModel, AutoTokenizer, TextStreamer, BitsAndBytesConfig
+ from transformers import AutoModel, AutoTokenizer, TextStreamer
  import torch

- bnb_config = BitsAndBytesConfig(
-     load_in_8bit=True,
-     llm_int8_enable_fp32_cpu_offload=True,
- )
-
  # Load model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModel.from_pretrained(
      model_name,
-     quantization_config=bnb_config,
-     device_map="cpu",
-     trust_remote_code=True  # Required for Llama 2
+     device_map="auto",
+     trust_remote_code=True
  )

- # Set chat template (critical for chat models)
- tokenizer.chat_template = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content + ' ' + eos_token }}{% endif %}{% endfor %}"
-
- def llama2_chat(prompt, system_prompt="You are a helpful assistant."):
-     # Format as Llama 2 chat
-     messages = [
-         {"role": "system", "content": system_prompt},
-         {"role": "user", "content": prompt}
-     ]
-
-     # Tokenize with chat template
-     inputs = tokenizer.apply_chat_template(
-         messages,
-         return_tensors="pt"
-     ).to(model.device)
-
-     # Stream output tokens
-     streamer = TextStreamer(tokenizer, skip_prompt=True)
-
-     # Generate response
-     outputs = model.generate(
-         inputs,
-         max_new_tokens=1000,
-         temperature=0.7,
-         streamer=streamer
+ def llama2_chat(prompt):
+     inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
+     output = model.generate(
+         input_ids=inputs["input_ids"],
+         attention_mask=inputs["attention_mask"],  # Pass attention_mask!
+         max_new_tokens=100,
+         temperature=0.3
      )
-
-     # Decode full output
-     return tokenizer.decode(outputs[0], skip_special_tokens=True)
+     response = tokenizer.decode(output[0], skip_special_tokens=True)
+     return response
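Note on the committed version, not part of the diff itself: llama2_chat tokenizes input_text, a name that is never defined, so calling the function would fail; AutoModel also loads the backbone without a language-model head, which transformers' generate() is not meant for, and temperature only takes effect when sampling is enabled. A minimal corrected sketch under those assumptions follows; the model id is kept exactly as committed, and whether transformers can load this GGUF repository directly is a separate question not addressed here.

# Hedged sketch, not the committed code: assumes AutoModelForCausalLM is the
# intended class and that the tokenizer may lack a pad token for padding=True.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Llama-3.2-4X3B-MOE-Hell-California-Uncensored-10B-GGUF"  # as committed

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Llama-family tokenizers often ship without one
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
)

def llama2_chat(prompt):
    # Tokenize the function argument, not the undefined input_text
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    output = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=100,
        do_sample=True,   # temperature is only honoured when sampling
        temperature=0.3,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)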
 
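Both sides of the diff import FastAPI and uvicorn, but no route appears in this hunk. For reference only, one hypothetical way llama2_chat could be exposed as an endpoint is sketched below; the /chat path, request model, and port are illustrative assumptions, not taken from the commit.

# Hypothetical FastAPI wiring (illustrative; not present in the commit).
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn

app = FastAPI()

class ChatRequest(BaseModel):
    prompt: str

@app.post("/chat")  # assumed route name
def chat(req: ChatRequest):
    return {"response": llama2_chat(req.prompt)}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)  # port 7860 assumed (common Spaces default)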