Update app.py
app.py CHANGED
@@ -10,7 +10,7 @@ from threading import Thread
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 MODEL = "AGI-0/Artificium-llama3.1-8B-001"
 
-TITLE = """<h2>Link to the model: <a href="https://huggingface.co/AGI-0/Artificium-llama3.1-8B-001" title="Visit the model repository on Hugging Face">AGI-0/Artificium-llama3.1-8B-001</a> please leave a like to the repository if you liked it</h2>"""
+TITLE = """<h2>Link to the model: <a href="https://huggingface.co/AGI-0/Artificium-llama3.1-8B-001" title="Visit the model repository on Hugging Face">AGI-0/Artificium-llama3.1-8B-001</a> please leave a like to the repository if you liked it. THIS INFERENCE IS 4-Bit Quantized</h2>"""
 
 PLACEHOLDER = """
 <center>
@@ -36,6 +36,7 @@ device = "cuda" # for GPU usage or "cpu" for CPU usage
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL,
+    load_in_4bit=True,
     torch_dtype=torch.bfloat16,
     device_map="auto")
 
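Note on the change: passing load_in_4bit=True directly to from_pretrained enables 4-bit quantized weights through bitsandbytes, which is what lets the 8B model fit comfortably on the Space's GPU. As a minimal sketch only (not part of this commit), the same load can be expressed with a BitsAndBytesConfig passed via quantization_config, the form recent transformers releases prefer over the bare kwarg; the quant type and compute dtype below are illustrative choices, not settings taken from this Space.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL = "AGI-0/Artificium-llama3.1-8B-001"

# Sketch: equivalent 4-bit load using an explicit quantization config.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # quantize the weights to 4-bit via bitsandbytes
    bnb_4bit_quant_type="nf4",              # illustrative choice of 4-bit format
    bnb_4bit_compute_dtype=torch.bfloat16,  # keep compute in bf16, matching the Space's dtype
)

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    quantization_config=bnb_config,
    device_map="auto",
)

This requires the bitsandbytes package and a CUDA device; on a CPU-only machine the 4-bit path is not available and the model would have to be loaded unquantized.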