Update app.py
app.py
CHANGED
@@ -60,6 +60,142 @@ description = """Gemma 3 is a family of lightweight, multimodal open models that
 llm = None
 llm_model = None
 
+import ctypes
+import os
+import multiprocessing
+
+import llama_cpp
+
+
+def test():
+    llama_cpp.llama_backend_init(numa=False)
+
+    N_THREADS = multiprocessing.cpu_count()
+    MODEL_PATH = os.environ.get("MODEL", "/mnt/md0/models/t5-base.gguf")
+
+    prompt = b"translate English to German: The house is wonderful."
+
+    lparams = llama_cpp.llama_model_default_params()
+    model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams)
+
+    vocab = llama_cpp.llama_model_get_vocab(model)
+
+    cparams = llama_cpp.llama_context_default_params()
+    cparams.no_perf = False
+    ctx = llama_cpp.llama_init_from_model(model, cparams)
+
+    sparams = llama_cpp.llama_sampler_chain_default_params()
+    smpl = llama_cpp.llama_sampler_chain_init(sparams)
+    llama_cpp.llama_sampler_chain_add(smpl, llama_cpp.llama_sampler_init_greedy())
+
+    n_past = 0
+
+    embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
+
+    n_of_tok = llama_cpp.llama_tokenize(
+        vocab,
+        prompt,
+        len(prompt),
+        embd_inp,
+        len(embd_inp),
+        True,
+        True,
+    )
+
+    embd_inp = embd_inp[:n_of_tok]
+
+    n_ctx = llama_cpp.llama_n_ctx(ctx)
+
+    n_predict = 20
+    n_predict = min(n_predict, n_ctx - len(embd_inp))
+
+    input_consumed = 0
+    input_noecho = False
+
+    remaining_tokens = n_predict
+
+    embd = []
+    last_n_size = 64
+    last_n_tokens_data = [0] * last_n_size
+    n_batch = 24
+    last_n_repeat = 64
+    repeat_penalty = 1
+    frequency_penalty = 0.0
+    presence_penalty = 0.0
+
+    batch = llama_cpp.llama_batch_init(n_batch, 0, 1)
+
+    # prepare batch for encoding containing the prompt
+    batch.n_tokens = len(embd_inp)
+    for i in range(batch.n_tokens):
+        batch.token[i] = embd_inp[i]
+        batch.pos[i] = i
+        batch.n_seq_id[i] = 1
+        batch.seq_id[i][0] = 0
+        batch.logits[i] = False
+
+    llama_cpp.llama_encode(
+        ctx,
+        batch
+    )
+
+    # now overwrite embd_inp so batch for decoding will initially contain only
+    # a single token with id acquired from llama_model_decoder_start_token(model)
+    embd_inp = [llama_cpp.llama_model_decoder_start_token(model)]
+
+    while remaining_tokens > 0:
+        if len(embd) > 0:
+            batch.n_tokens = len(embd)
+            for i in range(batch.n_tokens):
+                batch.token[i] = embd[i]
+                batch.pos[i] = n_past + i
+                batch.n_seq_id[i] = 1
+                batch.seq_id[i][0] = 0
+                batch.logits[i] = i == batch.n_tokens - 1
+
+            llama_cpp.llama_decode(
+                ctx,
+                batch
+            )
+
+        n_past += len(embd)
+        embd = []
+        if len(embd_inp) <= input_consumed:
+            id = llama_cpp.llama_sampler_sample(smpl, ctx, -1)
+
+            last_n_tokens_data = last_n_tokens_data[1:] + [id]
+            embd.append(id)
+            input_noecho = False
+            remaining_tokens -= 1
+        else:
+            while len(embd_inp) > input_consumed:
+                embd.append(embd_inp[input_consumed])
+                last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]]
+                input_consumed += 1
+                if len(embd) >= n_batch:
+                    break
+        if not input_noecho:
+            for id in embd:
+                size = 32
+                buffer = (ctypes.c_char * size)()
+                n = llama_cpp.llama_token_to_piece(
+                    vocab, llama_cpp.llama_token(id), buffer, size, 0, True
+                )
+                assert n <= size
+                print(
+                    buffer[:n].decode("utf-8"),
+                    end="",
+                    flush=True,
+                )
+
+        if len(embd) > 0 and embd[-1] in [llama_cpp.llama_token_eos(vocab), llama_cpp.llama_token_eot(vocab)]:
+            break
+
+    print()
+
+
 def trans(text):
 
 
@@ -305,3 +441,4 @@ demo = gr.ChatInterface(
 # Launch the chat interface
 if __name__ == "__main__":
     demo.launch(debug=False)
+    test()