rkonan@rkonan-ThinkPad-T460:~$ OLLAMA_DEBUG=1 ollama serve
time=2025-08-09T22:41:31.741+02:00 level=INFO source=routes.go:1304 msg="server config" env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:4096 OLLAMA_DEBUG:DEBUG OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/home/rkonan/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_SCHED_SPREAD:false ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
time=2025-08-09T22:41:31.743+02:00 level=INFO source=images.go:477 msg="total blobs: 9"
time=2025-08-09T22:41:31.743+02:00 level=INFO source=images.go:484 msg="total unused blobs removed: 0"
time=2025-08-09T22:41:31.743+02:00 level=INFO source=routes.go:1357 msg="Listening on 127.0.0.1:11434 (version 0.11.4)"
time=2025-08-09T22:41:31.744+02:00 level=DEBUG source=sched.go:106 msg="starting llm scheduler"
time=2025-08-09T22:41:31.744+02:00 level=INFO source=gpu.go:217 msg="looking for compatible GPUs"
time=2025-08-09T22:41:31.745+02:00 level=DEBUG source=gpu.go:98 msg="searching for GPU discovery libraries for NVIDIA"
time=2025-08-09T22:41:31.745+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcuda.so*
time=2025-08-09T22:41:31.745+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcuda.so* /home/rkonan/libcuda.so* /usr/local/cuda*/targets/*/lib/libcuda.so* /usr/lib/*-linux-gnu/nvidia/current/libcuda.so* /usr/lib/*-linux-gnu/libcuda.so* /usr/lib/wsl/lib/libcuda.so* /usr/lib/wsl/drivers/*/libcuda.so* /opt/cuda/lib*/libcuda.so* /usr/local/cuda/lib*/libcuda.so* /usr/lib*/libcuda.so* /usr/local/lib*/libcuda.so*]"
time=2025-08-09T22:41:31.750+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[]
time=2025-08-09T22:41:31.750+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcudart.so*
time=2025-08-09T22:41:31.750+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcudart.so* /home/rkonan/libcudart.so* /usr/local/lib/ollama/cuda_v*/libcudart.so* /usr/local/cuda/lib64/libcudart.so* /usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/x86_64-linux-gnu/libcudart.so* /usr/lib/wsl/lib/libcudart.so* /usr/lib/wsl/drivers/*/libcudart.so* /opt/cuda/lib64/libcudart.so* /usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so* /usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/aarch64-linux-gnu/libcudart.so* /usr/local/cuda/lib*/libcudart.so* /usr/lib*/libcudart.so* /usr/local/lib*/libcudart.so*]"
time=2025-08-09T22:41:31.753+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[/usr/local/lib/ollama/libcudart.so.12.8.90]
cudaSetDevice err: 35
time=2025-08-09T22:41:31.754+02:00 level=DEBUG source=gpu.go:574 msg="Unable to load cudart library /usr/local/lib/ollama/libcudart.so.12.8.90: your nvidia driver is too old or missing.  If you have a CUDA GPU please upgrade to run ollama"
time=2025-08-09T22:41:31.754+02:00 level=DEBUG source=amd_linux.go:419 msg="amdgpu driver not detected /sys/module/amdgpu"
time=2025-08-09T22:41:31.754+02:00 level=INFO source=gpu.go:377 msg="no compatible GPUs were discovered"
time=2025-08-09T22:41:31.754+02:00 level=INFO source=types.go:130 msg="inference compute" id=0 library=cpu variant="" compute="" driver=0.0 name="" total="15.5 GiB" available="11.6 GiB"
time=2025-08-09T22:41:31.754+02:00 level=INFO source=routes.go:1398 msg="entering low vram mode" "total vram"="15.5 GiB" threshold="20.0 GiB"
[GIN] 2025/08/09 - 22:41:51 | 200 |        96.9µs |       127.0.0.1 | HEAD     "/"
time=2025-08-09T22:41:51.222+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=general.alignment default=32
[GIN] 2025/08/09 - 22:41:51 | 200 |  110.417215ms |       127.0.0.1 | POST     "/api/show"
time=2025-08-09T22:41:51.296+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="11.6 GiB" before.free_swap="2.1 GiB" now.total="15.5 GiB" now.free="11.6 GiB" now.free_swap="2.1 GiB"
time=2025-08-09T22:41:51.296+02:00 level=DEBUG source=sched.go:183 msg="updating default concurrency" OLLAMA_MAX_LOADED_MODELS=3 gpu_count=1
time=2025-08-09T22:41:51.319+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=general.alignment default=32
time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=sched.go:213 msg="cpu mode with first model, loading"
time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="11.6 GiB" before.free_swap="2.1 GiB" now.total="15.5 GiB" now.free="11.6 GiB" now.free_swap="2.1 GiB"
time=2025-08-09T22:41:51.380+02:00 level=INFO source=server.go:135 msg="system memory" total="15.5 GiB" free="11.6 GiB" free_swap="2.1 GiB"
time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=memory.go:111 msg=evaluating library=cpu gpu_count=1 available="[11.6 GiB]"
time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.vision.block_count default=0
time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.key_length default=128
time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.value_length default=128
time=2025-08-09T22:41:51.381+02:00 level=INFO source=server.go:175 msg=offload library=cpu layers.requested=-1 layers.model=37 layers.offload=0 layers.split="" memory.available="[11.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="2.3 GiB" memory.required.partial="0 B" memory.required.kv="144.0 MiB" memory.required.allocations="[2.3 GiB]" memory.weights.total="1.8 GiB" memory.weights.repeating="1.6 GiB" memory.weights.nonrepeating="243.4 MiB" memory.graph.full="300.8 MiB" memory.graph.partial="544.2 MiB"
time=2025-08-09T22:41:51.381+02:00 level=DEBUG source=server.go:291 msg="compatible gpu libraries" compatible=[]
llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Qwen2.5 3B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Qwen2.5
llama_model_loader: - kv   5:                         general.size_label str              = 3B
llama_model_loader: - kv   6:                            general.license str              = other
llama_model_loader: - kv   7:                       general.license.name str              = qwen-research
llama_model_loader: - kv   8:                       general.license.link str              = https://huggingface.co/Qwen/Qwen2.5-3...
llama_model_loader: - kv   9:                   general.base_model.count u32              = 1
llama_model_loader: - kv  10:                  general.base_model.0.name str              = Qwen2.5 3B
llama_model_loader: - kv  11:          general.base_model.0.organization str              = Qwen
llama_model_loader: - kv  12:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen2.5-3B
llama_model_loader: - kv  13:                               general.tags arr[str,2]       = ["chat", "text-generation"]
llama_model_loader: - kv  14:                          general.languages arr[str,1]       = ["en"]
llama_model_loader: - kv  15:                          qwen2.block_count u32              = 36
llama_model_loader: - kv  16:                       qwen2.context_length u32              = 32768
llama_model_loader: - kv  17:                     qwen2.embedding_length u32              = 2048
llama_model_loader: - kv  18:                  qwen2.feed_forward_length u32              = 11008
llama_model_loader: - kv  19:                 qwen2.attention.head_count u32              = 16
llama_model_loader: - kv  20:              qwen2.attention.head_count_kv u32              = 2
llama_model_loader: - kv  21:                       qwen2.rope.freq_base f32              = 1000000.000000
llama_model_loader: - kv  22:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000001
llama_model_loader: - kv  23:                          general.file_type u32              = 15
llama_model_loader: - kv  24:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  25:                         tokenizer.ggml.pre str              = qwen2
llama_model_loader: - kv  26:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  27:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  28:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv  29:                tokenizer.ggml.eos_token_id u32              = 151645
llama_model_loader: - kv  30:            tokenizer.ggml.padding_token_id u32              = 151643
llama_model_loader: - kv  31:                tokenizer.ggml.bos_token_id u32              = 151643
llama_model_loader: - kv  32:               tokenizer.ggml.add_bos_token bool             = false
llama_model_loader: - kv  33:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
llama_model_loader: - kv  34:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:  181 tensors
llama_model_loader: - type q4_K:  216 tensors
llama_model_loader: - type q6_K:   37 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type   = Q4_K - Medium
print_info: file size   = 1.79 GiB (4.99 BPW) 
init_tokenizer: initializing tokenizer for type 2
load: control token: 151660 '<|fim_middle|>' is not marked as EOG
load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
load: control token: 151653 '<|vision_end|>' is not marked as EOG
load: control token: 151648 '<|box_start|>' is not marked as EOG
load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
load: control token: 151649 '<|box_end|>' is not marked as EOG
load: control token: 151655 '<|image_pad|>' is not marked as EOG
load: control token: 151651 '<|quad_end|>' is not marked as EOG
load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
load: control token: 151652 '<|vision_start|>' is not marked as EOG
load: control token: 151654 '<|vision_pad|>' is not marked as EOG
load: control token: 151656 '<|video_pad|>' is not marked as EOG
load: control token: 151644 '<|im_start|>' is not marked as EOG
load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
load: control token: 151650 '<|quad_start|>' is not marked as EOG
load: special tokens cache size = 22
load: token to piece cache size = 0.9310 MB
print_info: arch             = qwen2
print_info: vocab_only       = 1
print_info: model type       = ?B
print_info: model params     = 3.09 B
print_info: general.name     = Qwen2.5 3B Instruct
print_info: vocab type       = BPE
print_info: n_vocab          = 151936
print_info: n_merges         = 151387
print_info: BOS token        = 151643 '<|endoftext|>'
print_info: EOS token        = 151645 '<|im_end|>'
print_info: EOT token        = 151645 '<|im_end|>'
print_info: PAD token        = 151643 '<|endoftext|>'
print_info: LF token         = 198 'Ċ'
print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
print_info: FIM MID token    = 151660 '<|fim_middle|>'
print_info: FIM PAD token    = 151662 '<|fim_pad|>'
print_info: FIM REP token    = 151663 '<|repo_name|>'
print_info: FIM SEP token    = 151664 '<|file_sep|>'
print_info: EOG token        = 151643 '<|endoftext|>'
print_info: EOG token        = 151645 '<|im_end|>'
print_info: EOG token        = 151662 '<|fim_pad|>'
print_info: EOG token        = 151663 '<|repo_name|>'
print_info: EOG token        = 151664 '<|file_sep|>'
print_info: max token length = 256
llama_model_load: vocab only - skipping tensors
time=2025-08-09T22:41:51.857+02:00 level=DEBUG source=gpu.go:695 msg="no filter required for library cpu"
time=2025-08-09T22:41:51.857+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="/usr/local/bin/ollama runner --model /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 --ctx-size 4096 --batch-size 512 --threads 2 --no-mmap --parallel 1 --port 42013"
time=2025-08-09T22:41:51.857+02:00 level=DEBUG source=server.go:439 msg=subprocess OLLAMA_DEBUG=1 PATH=/home/rkonan/miniconda3/bin:/home/rkonan/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/snap/bin:/home/rkonan/.local/bin:/home/rkonan/.local/bin OLLAMA_MAX_LOADED_MODELS=3 OLLAMA_LIBRARY_PATH=/usr/local/lib/ollama LD_LIBRARY_PATH=/usr/local/lib/ollama:/usr/local/lib/ollama
time=2025-08-09T22:41:51.857+02:00 level=INFO source=sched.go:481 msg="loaded runners" count=1
time=2025-08-09T22:41:51.858+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
time=2025-08-09T22:41:51.858+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server not responding"
time=2025-08-09T22:41:51.877+02:00 level=INFO source=runner.go:815 msg="starting go runner"
time=2025-08-09T22:41:51.878+02:00 level=DEBUG source=ggml.go:94 msg="ggml backend load all from path" path=/usr/local/lib/ollama
load_backend: loaded CPU backend from /usr/local/lib/ollama/libggml-cpu-haswell.so
time=2025-08-09T22:41:51.892+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 compiler=cgo(gcc)
time=2025-08-09T22:41:51.892+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:42013"
llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Qwen2.5 3B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Qwen2.5
llama_model_loader: - kv   5:                         general.size_label str              = 3B
llama_model_loader: - kv   6:                            general.license str              = other
llama_model_loader: - kv   7:                       general.license.name str              = qwen-research
llama_model_loader: - kv   8:                       general.license.link str              = https://huggingface.co/Qwen/Qwen2.5-3...
llama_model_loader: - kv   9:                   general.base_model.count u32              = 1
llama_model_loader: - kv  10:                  general.base_model.0.name str              = Qwen2.5 3B
llama_model_loader: - kv  11:          general.base_model.0.organization str              = Qwen
llama_model_loader: - kv  12:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen2.5-3B
llama_model_loader: - kv  13:                               general.tags arr[str,2]       = ["chat", "text-generation"]
llama_model_loader: - kv  14:                          general.languages arr[str,1]       = ["en"]
llama_model_loader: - kv  15:                          qwen2.block_count u32              = 36
llama_model_loader: - kv  16:                       qwen2.context_length u32              = 32768
llama_model_loader: - kv  17:                     qwen2.embedding_length u32              = 2048
llama_model_loader: - kv  18:                  qwen2.feed_forward_length u32              = 11008
llama_model_loader: - kv  19:                 qwen2.attention.head_count u32              = 16
llama_model_loader: - kv  20:              qwen2.attention.head_count_kv u32              = 2
llama_model_loader: - kv  21:                       qwen2.rope.freq_base f32              = 1000000.000000
llama_model_loader: - kv  22:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000001
llama_model_loader: - kv  23:                          general.file_type u32              = 15
llama_model_loader: - kv  24:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  25:                         tokenizer.ggml.pre str              = qwen2
llama_model_loader: - kv  26:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  27:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  28:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv  29:                tokenizer.ggml.eos_token_id u32              = 151645
llama_model_loader: - kv  30:            tokenizer.ggml.padding_token_id u32              = 151643
llama_model_loader: - kv  31:                tokenizer.ggml.bos_token_id u32              = 151643
llama_model_loader: - kv  32:               tokenizer.ggml.add_bos_token bool             = false
llama_model_loader: - kv  33:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
llama_model_loader: - kv  34:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:  181 tensors
llama_model_loader: - type q4_K:  216 tensors
llama_model_loader: - type q6_K:   37 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type   = Q4_K - Medium
print_info: file size   = 1.79 GiB (4.99 BPW) 
time=2025-08-09T22:41:52.110+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
init_tokenizer: initializing tokenizer for type 2
load: control token: 151660 '<|fim_middle|>' is not marked as EOG
load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
load: control token: 151653 '<|vision_end|>' is not marked as EOG
load: control token: 151648 '<|box_start|>' is not marked as EOG
load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
load: control token: 151649 '<|box_end|>' is not marked as EOG
load: control token: 151655 '<|image_pad|>' is not marked as EOG
load: control token: 151651 '<|quad_end|>' is not marked as EOG
load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
load: control token: 151652 '<|vision_start|>' is not marked as EOG
load: control token: 151654 '<|vision_pad|>' is not marked as EOG
load: control token: 151656 '<|video_pad|>' is not marked as EOG
load: control token: 151644 '<|im_start|>' is not marked as EOG
load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
load: control token: 151650 '<|quad_start|>' is not marked as EOG
load: special tokens cache size = 22
load: token to piece cache size = 0.9310 MB
print_info: arch             = qwen2
print_info: vocab_only       = 0
print_info: n_ctx_train      = 32768
print_info: n_embd           = 2048
print_info: n_layer          = 36
print_info: n_head           = 16
print_info: n_head_kv        = 2
print_info: n_rot            = 128
print_info: n_swa            = 0
print_info: n_swa_pattern    = 1
print_info: n_embd_head_k    = 128
print_info: n_embd_head_v    = 128
print_info: n_gqa            = 8
print_info: n_embd_k_gqa     = 256
print_info: n_embd_v_gqa     = 256
print_info: f_norm_eps       = 0.0e+00
print_info: f_norm_rms_eps   = 1.0e-06
print_info: f_clamp_kqv      = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale    = 0.0e+00
print_info: f_attn_scale     = 0.0e+00
print_info: n_ff             = 11008
print_info: n_expert         = 0
print_info: n_expert_used    = 0
print_info: causal attn      = 1
print_info: pooling type     = -1
print_info: rope type        = 2
print_info: rope scaling     = linear
print_info: freq_base_train  = 1000000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn  = 32768
print_info: rope_finetuned   = unknown
print_info: ssm_d_conv       = 0
print_info: ssm_d_inner      = 0
print_info: ssm_d_state      = 0
print_info: ssm_dt_rank      = 0
print_info: ssm_dt_b_c_rms   = 0
print_info: model type       = 3B
print_info: model params     = 3.09 B
print_info: general.name     = Qwen2.5 3B Instruct
print_info: vocab type       = BPE
print_info: n_vocab          = 151936
print_info: n_merges         = 151387
print_info: BOS token        = 151643 '<|endoftext|>'
print_info: EOS token        = 151645 '<|im_end|>'
print_info: EOT token        = 151645 '<|im_end|>'
print_info: PAD token        = 151643 '<|endoftext|>'
print_info: LF token         = 198 'Ċ'
print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
print_info: FIM MID token    = 151660 '<|fim_middle|>'
print_info: FIM PAD token    = 151662 '<|fim_pad|>'
print_info: FIM REP token    = 151663 '<|repo_name|>'
print_info: FIM SEP token    = 151664 '<|file_sep|>'
print_info: EOG token        = 151643 '<|endoftext|>'
print_info: EOG token        = 151645 '<|im_end|>'
print_info: EOG token        = 151662 '<|fim_pad|>'
print_info: EOG token        = 151663 '<|repo_name|>'
print_info: EOG token        = 151664 '<|file_sep|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = false)
load_tensors: layer   0 assigned to device CPU, is_swa = 0
load_tensors: layer   1 assigned to device CPU, is_swa = 0
load_tensors: layer   2 assigned to device CPU, is_swa = 0
load_tensors: layer   3 assigned to device CPU, is_swa = 0
load_tensors: layer   4 assigned to device CPU, is_swa = 0
load_tensors: layer   5 assigned to device CPU, is_swa = 0
load_tensors: layer   6 assigned to device CPU, is_swa = 0
load_tensors: layer   7 assigned to device CPU, is_swa = 0
load_tensors: layer   8 assigned to device CPU, is_swa = 0
load_tensors: layer   9 assigned to device CPU, is_swa = 0
load_tensors: layer  10 assigned to device CPU, is_swa = 0
load_tensors: layer  11 assigned to device CPU, is_swa = 0
load_tensors: layer  12 assigned to device CPU, is_swa = 0
load_tensors: layer  13 assigned to device CPU, is_swa = 0
load_tensors: layer  14 assigned to device CPU, is_swa = 0
load_tensors: layer  15 assigned to device CPU, is_swa = 0
load_tensors: layer  16 assigned to device CPU, is_swa = 0
load_tensors: layer  17 assigned to device CPU, is_swa = 0
load_tensors: layer  18 assigned to device CPU, is_swa = 0
load_tensors: layer  19 assigned to device CPU, is_swa = 0
load_tensors: layer  20 assigned to device CPU, is_swa = 0
load_tensors: layer  21 assigned to device CPU, is_swa = 0
load_tensors: layer  22 assigned to device CPU, is_swa = 0
load_tensors: layer  23 assigned to device CPU, is_swa = 0
load_tensors: layer  24 assigned to device CPU, is_swa = 0
load_tensors: layer  25 assigned to device CPU, is_swa = 0
load_tensors: layer  26 assigned to device CPU, is_swa = 0
load_tensors: layer  27 assigned to device CPU, is_swa = 0
load_tensors: layer  28 assigned to device CPU, is_swa = 0
load_tensors: layer  29 assigned to device CPU, is_swa = 0
load_tensors: layer  30 assigned to device CPU, is_swa = 0
load_tensors: layer  31 assigned to device CPU, is_swa = 0
load_tensors: layer  32 assigned to device CPU, is_swa = 0
load_tensors: layer  33 assigned to device CPU, is_swa = 0
load_tensors: layer  34 assigned to device CPU, is_swa = 0
load_tensors: layer  35 assigned to device CPU, is_swa = 0
load_tensors: layer  36 assigned to device CPU, is_swa = 0
load_tensors:          CPU model buffer size =  1834.82 MiB
load_all_data: no device found for buffer type CPU for async uploads
time=2025-08-09T22:41:52.865+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.28"
time=2025-08-09T22:41:53.116+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.50"
time=2025-08-09T22:41:53.366+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.72"
time=2025-08-09T22:41:53.617+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.97"
llama_context: constructing llama_context
llama_context: n_seq_max     = 1
llama_context: n_ctx         = 4096
llama_context: n_ctx_per_seq = 4096
llama_context: n_batch       = 512
llama_context: n_ubatch      = 512
llama_context: causal_attn   = 1
llama_context: flash_attn    = 0
llama_context: freq_base     = 1000000.0
llama_context: freq_scale    = 1
llama_context: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
set_abort_callback: call
llama_context:        CPU  output buffer size =     0.59 MiB
create_memory: n_ctx = 4096 (padded)
llama_kv_cache_unified: kv_size = 4096, type_k = 'f16', type_v = 'f16', n_layer = 36, can_shift = 1, padding = 32
llama_kv_cache_unified: layer   0: dev = CPU
llama_kv_cache_unified: layer   1: dev = CPU
llama_kv_cache_unified: layer   2: dev = CPU
llama_kv_cache_unified: layer   3: dev = CPU
llama_kv_cache_unified: layer   4: dev = CPU
llama_kv_cache_unified: layer   5: dev = CPU
llama_kv_cache_unified: layer   6: dev = CPU
llama_kv_cache_unified: layer   7: dev = CPU
llama_kv_cache_unified: layer   8: dev = CPU
llama_kv_cache_unified: layer   9: dev = CPU
llama_kv_cache_unified: layer  10: dev = CPU
llama_kv_cache_unified: layer  11: dev = CPU
llama_kv_cache_unified: layer  12: dev = CPU
llama_kv_cache_unified: layer  13: dev = CPU
llama_kv_cache_unified: layer  14: dev = CPU
llama_kv_cache_unified: layer  15: dev = CPU
llama_kv_cache_unified: layer  16: dev = CPU
llama_kv_cache_unified: layer  17: dev = CPU
llama_kv_cache_unified: layer  18: dev = CPU
llama_kv_cache_unified: layer  19: dev = CPU
llama_kv_cache_unified: layer  20: dev = CPU
llama_kv_cache_unified: layer  21: dev = CPU
llama_kv_cache_unified: layer  22: dev = CPU
llama_kv_cache_unified: layer  23: dev = CPU
llama_kv_cache_unified: layer  24: dev = CPU
llama_kv_cache_unified: layer  25: dev = CPU
llama_kv_cache_unified: layer  26: dev = CPU
llama_kv_cache_unified: layer  27: dev = CPU
llama_kv_cache_unified: layer  28: dev = CPU
llama_kv_cache_unified: layer  29: dev = CPU
llama_kv_cache_unified: layer  30: dev = CPU
llama_kv_cache_unified: layer  31: dev = CPU
llama_kv_cache_unified: layer  32: dev = CPU
llama_kv_cache_unified: layer  33: dev = CPU
llama_kv_cache_unified: layer  34: dev = CPU
llama_kv_cache_unified: layer  35: dev = CPU
llama_kv_cache_unified:        CPU KV buffer size =   144.00 MiB
llama_kv_cache_unified: KV self size  =  144.00 MiB, K (f16):   72.00 MiB, V (f16):   72.00 MiB
llama_context: enumerating backends
llama_context: backend_ptrs.size() = 1
llama_context: max_nodes = 65536
llama_context: worst-case: n_tokens = 512, n_seqs = 1, n_outputs = 0
llama_context: reserving graph for n_tokens = 512, n_seqs = 1
llama_context: reserving graph for n_tokens = 1, n_seqs = 1
llama_context: reserving graph for n_tokens = 512, n_seqs = 1
llama_context:        CPU compute buffer size =   300.75 MiB
llama_context: graph nodes  = 1338
llama_context: graph splits = 1
time=2025-08-09T22:41:53.869+02:00 level=INFO source=server.go:637 msg="llama runner started in 2.01 seconds"
time=2025-08-09T22:41:53.869+02:00 level=DEBUG source=sched.go:493 msg="finished setting up" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=213592 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096
time=2025-08-09T22:41:53.870+02:00 level=DEBUG source=server.go:736 msg="completion request" images=0 prompt=155 format=""
time=2025-08-09T22:41:53.880+02:00 level=DEBUG source=cache.go:104 msg="loading cache slot" id=0 cache=0 prompt=31 used=0 remaining=31
[GIN] 2025/08/09 - 22:41:58 | 200 |  7.284707733s |       127.0.0.1 | POST     "/api/generate"
time=2025-08-09T22:41:58.513+02:00 level=DEBUG source=sched.go:501 msg="context for request finished"
time=2025-08-09T22:41:58.513+02:00 level=DEBUG source=sched.go:341 msg="runner with non-zero duration has gone idle, adding timer" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=213592 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096 duration=5m0s
time=2025-08-09T22:41:58.513+02:00 level=DEBUG source=sched.go:359 msg="after processing request finished event" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=213592 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096 refCount=0
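
For reference, the POST "/api/generate" entry above corresponds to a client request like the one below. This is a minimal sketch: the host, port, and model tag are taken from the log, but the prompt and options are hypothetical, since the actual request body is not shown in the debug output.

# Hypothetical reproduction of the /api/generate call seen in the log.
# Model tag matches runner.name (registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M);
# the prompt text is a placeholder, not the one actually sent.
curl http://127.0.0.1:11434/api/generate -d '{
  "model": "qwen2.5:3b-instruct-q4_K_M",
  "prompt": "Why is the sky blue?",
  "stream": false
}'

Per the [GIN] line, this request completed in about 7.3 s, consistent with CPU-only inference (the runner was started with --threads 2 and --ctx-size 4096 after GPU discovery failed).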