rkonan@rkonan-ThinkPad-T460:~$ OLLAMA_DEBUG=1 ollama serve
time=2025-08-09T22:41:31.741+02:00 level=INFO source=routes.go:1304 msg="server config" env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:4096 OLLAMA_DEBUG:DEBUG OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/home/rkonan/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_SCHED_SPREAD:false ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
time=2025-08-09T22:41:31.743+02:00 level=INFO source=images.go:477 msg="total blobs: 9"
time=2025-08-09T22:41:31.743+02:00 level=INFO source=images.go:484 msg="total unused blobs removed: 0"
time=2025-08-09T22:41:31.743+02:00 level=INFO source=routes.go:1357 msg="Listening on 127.0.0.1:11434 (version 0.11.4)"
time=2025-08-09T22:41:31.744+02:00 level=DEBUG source=sched.go:106 msg="starting llm scheduler"
time=2025-08-09T22:41:31.744+02:00 level=INFO source=gpu.go:217 msg="looking for compatible GPUs"
time=2025-08-09T22:41:31.745+02:00 level=DEBUG source=gpu.go:98 msg="searching for GPU discovery libraries for NVIDIA"
time=2025-08-09T22:41:31.745+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcuda.so*
time=2025-08-09T22:41:31.745+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcuda.so* /home/rkonan/libcuda.so* /usr/local/cuda*/targets/*/lib/libcuda.so* /usr/lib/*-linux-gnu/nvidia/current/libcuda.so* /usr/lib/*-linux-gnu/libcuda.so* /usr/lib/wsl/lib/libcuda.so* /usr/lib/wsl/drivers/*/libcuda.so* /opt/cuda/lib*/libcuda.so* /usr/local/cuda/lib*/libcuda.so* /usr/lib*/libcuda.so* /usr/local/lib*/libcuda.so*]"
time=2025-08-09T22:41:31.750+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[]
time=2025-08-09T22:41:31.750+02:00 level=DEBUG source=gpu.go:501 msg="Searching for GPU library" name=libcudart.so*
time=2025-08-09T22:41:31.750+02:00 level=DEBUG source=gpu.go:525 msg="gpu library search" globs="[/usr/local/lib/ollama/libcudart.so* /home/rkonan/libcudart.so* /usr/local/lib/ollama/cuda_v*/libcudart.so* /usr/local/cuda/lib64/libcudart.so* /usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/x86_64-linux-gnu/libcudart.so* /usr/lib/wsl/lib/libcudart.so* /usr/lib/wsl/drivers/*/libcudart.so* /opt/cuda/lib64/libcudart.so* /usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so* /usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so* /usr/lib/aarch64-linux-gnu/libcudart.so* /usr/local/cuda/lib*/libcudart.so* /usr/lib*/libcudart.so* /usr/local/lib*/libcudart.so*]"
time=2025-08-09T22:41:31.753+02:00 level=DEBUG source=gpu.go:558 msg="discovered GPU libraries" paths=[/usr/local/lib/ollama/libcudart.so.12.8.90]
cudaSetDevice err: 35
time=2025-08-09T22:41:31.754+02:00 level=DEBUG source=gpu.go:574 msg="Unable to load cudart library /usr/local/lib/ollama/libcudart.so.12.8.90: your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama"
time=2025-08-09T22:41:31.754+02:00 level=DEBUG source=amd_linux.go:419 msg="amdgpu driver not detected /sys/module/amdgpu"
time=2025-08-09T22:41:31.754+02:00 level=INFO source=gpu.go:377 msg="no compatible GPUs were discovered"
time=2025-08-09T22:41:31.754+02:00 level=INFO source=types.go:130 msg="inference compute" id=0 library=cpu variant="" compute="" driver=0.0 name="" total="15.5 GiB" available="11.6 GiB"
time=2025-08-09T22:41:31.754+02:00 level=INFO source=routes.go:1398 msg="entering low vram mode" "total vram"="15.5 GiB" threshold="20.0 GiB"
[GIN] 2025/08/09 - 22:41:51 | 200 | 96.9µs | 127.0.0.1 | HEAD "/"
time=2025-08-09T22:41:51.222+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=general.alignment default=32
[GIN] 2025/08/09 - 22:41:51 | 200 | 110.417215ms | 127.0.0.1 | POST "/api/show"
time=2025-08-09T22:41:51.296+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="11.6 GiB" before.free_swap="2.1 GiB" now.total="15.5 GiB" now.free="11.6 GiB" now.free_swap="2.1 GiB"
time=2025-08-09T22:41:51.296+02:00 level=DEBUG source=sched.go:183 msg="updating default concurrency" OLLAMA_MAX_LOADED_MODELS=3 gpu_count=1
time=2025-08-09T22:41:51.319+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=general.alignment default=32
time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=sched.go:213 msg="cpu mode with first model, loading"
time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=gpu.go:391 msg="updating system memory data" before.total="15.5 GiB" before.free="11.6 GiB" before.free_swap="2.1 GiB" now.total="15.5 GiB" now.free="11.6 GiB" now.free_swap="2.1 GiB"
time=2025-08-09T22:41:51.380+02:00 level=INFO source=server.go:135 msg="system memory" total="15.5 GiB" free="11.6 GiB" free_swap="2.1 GiB"
time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=memory.go:111 msg=evaluating library=cpu gpu_count=1 available="[11.6 GiB]"
time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.vision.block_count default=0
time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.key_length default=128
time=2025-08-09T22:41:51.380+02:00 level=DEBUG source=ggml.go:208 msg="key with type not found" key=qwen2.attention.value_length default=128
time=2025-08-09T22:41:51.381+02:00 level=INFO source=server.go:175 msg=offload library=cpu layers.requested=-1 layers.model=37 layers.offload=0 layers.split="" memory.available="[11.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="2.3 GiB" memory.required.partial="0 B" memory.required.kv="144.0 MiB" memory.required.allocations="[2.3 GiB]" memory.weights.total="1.8 GiB" memory.weights.repeating="1.6 GiB" memory.weights.nonrepeating="243.4 MiB" memory.graph.full="300.8 MiB" memory.graph.partial="544.2 MiB"
time=2025-08-09T22:41:51.381+02:00 level=DEBUG source=server.go:291 msg="compatible gpu libraries" compatible=[]
llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = qwen2
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = Qwen2.5 3B Instruct
llama_model_loader: - kv 3: general.finetune str = Instruct
llama_model_loader: - kv 4: general.basename str = Qwen2.5
llama_model_loader: - kv 5: general.size_label str = 3B
llama_model_loader: - kv 6: general.license str = other
llama_model_loader: - kv 7: general.license.name str = qwen-research
llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-3...
llama_model_loader: - kv 9: general.base_model.count u32 = 1
llama_model_loader: - kv 10: general.base_model.0.name str = Qwen2.5 3B
llama_model_loader: - kv 11: general.base_model.0.organization str = Qwen
llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-3B
llama_model_loader: - kv 13: general.tags arr[str,2] = ["chat", "text-generation"]
llama_model_loader: - kv 14: general.languages arr[str,1] = ["en"]
llama_model_loader: - kv 15: qwen2.block_count u32 = 36
llama_model_loader: - kv 16: qwen2.context_length u32 = 32768
llama_model_loader: - kv 17: qwen2.embedding_length u32 = 2048
llama_model_loader: - kv 18: qwen2.feed_forward_length u32 = 11008
llama_model_loader: - kv 19: qwen2.attention.head_count u32 = 16
llama_model_loader: - kv 20: qwen2.attention.head_count_kv u32 = 2
llama_model_loader: - kv 21: qwen2.rope.freq_base f32 = 1000000.000000
llama_model_loader: - kv 22: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
llama_model_loader: - kv 23: general.file_type u32 = 15
llama_model_loader: - kv 24: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 25: tokenizer.ggml.pre str = qwen2
llama_model_loader: - kv 26: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 28: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 151645
llama_model_loader: - kv 30: tokenizer.ggml.padding_token_id u32 = 151643
llama_model_loader: - kv 31: tokenizer.ggml.bos_token_id u32 = 151643
llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = false
llama_model_loader: - kv 33: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
llama_model_loader: - kv 34: general.quantization_version u32 = 2
llama_model_loader: - type f32: 181 tensors
llama_model_loader: - type q4_K: 216 tensors
llama_model_loader: - type q6_K: 37 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = Q4_K - Medium
print_info: file size = 1.79 GiB (4.99 BPW)
init_tokenizer: initializing tokenizer for type 2
load: control token: 151660 '<|fim_middle|>' is not marked as EOG
load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
load: control token: 151653 '<|vision_end|>' is not marked as EOG
load: control token: 151648 '<|box_start|>' is not marked as EOG
load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
load: control token: 151649 '<|box_end|>' is not marked as EOG
load: control token: 151655 '<|image_pad|>' is not marked as EOG
load: control token: 151651 '<|quad_end|>' is not marked as EOG
load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
load: control token: 151652 '<|vision_start|>' is not marked as EOG
load: control token: 151654 '<|vision_pad|>' is not marked as EOG
load: control token: 151656 '<|video_pad|>' is not marked as EOG
load: control token: 151644 '<|im_start|>' is not marked as EOG
load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
load: control token: 151650 '<|quad_start|>' is not marked as EOG
load: special tokens cache size = 22
load: token to piece cache size = 0.9310 MB
print_info: arch = qwen2
print_info: vocab_only = 1
print_info: model type = ?B
print_info: model params = 3.09 B
print_info: general.name = Qwen2.5 3B Instruct
print_info: vocab type = BPE
print_info: n_vocab = 151936
print_info: n_merges = 151387
print_info: BOS token = 151643 '<|endoftext|>'
print_info: EOS token = 151645 '<|im_end|>'
print_info: EOT token = 151645 '<|im_end|>'
print_info: PAD token = 151643 '<|endoftext|>'
print_info: LF token = 198 'Ċ'
print_info: FIM PRE token = 151659 '<|fim_prefix|>'
print_info: FIM SUF token = 151661 '<|fim_suffix|>'
print_info: FIM MID token = 151660 '<|fim_middle|>'
print_info: FIM PAD token = 151662 '<|fim_pad|>'
print_info: FIM REP token = 151663 '<|repo_name|>'
print_info: FIM SEP token = 151664 '<|file_sep|>'
print_info: EOG token = 151643 '<|endoftext|>'
print_info: EOG token = 151645 '<|im_end|>'
print_info: EOG token = 151662 '<|fim_pad|>'
print_info: EOG token = 151663 '<|repo_name|>'
print_info: EOG token = 151664 '<|file_sep|>'
print_info: max token length = 256
llama_model_load: vocab only - skipping tensors
time=2025-08-09T22:41:51.857+02:00 level=DEBUG source=gpu.go:695 msg="no filter required for library cpu"
time=2025-08-09T22:41:51.857+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="/usr/local/bin/ollama runner --model /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 --ctx-size 4096 --batch-size 512 --threads 2 --no-mmap --parallel 1 --port 42013"
time=2025-08-09T22:41:51.857+02:00 level=DEBUG source=server.go:439 msg=subprocess OLLAMA_DEBUG=1 PATH=/home/rkonan/miniconda3/bin:/home/rkonan/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/snap/bin:/home/rkonan/.local/bin:/home/rkonan/.local/bin OLLAMA_MAX_LOADED_MODELS=3 OLLAMA_LIBRARY_PATH=/usr/local/lib/ollama LD_LIBRARY_PATH=/usr/local/lib/ollama:/usr/local/lib/ollama
time=2025-08-09T22:41:51.857+02:00 level=INFO source=sched.go:481 msg="loaded runners" count=1
time=2025-08-09T22:41:51.858+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
time=2025-08-09T22:41:51.858+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server not responding"
time=2025-08-09T22:41:51.877+02:00 level=INFO source=runner.go:815 msg="starting go runner"
time=2025-08-09T22:41:51.878+02:00 level=DEBUG source=ggml.go:94 msg="ggml backend load all from path" path=/usr/local/lib/ollama
load_backend: loaded CPU backend from /usr/local/lib/ollama/libggml-cpu-haswell.so
time=2025-08-09T22:41:51.892+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 compiler=cgo(gcc)
time=2025-08-09T22:41:51.892+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:42013"
llama_model_loader: loaded meta data with 35 key-value pairs and 434 tensors from /home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = qwen2
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = Qwen2.5 3B Instruct
llama_model_loader: - kv 3: general.finetune str = Instruct
llama_model_loader: - kv 4: general.basename str = Qwen2.5
llama_model_loader: - kv 5: general.size_label str = 3B
llama_model_loader: - kv 6: general.license str = other
llama_model_loader: - kv 7: general.license.name str = qwen-research
llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen2.5-3...
llama_model_loader: - kv 9: general.base_model.count u32 = 1
llama_model_loader: - kv 10: general.base_model.0.name str = Qwen2.5 3B
llama_model_loader: - kv 11: general.base_model.0.organization str = Qwen
llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen2.5-3B
llama_model_loader: - kv 13: general.tags arr[str,2] = ["chat", "text-generation"]
llama_model_loader: - kv 14: general.languages arr[str,1] = ["en"]
llama_model_loader: - kv 15: qwen2.block_count u32 = 36
llama_model_loader: - kv 16: qwen2.context_length u32 = 32768
llama_model_loader: - kv 17: qwen2.embedding_length u32 = 2048
llama_model_loader: - kv 18: qwen2.feed_forward_length u32 = 11008
llama_model_loader: - kv 19: qwen2.attention.head_count u32 = 16
llama_model_loader: - kv 20: qwen2.attention.head_count_kv u32 = 2
llama_model_loader: - kv 21: qwen2.rope.freq_base f32 = 1000000.000000
llama_model_loader: - kv 22: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
llama_model_loader: - kv 23: general.file_type u32 = 15
llama_model_loader: - kv 24: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 25: tokenizer.ggml.pre str = qwen2
llama_model_loader: - kv 26: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 28: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 151645
llama_model_loader: - kv 30: tokenizer.ggml.padding_token_id u32 = 151643
llama_model_loader: - kv 31: tokenizer.ggml.bos_token_id u32 = 151643
llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = false
llama_model_loader: - kv 33: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
llama_model_loader: - kv 34: general.quantization_version u32 = 2
llama_model_loader: - type f32: 181 tensors
llama_model_loader: - type q4_K: 216 tensors
llama_model_loader: - type q6_K: 37 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = Q4_K - Medium
print_info: file size = 1.79 GiB (4.99 BPW)
time=2025-08-09T22:41:52.110+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
init_tokenizer: initializing tokenizer for type 2
load: control token: 151660 '<|fim_middle|>' is not marked as EOG
load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
load: control token: 151653 '<|vision_end|>' is not marked as EOG
load: control token: 151648 '<|box_start|>' is not marked as EOG
load: control token: 151646 '<|object_ref_start|>' is not marked as EOG
load: control token: 151649 '<|box_end|>' is not marked as EOG
load: control token: 151655 '<|image_pad|>' is not marked as EOG
load: control token: 151651 '<|quad_end|>' is not marked as EOG
load: control token: 151647 '<|object_ref_end|>' is not marked as EOG
load: control token: 151652 '<|vision_start|>' is not marked as EOG
load: control token: 151654 '<|vision_pad|>' is not marked as EOG
load: control token: 151656 '<|video_pad|>' is not marked as EOG
load: control token: 151644 '<|im_start|>' is not marked as EOG
load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
load: control token: 151650 '<|quad_start|>' is not marked as EOG
load: special tokens cache size = 22
load: token to piece cache size = 0.9310 MB
print_info: arch = qwen2
print_info: vocab_only = 0
print_info: n_ctx_train = 32768
print_info: n_embd = 2048
print_info: n_layer = 36
print_info: n_head = 16
print_info: n_head_kv = 2
print_info: n_rot = 128
print_info: n_swa = 0
print_info: n_swa_pattern = 1
print_info: n_embd_head_k = 128
print_info: n_embd_head_v = 128
print_info: n_gqa = 8
print_info: n_embd_k_gqa = 256
print_info: n_embd_v_gqa = 256
print_info: f_norm_eps = 0.0e+00
print_info: f_norm_rms_eps = 1.0e-06
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 0.0e+00
print_info: f_attn_scale = 0.0e+00
print_info: n_ff = 11008
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: causal attn = 1
print_info: pooling type = -1
print_info: rope type = 2
print_info: rope scaling = linear
print_info: freq_base_train = 1000000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 32768
print_info: rope_finetuned = unknown
print_info: ssm_d_conv = 0
print_info: ssm_d_inner = 0
print_info: ssm_d_state = 0
print_info: ssm_dt_rank = 0
print_info: ssm_dt_b_c_rms = 0
print_info: model type = 3B
print_info: model params = 3.09 B
print_info: general.name = Qwen2.5 3B Instruct
print_info: vocab type = BPE
print_info: n_vocab = 151936
print_info: n_merges = 151387
print_info: BOS token = 151643 '<|endoftext|>'
print_info: EOS token = 151645 '<|im_end|>'
print_info: EOT token = 151645 '<|im_end|>'
print_info: PAD token = 151643 '<|endoftext|>'
print_info: LF token = 198 'Ċ'
print_info: FIM PRE token = 151659 '<|fim_prefix|>'
print_info: FIM SUF token = 151661 '<|fim_suffix|>'
print_info: FIM MID token = 151660 '<|fim_middle|>'
print_info: FIM PAD token = 151662 '<|fim_pad|>'
print_info: FIM REP token = 151663 '<|repo_name|>'
print_info: FIM SEP token = 151664 '<|file_sep|>'
print_info: EOG token = 151643 '<|endoftext|>'
print_info: EOG token = 151645 '<|im_end|>'
print_info: EOG token = 151662 '<|fim_pad|>'
print_info: EOG token = 151663 '<|repo_name|>'
print_info: EOG token = 151664 '<|file_sep|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = false)
load_tensors: layer 0 assigned to device CPU, is_swa = 0
load_tensors: layer 1 assigned to device CPU, is_swa = 0
load_tensors: layer 2 assigned to device CPU, is_swa = 0
load_tensors: layer 3 assigned to device CPU, is_swa = 0
load_tensors: layer 4 assigned to device CPU, is_swa = 0
load_tensors: layer 5 assigned to device CPU, is_swa = 0
load_tensors: layer 6 assigned to device CPU, is_swa = 0
load_tensors: layer 7 assigned to device CPU, is_swa = 0
load_tensors: layer 8 assigned to device CPU, is_swa = 0
load_tensors: layer 9 assigned to device CPU, is_swa = 0
load_tensors: layer 10 assigned to device CPU, is_swa = 0
load_tensors: layer 11 assigned to device CPU, is_swa = 0
load_tensors: layer 12 assigned to device CPU, is_swa = 0
load_tensors: layer 13 assigned to device CPU, is_swa = 0
load_tensors: layer 14 assigned to device CPU, is_swa = 0
load_tensors: layer 15 assigned to device CPU, is_swa = 0
load_tensors: layer 16 assigned to device CPU, is_swa = 0
load_tensors: layer 17 assigned to device CPU, is_swa = 0
load_tensors: layer 18 assigned to device CPU, is_swa = 0
load_tensors: layer 19 assigned to device CPU, is_swa = 0
load_tensors: layer 20 assigned to device CPU, is_swa = 0
load_tensors: layer 21 assigned to device CPU, is_swa = 0
load_tensors: layer 22 assigned to device CPU, is_swa = 0
load_tensors: layer 23 assigned to device CPU, is_swa = 0
load_tensors: layer 24 assigned to device CPU, is_swa = 0
load_tensors: layer 25 assigned to device CPU, is_swa = 0
load_tensors: layer 26 assigned to device CPU, is_swa = 0
load_tensors: layer 27 assigned to device CPU, is_swa = 0
load_tensors: layer 28 assigned to device CPU, is_swa = 0
load_tensors: layer 29 assigned to device CPU, is_swa = 0
load_tensors: layer 30 assigned to device CPU, is_swa = 0
load_tensors: layer 31 assigned to device CPU, is_swa = 0
load_tensors: layer 32 assigned to device CPU, is_swa = 0
load_tensors: layer 33 assigned to device CPU, is_swa = 0
load_tensors: layer 34 assigned to device CPU, is_swa = 0
load_tensors: layer 35 assigned to device CPU, is_swa = 0
load_tensors: layer 36 assigned to device CPU, is_swa = 0
load_tensors: CPU model buffer size = 1834.82 MiB
load_all_data: no device found for buffer type CPU for async uploads
time=2025-08-09T22:41:52.865+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.28"
time=2025-08-09T22:41:53.116+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.50"
time=2025-08-09T22:41:53.366+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.72"
time=2025-08-09T22:41:53.617+02:00 level=DEBUG source=server.go:643 msg="model load progress 0.97"
llama_context: constructing llama_context
llama_context: n_seq_max = 1
llama_context: n_ctx = 4096
llama_context: n_ctx_per_seq = 4096
llama_context: n_batch = 512
llama_context: n_ubatch = 512
llama_context: causal_attn = 1
llama_context: flash_attn = 0
llama_context: freq_base = 1000000.0
llama_context: freq_scale = 1
llama_context: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
set_abort_callback: call
llama_context: CPU output buffer size = 0.59 MiB
create_memory: n_ctx = 4096 (padded)
llama_kv_cache_unified: kv_size = 4096, type_k = 'f16', type_v = 'f16', n_layer = 36, can_shift = 1, padding = 32
llama_kv_cache_unified: layer 0: dev = CPU
llama_kv_cache_unified: layer 1: dev = CPU
llama_kv_cache_unified: layer 2: dev = CPU
llama_kv_cache_unified: layer 3: dev = CPU
llama_kv_cache_unified: layer 4: dev = CPU
llama_kv_cache_unified: layer 5: dev = CPU
llama_kv_cache_unified: layer 6: dev = CPU
llama_kv_cache_unified: layer 7: dev = CPU
llama_kv_cache_unified: layer 8: dev = CPU
llama_kv_cache_unified: layer 9: dev = CPU
llama_kv_cache_unified: layer 10: dev = CPU
llama_kv_cache_unified: layer 11: dev = CPU
llama_kv_cache_unified: layer 12: dev = CPU
llama_kv_cache_unified: layer 13: dev = CPU
llama_kv_cache_unified: layer 14: dev = CPU
llama_kv_cache_unified: layer 15: dev = CPU
llama_kv_cache_unified: layer 16: dev = CPU
llama_kv_cache_unified: layer 17: dev = CPU
llama_kv_cache_unified: layer 18: dev = CPU
llama_kv_cache_unified: layer 19: dev = CPU
llama_kv_cache_unified: layer 20: dev = CPU
llama_kv_cache_unified: layer 21: dev = CPU
llama_kv_cache_unified: layer 22: dev = CPU
llama_kv_cache_unified: layer 23: dev = CPU
llama_kv_cache_unified: layer 24: dev = CPU
llama_kv_cache_unified: layer 25: dev = CPU
llama_kv_cache_unified: layer 26: dev = CPU
llama_kv_cache_unified: layer 27: dev = CPU
llama_kv_cache_unified: layer 28: dev = CPU
llama_kv_cache_unified: layer 29: dev = CPU
llama_kv_cache_unified: layer 30: dev = CPU
llama_kv_cache_unified: layer 31: dev = CPU
llama_kv_cache_unified: layer 32: dev = CPU
llama_kv_cache_unified: layer 33: dev = CPU
llama_kv_cache_unified: layer 34: dev = CPU
llama_kv_cache_unified: layer 35: dev = CPU
llama_kv_cache_unified: CPU KV buffer size = 144.00 MiB
llama_kv_cache_unified: KV self size = 144.00 MiB, K (f16): 72.00 MiB, V (f16): 72.00 MiB
llama_context: enumerating backends
llama_context: backend_ptrs.size() = 1
llama_context: max_nodes = 65536
llama_context: worst-case: n_tokens = 512, n_seqs = 1, n_outputs = 0
llama_context: reserving graph for n_tokens = 512, n_seqs = 1
llama_context: reserving graph for n_tokens = 1, n_seqs = 1
llama_context: reserving graph for n_tokens = 512, n_seqs = 1
llama_context: CPU compute buffer size = 300.75 MiB
llama_context: graph nodes = 1338
llama_context: graph splits = 1
time=2025-08-09T22:41:53.869+02:00 level=INFO source=server.go:637 msg="llama runner started in 2.01 seconds"
time=2025-08-09T22:41:53.869+02:00 level=DEBUG source=sched.go:493 msg="finished setting up" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=213592 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096
time=2025-08-09T22:41:53.870+02:00 level=DEBUG source=server.go:736 msg="completion request" images=0 prompt=155 format=""
time=2025-08-09T22:41:53.880+02:00 level=DEBUG source=cache.go:104 msg="loading cache slot" id=0 cache=0 prompt=31 used=0 remaining=31
[GIN] 2025/08/09 - 22:41:58 | 200 | 7.284707733s | 127.0.0.1 | POST "/api/generate"
time=2025-08-09T22:41:58.513+02:00 level=DEBUG source=sched.go:501 msg="context for request finished"
time=2025-08-09T22:41:58.513+02:00 level=DEBUG source=sched.go:341 msg="runner with non-zero duration has gone idle, adding timer" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=213592 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096 duration=5m0s
time=2025-08-09T22:41:58.513+02:00 level=DEBUG source=sched.go:359 msg="after processing request finished event" runner.name=registry.ollama.ai/library/qwen2.5:3b-instruct-q4_K_M runner.inference=cpu runner.devices=1 runner.size="2.3 GiB" runner.vram="0 B" runner.parallel=1 runner.pid=213592 runner.model=/home/rkonan/.ollama/models/blobs/sha256-5ee4f07cdb9beadbbb293e85803c569b01bd37ed059d2715faa7bb405f31caa6 runner.num_ctx=4096 refCount=0
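
# For reference, a request like the logged POST "/api/generate" can be sent to the
# address the server reports listening on (127.0.0.1:11434). This is a minimal sketch
# using Ollama's documented generate endpoint; the model tag matches the runner name
# in the log above, but the prompt is a placeholder, since the actual prompt behind
# the logged request is not shown.
curl http://127.0.0.1:11434/api/generate -d '{
  "model": "qwen2.5:3b-instruct-q4_K_M",
  "prompt": "Hello",
  "stream": false
}'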