version = 1

; ─── Global ──────────────────────────────────────────────────────────────────
; Settings in [*] are inherited by every model loaded by the router.
; Per-model sections below override individual keys.
[*]

; Number of model layers to offload to GPU.
; 99 means "offload everything" — llama.cpp loads as many as fit and falls back
; to CPU automatically for any overflow. Using an explicit value avoids the
; occasional conservative auto-estimate.
; Default: auto
; Currently commented out, so the default (auto) is in effect; uncomment the
; line below to request full offload explicitly.
; n-gpu-layers = 99

; Flash Attention: reduces KV-cache VRAM usage and speeds up long-context
; inference by computing attention without materializing the full NxN matrix.
; "on" forces it; "auto" (default) enables it when CUDA is detected — same
; effect in practice, but explicit is clearer here.
; Default: auto
flash-attn = on

; Number of CPU threads used for non-GPU work: tokenization, sampling, and any
; layers that overflow VRAM during hybrid inference. ~2/3 of the logical cores
; is the rule of thumb applied here; going higher causes contention on the same
; cores the GPU DMA uses. (Machine has 12 logical cores → 8 threads.)
; Default: -1 (use all cores)
threads = 8

; Number of inference slots (parallel sequences). 1 = single-user server with
; no batching overhead. Increase only if you need concurrent requests; each
; extra slot consumes a proportional share of KV-cache VRAM.
; Default: -1 (auto, usually 1)
parallel = 1

; Jinja2 chat templating — required for models with complex chat templates
; (e.g. Qwen3, which uses raise_exception() guards). Without this, llama.cpp
; falls back to a static PEG auto-parser that can't handle those templates.
jinja = on

; Token budget for chain-of-thought reasoning.
;   -1 = unrestricted (model decides when to stop thinking)
;    0 = disable thinking entirely
;    N = hard cap at N tokens, then force the model to answer
; Commented out: matches the default (-1 = unrestricted).
; reasoning-budget = -1

; Fallback context window (tokens) and generation cap (tokens) for any model
; whose own section does not set them. Every section below currently sets both
; keys explicitly, so these only matter for models added without overrides.
; The "llama.cpp default" noted in the per-model comments is the built-in value
; that would apply if neither the section nor [*] set the key.
ctx-size = 32768
n-predict = 4096

; ─── Qwen3-14B ───────────────────────────────────────────────────────────────
; ~8.5 GB GGUF — fits fully in 12 GB VRAM. Fast (~12–18 tok/s). Good daily
; driver for interactive coding and Q&A.
[Qwen_Qwen3-14B-Q4_K_M]

; Full 32 K context. Worst-case f16 KV cache:
;   32 K tokens × 1024 KV values per layer (8 KV heads × 128 head dim)
;   × 2 bytes × 2 (K+V) × 40 layers ≈ 5 GiB.
; NOTE(review): together with ~8.5 GB of weights that is more than 12 GB, so a
; full 32 K cache may not stay entirely in VRAM — verify against real usage.
; llama.cpp default: 0 (read from model metadata, typically the training
; context limit)
ctx-size = 32768

; Cap generation at 4096 tokens. Prevents runaway responses; raise if you need
; longer output (documentation, large refactors).
; llama.cpp default: -1 (unlimited)
n-predict = 4096

; ─── OmniCoder-2-9B ──────────────────────────────────────────────────────────
; ~9.4 GB GGUF — fits fully in 12 GB VRAM. Fast generation. Vision-capable
; (multimodal projector at OmniCoder-2-9B.Q8_0/mmproj-Q8_0.gguf — auto-detected
; from subdirectory layout by the router).
[OmniCoder-2-9B.Q8_0]

; Full 32 K context fits comfortably alongside 9B weights.
; llama.cpp default: 0 (read from model metadata)
ctx-size = 32768

; Cap generation at 4096 tokens.
; llama.cpp default: -1 (unlimited)
n-predict = 4096

; ─── Qwen3.6-35B-A3B (MoE + MTP) ─────────────────────────────────────────────
; 13.6 GB GGUF — ~12 GB on GPU, ~1.6 GB CPU offload on a 12 GB card.
; MoE model: only ~3B parameters active per forward pass despite 35B total.
; MTP (multi-token prediction) heads baked in — uses draft-mtp speculative
; decoding to roughly double throughput vs non-speculative. Requires b9279+.
[Qwen3.6-35B-A3B-IQ3_S-3.06bpw]

; KV cache is small (~31 MiB/1K tokens) due to GQA — 32K context only needs
; ~1 GB KV cache, which pages to CPU gracefully without major throughput loss.
ctx-size = 32768

; Cap generation at 4096 tokens.
; llama.cpp default: -1 (unlimited)
n-predict = 4096

; Multi-token prediction speculative decoding.
; The draft-mtp type uses the MTP heads built into the model weights.
spec-type = draft-mtp

; Minimum acceptance probability for a speculated draft token (0–1).
; 0.75 = accept tokens the model is 75%+ confident in. Lower = more aggressive
; speculation (faster but slightly more divergence risk).
spec-draft-p-min = 0.75

; Max tokens to speculate per step. 3 is the sweet spot for Qwen3.6 MTP.
spec-draft-n-max = 3

; ─── Qwen3.6-27B ─────────────────────────────────────────────────────────────
; 17 GB GGUF — ~12 GB on GPU, ~5 GB CPU offload on a 12 GB card.
; Slower (~4–8 tok/s) due to CPU↔GPU transfers; best for deep analysis tasks.
[Qwen_Qwen3.6-27B-Q4_K_M]

; Smaller context than 14B to keep the KV cache on-GPU. At 16 K the KV cache
; is roughly half the size, which reduces how much spills to CPU on each
; forward pass — meaningful when every byte of VRAM is already spoken for.
; llama.cpp default: 0 (read from model metadata)
ctx-size = 16384

; Cap generation at 4096 tokens.
; llama.cpp default: -1 (unlimited)
n-predict = 4096

; ─── Qwopus3.6-27B-v2 (MTP) ──────────────────────────────────────────────────
; No model-specific notes recorded. 32 K context, 4096-token generation cap,
; and the same draft-mtp speculative settings as Qwen3.6-35B-A3B above.
[Qwopus3.6-27B-v2-MTP-Q4_K_M]
ctx-size = 32768
n-predict = 4096
spec-type = draft-mtp
spec-draft-p-min = 0.75
spec-draft-n-max = 3

; ─── Qwopus3.6-35B-A3B-v1 (MTP) ──────────────────────────────────────────────
; No model-specific notes recorded. 32 K context, 4096-token generation cap,
; and the same draft-mtp speculative settings as Qwen3.6-35B-A3B above.
[Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M]
ctx-size = 32768
n-predict = 4096
spec-type = draft-mtp
spec-draft-p-min = 0.75
spec-draft-n-max = 3

; ─── Qwopus3.5-9B-Coder (MTP) ────────────────────────────────────────────────
; No model-specific notes recorded. 64 K context — the largest window in this
; file — with the same generation cap and draft-mtp settings as the other MTP
; models.
[Qwopus3.5-9B-Coder-MTP-Q8_0]
ctx-size = 65536
n-predict = 4096
spec-type = draft-mtp
spec-draft-p-min = 0.75
spec-draft-n-max = 3

; ─── DeepCoder-14B-Preview ───────────────────────────────────────────────────
; No model-specific notes recorded. Values match the [*] fallbacks and are
; listed explicitly; no speculative decoding is configured.
[agentica-org_DeepCoder-14B-Preview-Q5_K_M]
ctx-size = 32768
n-predict = 4096