; dotfiles/config/llama-server/presets.ini

; Version of this preset file. Global key: it sits above the first [section].
; NOTE(review): presumably checked by the llama-server preset loader — confirm
; the accepted values before changing it.
version = 1

; ─── Global ──────────────────────────────────────────────────────────────────
; Settings in [*] are inherited by every model loaded by the router.
; Per-model sections below override individual keys.
[*]

; Number of model layers to offload to GPU.
; 99 means "offload everything" — llama.cpp loads as many as fit and falls back
; to CPU automatically for any overflow. Using an explicit value avoids the
; occasional conservative auto-estimate.
; Currently commented out, so the default below is what actually applies;
; uncomment the key to force full offload.
; Default: auto
; n-gpu-layers = 99

; Flash Attention: reduces KV-cache VRAM usage and speeds up long-context
; inference by computing attention without materializing the full NxN matrix.
; "on" forces it; "auto" (default) enables it when CUDA is detected — same
; effect in practice, but explicit is clearer here.
; Default: auto
flash-attn = on

; Number of CPU threads used for non-GPU work: tokenization, sampling, and any
; layers that overflow VRAM during hybrid inference. ~2/3 of the logical core
; count is the rule of thumb used here; going higher causes contention on the
; same cores the GPU DMA uses. (Machine has 12 logical cores → 8 threads.)
; Default: -1 (use all cores)
threads = 8

; Number of inference slots (parallel sequences). 1 = single-user server with
; no batching overhead. Increase only if you need concurrent requests; each
; extra slot consumes a proportional share of KV-cache VRAM.
; Default: -1 (auto, usually 1)
parallel = 1

; Jinja2 chat templating — required for models with complex chat templates
; (e.g. Qwen3, which uses raise_exception() guards). Without this, llama.cpp
; falls back to a static PEG auto-parser that can't handle those templates.
jinja = on

; Token budget for chain-of-thought reasoning.
;   -1 = unrestricted (model decides when to stop thinking)
;    0 = disable thinking entirely
;   N  = hard cap at N tokens, then force the model to answer
; Commented out: matches the default (-1 = unrestricted).
; reasoning-budget = -1

; Fallback context window, in tokens, for any model whose own section does not
; set ctx-size. Several per-model sections below repeat or override this value.
; Default: 0 (read from model metadata)
ctx-size = 32768

; Fallback cap on generated tokens per response, for any model whose own
; section does not set n-predict. Prevents runaway responses.
; Default: -1 (unlimited)
n-predict = 4096

; ─── Qwen3-14B ───────────────────────────────────────────────────────────────
; ~8.5 GB GGUF — fits fully in 12 GB VRAM. Fast (~12–18 tok/s). Good daily
; driver for interactive coding and Q&A.
[Qwen_Qwen3-14B-Q4_K_M]

; Full 32 K context (same value as [*]; repeated so this section documents
; itself). Worst-case 16-bit KV cache size:
;   32 K tokens × 1024 KV values per layer × 2 bytes × 2 (K+V) × 40 layers
;   ≈ 5 GiB
; (1024 assumes 8 KV heads × 128 head dim — verify against the model metadata.)
; NOTE(review): ~8.5 GB of weights plus ~5 GiB of KV cache is more than 12 GB,
; so the cache may not stay fully on-GPU at 32 K — confirm actual VRAM use
; before relying on "fits with headroom".
; Default: 0 (read from model metadata, typically the training context limit)
ctx-size = 32768

; Cap generation at 4096 tokens (same value as [*]). Prevents runaway
; responses; raise if you need longer output (documentation, large refactors).
; Default: -1 (unlimited)
n-predict = 4096


; ─── OmniCoder-2-9B ──────────────────────────────────────────────────────────
; ~9.4 GB GGUF — fits fully in 12 GB VRAM. Fast generation. Vision-capable
; (multimodal projector at OmniCoder-2-9B.Q8_0/mmproj-Q8_0.gguf — auto-detected
; from subdirectory layout by the router).
[OmniCoder-2-9B.Q8_0]

; Full 32 K context fits comfortably alongside 9B weights. Same value as the
; [*] fallback; stated explicitly so the section is self-contained.
; Default: 0 (read from model metadata)
ctx-size = 32768

; Cap generation at 4096 tokens (same value as [*]).
; Default: -1 (unlimited)
n-predict = 4096


; ─── Qwen3.6-35B-A3B (MoE + MTP) ────────────────────────────────────────────
; 13.6 GB GGUF — ~12 GB on GPU, ~1.6 GB CPU offload on a 12 GB card.
; MoE model: only ~3B parameters active per forward pass despite 35B total.
; MTP (multi-token prediction) heads baked in — uses draft-mtp speculative
; decoding to roughly double throughput vs non-speculative. Requires b9279+.
[Qwen3.6-35B-A3B-IQ3_S-3.06bpw]

; KV cache is small (~31 MiB/1K tokens) due to GQA — 32K context only needs
; ~1 GB KV cache (32 × 31 MiB), which pages to CPU gracefully without major
; throughput loss. Same value as the [*] fallback.
ctx-size = 32768

; Cap generation at 4096 tokens (same value as [*]).
; Default: -1 (unlimited)
n-predict = 4096

; Multi-token prediction speculative decoding.
; spec-type = draft-mtp uses MTP heads built into the model weights.
spec-type = draft-mtp

; Minimum acceptance probability for a speculated draft token (0–1).
; 0.75 = accept tokens the model is 75%+ confident in. Lower = more aggressive
; speculation (faster but slightly more divergence risk).
; NOTE(review): speculative decoding normally verifies drafts against the
; target model, in which case a lower threshold costs wasted draft work rather
; than output divergence — confirm how draft-mtp behaves.
spec-draft-p-min = 0.75

; Max tokens to speculate per step. 3 is the sweet spot for Qwen3.6 MTP.
spec-draft-n-max = 3


; ─── Qwen3.6-27B ─────────────────────────────────────────────────────────────
; 17 GB GGUF — ~12 GB on GPU, ~5 GB CPU offload on a 12 GB card.
; Slower (~4–8 tok/s) due to CPU↔GPU transfers; best for deep analysis tasks.
[Qwen_Qwen3.6-27B-Q4_K_M]

; Overrides the 32768 inherited from [*]. Smaller context than 14B to keep the
; KV cache on-GPU. At 16 K the KV cache is roughly half the size, which reduces
; how much spills to CPU on each forward pass — meaningful when every byte of
; VRAM is already spoken for.
; Default: 0 (read from model metadata)
ctx-size = 16384

; Cap generation at 4096 tokens (same value as [*]).
; Default: -1 (unlimited)
n-predict = 4096

; ─── Qwopus3.6-27B-v2 (MTP) ──────────────────────────────────────────────────
; 27B preset with draft-mtp speculative decoding enabled.
[Qwopus3.6-27B-v2-MTP-Q4_K_M]

; Context window in tokens (same value as the [*] fallback).
; NOTE(review): the Qwen_Qwen3.6-27B-Q4_K_M preset drops to 16384 to keep the
; KV cache on-GPU; this 27B Q4_K_M preset stays at 32768 — confirm that is
; intentional.
ctx-size = 32768
; Cap generation at 4096 tokens (same value as [*]).
n-predict = 4096
; Speculative decoding using MTP heads (presumably shipped in this GGUF, per
; the "MTP" in its name — verify).
spec-type = draft-mtp
; Minimum probability threshold for a speculated draft token (0–1).
spec-draft-p-min = 0.75
; Max tokens to speculate per step.
spec-draft-n-max = 3

; ─── Qwopus3.6-35B-A3B-v1 (MTP) ──────────────────────────────────────────────
; 35B-A3B preset with draft-mtp speculative decoding enabled.
[Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M]

; Context window in tokens (same value as the [*] fallback).
ctx-size = 32768
; Cap generation at 4096 tokens (same value as [*]).
n-predict = 4096
; Speculative decoding using MTP heads (presumably shipped in this GGUF, per
; the "MTP" in its name — verify).
spec-type = draft-mtp
; Minimum probability threshold for a speculated draft token (0–1).
spec-draft-p-min = 0.75
; Max tokens to speculate per step.
spec-draft-n-max = 3

; ─── Qwopus3.5-9B-Coder (MTP) ────────────────────────────────────────────────
; 9B coder preset with draft-mtp speculative decoding enabled.
[Qwopus3.5-9B-Coder-MTP-Q8_0]

; Overrides the 32768 inherited from [*]: this is the only preset in the file
; that runs a 64 K context window.
ctx-size = 65536
; Cap generation at 4096 tokens (same value as [*]).
n-predict = 4096
; Speculative decoding using MTP heads (presumably shipped in this GGUF, per
; the "MTP" in its name — verify).
spec-type = draft-mtp
; Minimum probability threshold for a speculated draft token (0–1).
spec-draft-p-min = 0.75
; Max tokens to speculate per step.
spec-draft-n-max = 3

; ─── DeepCoder-14B-Preview ───────────────────────────────────────────────────
; No speculative-decoding keys are set for this model; both values below match
; the [*] fallbacks and are repeated so the section is self-describing.
[agentica-org_DeepCoder-14B-Preview-Q5_K_M]

; Context window in tokens.
ctx-size = 32768
; Cap generation at 4096 tokens.
n-predict = 4096