147 lines
5.8 KiB
INI
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

; Preset file format version.
; NOTE(review): presumably checked by the router when parsing this file — confirm.
version = 1
; ─── Global ──────────────────────────────────────────────────────────────────
; Settings in [*] are inherited by every model loaded by the router.
; Per-model sections below override individual keys.
[*]
; Number of model layers to offload to GPU.
; 99 means "offload everything" — llama.cpp loads as many as fit and falls back
; to CPU automatically for any overflow. Using an explicit value avoids the
; occasional conservative auto-estimate.
; Default: auto
; Currently commented out, so the auto default applies; uncomment to force 99.
; n-gpu-layers = 99
; Flash Attention: reduces KV-cache VRAM usage and speeds up long-context
; inference by computing attention without materializing the full NxN matrix.
; "on" forces it; "auto" (default) enables it when CUDA is detected — same
; effect in practice, but explicit is clearer here.
; Default: auto
flash-attn = on
; Number of CPU threads used for non-GPU work: tokenization, sampling, and any
; layers that overflow VRAM during hybrid inference. ~2/3 of physical cores is
; the rule of thumb; going higher causes contention on the same cores the GPU
; DMA uses. (Machine has 12 logical cores → 8 threads.)
; NOTE(review): 8 is 2/3 of the 12 *logical* cores, while the rule of thumb is
; stated in *physical* cores — confirm which count was intended.
; Default: -1 (use all cores)
threads = 8
; Number of inference slots (parallel sequences). 1 = single-user server with
; no batching overhead. Increase only if you need concurrent requests; each
; extra slot consumes a proportional share of KV-cache VRAM.
; Default: -1 (auto, usually 1)
parallel = 1
; Jinja2 chat templating — required for models with complex chat templates
; (e.g. Qwen3, which uses raise_exception() guards). Without this, llama.cpp
; falls back to a static PEG auto-parser that can't handle those templates.
jinja = on
; Token budget for chain-of-thought reasoning.
; -1 = unrestricted (model decides when to stop thinking)
; 0 = disable thinking entirely
; N = hard cap at N tokens, then force the model to answer
; Commented out: matches the default (-1 = unrestricted).
; reasoning-budget = -1
; Fallback context window, in tokens, for any model section that does not set
; its own ctx-size. Per-model sections below may raise or lower it.
ctx-size = 32768
; Fallback cap on generated tokens per response for any model section that
; does not set its own n-predict.
n-predict = 4096
; ─── Qwen3-14B ───────────────────────────────────────────────────────────────
; ~8.5 GB GGUF — fits fully in 12 GB VRAM. Fast (~12-18 tok/s). Good daily
; driver for interactive coding and Q&A.
[Qwen_Qwen3-14B-Q4_K_M]
; Full 32 K context is safe: 14B fits in VRAM with plenty of headroom for the
; KV cache. Worst-case KV estimate: 32 K tokens × per-layer KV width × 2 bytes
; × 2 (K+V) × 40 layers ≈ 5 GB.
; NOTE(review): the 5 GB figure works out if the per-layer KV width is 1024
; values per token — confirm against the model metadata.
; NOTE(review): ~8.5 GB weights + ~5 GB worst-case KV exceeds 12 GB, so a
; completely filled 32 K context may not stay fully on-GPU — confirm the
; "plenty of headroom" claim.
; Default: 0 (read from model metadata, typically the training context limit)
ctx-size = 32768
; Cap generation at 4096 tokens. Prevents runaway responses; raise if you need
; longer output (documentation, large refactors). Default: -1 (unlimited)
n-predict = 4096
; ─── OmniCoder-2-9B ──────────────────────────────────────────────────────────
; ~9.4 GB GGUF — fits fully in 12 GB VRAM. Fast generation. Vision-capable
; (multimodal projector at OmniCoder-2-9B.Q8_0/mmproj-Q8_0.gguf — auto-detected
; from subdirectory layout by the router).
[OmniCoder-2-9B.Q8_0]
; Context window in tokens. Full 32 K context fits comfortably alongside 9B
; weights. Same value as the [*] fallback; kept explicit so this section is
; self-describing.
; Default: 0 (read from model metadata)
ctx-size = 32768
; Cap generation at 4096 tokens per response (same as the [*] fallback).
; Default: -1 (unlimited)
n-predict = 4096
; ─── Qwen3.6-35B-A3B (MoE + MTP) ────────────────────────────────────────────
; 13.6 GB GGUF — ~12 GB on GPU, ~1.6 GB CPU offload on a 12 GB card.
; MoE model: only ~3B parameters active per forward pass despite 35B total.
; MTP (multi-token prediction) heads baked in — uses draft-mtp speculative
; decoding to roughly double throughput vs non-speculative. Requires b9279+.
[Qwen3.6-35B-A3B-IQ3_S-3.06bpw]
; KV cache is small (~31 MiB/1K tokens) due to GQA — 32K context only needs
; ~1 GB KV cache, which pages to CPU gracefully without major throughput loss.
ctx-size = 32768
; Cap generation at 4096 tokens. Default: -1 (unlimited)
n-predict = 4096
; Multi-token prediction speculative decoding.
; spec-type = draft-mtp uses MTP heads built into the model weights.
spec-type = draft-mtp
; Minimum acceptance probability for a speculated draft token (0-1).
; 0.75 = accept tokens the model is 75%+ confident in. Lower = more aggressive
; speculation (faster but slightly more divergence risk).
spec-draft-p-min = 0.75
; Max tokens to speculate per step. 3 is the sweet spot for Qwen3.6 MTP.
spec-draft-n-max = 3
; ─── Qwen3.6-27B ─────────────────────────────────────────────────────────────
; 17 GB GGUF — ~12 GB on GPU, ~5 GB CPU offload on a 12 GB card.
; Slower (~4-8 tok/s) due to CPU↔GPU transfers; best for deep analysis tasks.
[Qwen_Qwen3.6-27B-Q4_K_M]
; Smaller context than 14B to keep the KV cache on-GPU. At 16 K the KV cache
; is roughly half the size, which reduces how much spills to CPU on each
; forward pass — meaningful when every byte of VRAM is already spoken for.
; This overrides the 32768 fallback set in [*].
; Default: 0 (read from model metadata)
ctx-size = 16384
; Cap generation at 4096 tokens. Default: -1 (unlimited)
n-predict = 4096
; ─── Qwopus3.6-27B-v2 (MTP) ──────────────────────────────────────────────────
[Qwopus3.6-27B-v2-MTP-Q4_K_M]
; Context window in tokens (same as the [*] fallback).
; NOTE(review): the other 27B Q4_K_M section in this file uses 16384 to limit
; KV-cache spill to CPU — confirm 32768 is intended for this model.
ctx-size = 32768
; Cap generation at 4096 tokens per response.
n-predict = 4096
; Speculative decoding using the MTP (multi-token prediction) draft type.
spec-type = draft-mtp
; Minimum acceptance probability for a speculated draft token (0-1).
spec-draft-p-min = 0.75
; Max tokens to speculate per step.
spec-draft-n-max = 3
; ─── Qwopus3.6-35B-A3B-v1 (MTP) ──────────────────────────────────────────────
[Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M]
; Context window in tokens (same as the [*] fallback).
ctx-size = 32768
; Cap generation at 4096 tokens per response.
n-predict = 4096
; Speculative decoding using the MTP (multi-token prediction) draft type.
spec-type = draft-mtp
; Minimum acceptance probability for a speculated draft token (0-1).
spec-draft-p-min = 0.75
; Max tokens to speculate per step.
spec-draft-n-max = 3
; ─── Qwopus3.5-9B-Coder (MTP) ────────────────────────────────────────────────
[Qwopus3.5-9B-Coder-MTP-Q8_0]
; 64 K context window in tokens — double the 32768 fallback set in [*].
; NOTE(review): no VRAM estimate is recorded for this model — confirm the KV
; cache at 64 K fits alongside the weights.
ctx-size = 65536
; Cap generation at 4096 tokens per response.
n-predict = 4096
; Speculative decoding using the MTP (multi-token prediction) draft type.
spec-type = draft-mtp
; Minimum acceptance probability for a speculated draft token (0-1).
spec-draft-p-min = 0.75
; Max tokens to speculate per step.
spec-draft-n-max = 3
; ─── DeepCoder-14B-Preview ───────────────────────────────────────────────────
; No speculative-decoding keys are set for this model.
[agentica-org_DeepCoder-14B-Preview-Q5_K_M]
; Context window in tokens (same as the [*] fallback).
ctx-size = 32768
; Cap generation at 4096 tokens per response (same as the [*] fallback).
n-predict = 4096