; llama-server router model presets: global defaults in [*], then per-model overrides.
; Preset file format version (presumably the schema revision the router
; expects — confirm against the llama-server documentation).
version = 1

; ─── Global ──────────────────────────────────────────────────────────────────
; Settings in [*] are inherited by every model loaded by the router.
; Per-model sections below override individual keys.
[*]

; Number of model layers to offload to GPU.
; 99 means "offload everything" — llama.cpp loads as many as fit and falls back
; to CPU automatically for any overflow. Using an explicit value avoids the
; occasional conservative auto-estimate.
; Default: auto
; Commented out, so the auto default is in effect; uncomment to force 99.
; n-gpu-layers = 99

; Flash Attention: reduces KV-cache VRAM usage and speeds up long-context
; inference by computing attention without materializing the full NxN matrix.
; "on" forces it; "auto" (default) enables it when CUDA is detected — same
; effect in practice, but explicit is clearer here.
; Default: auto
flash-attn = on

; Number of CPU threads used for non-GPU work: tokenization, sampling, and any
; layers that overflow VRAM during hybrid inference. ~2/3 of the logical core
; count is the rule of thumb applied here; going higher causes contention on
; the same cores the GPU DMA uses. (Machine has 12 logical cores → 8 threads.)
; Default: -1 (use all cores)
threads = 8

; Number of inference slots (parallel sequences). 1 = single-user server with
; no batching overhead. Increase only if you need concurrent requests; each
; extra slot consumes a proportional share of KV-cache VRAM.
; Default: -1 (auto, usually 1)
parallel = 1

; Jinja2 chat templating — required for models with complex chat templates
; (e.g. Qwen3, which uses raise_exception() guards). Without this, llama.cpp
; falls back to a static PEG auto-parser that can't handle those templates.
jinja = on

; Token budget for chain-of-thought reasoning.
;   -1 = unrestricted (model decides when to stop thinking)
;    0 = disable thinking entirely
;    N = hard cap at N tokens, then force the model to answer
; Commented out: matches the default (-1 = unrestricted).
; reasoning-budget = -1

; Fallback context window and generation cap (both in tokens) inherited by
; every model. Per-model sections may override either key.
ctx-size = 32768
n-predict = 4096

; ─── Qwen3-14B ───────────────────────────────────────────────────────────────
; ~8.5 GB GGUF — fits fully in 12 GB VRAM. Fast (~12–18 tok/s). Good daily
; driver for interactive coding and Q&A.
[Qwen_Qwen3-14B-Q4_K_M]

; Full 32 K context is safe: 14B fits in VRAM with plenty of headroom for the
; KV cache. Worst-case KV size:
;   32 K tokens × 1024 KV values per token per layer × 2 bytes × 2 (K+V)
;   × 40 layers ≈ 5 GiB.
; (1024 is the per-layer width the ≈5 GiB total implies — presumably 8 KV heads
; × 128 head dim; confirm against the model metadata.)
; NOTE(review): ~8.5 GB of weights plus ~5 GB of KV exceeds 12 GB, so a
; completely full context would not stay entirely in VRAM — verify the
; headroom claim.
; Default: 0 (read from model metadata, typically the training context limit)
ctx-size = 32768

; Cap generation at 4096 tokens. Prevents runaway responses; raise if you need
; longer output (documentation, large refactors). Default: -1 (unlimited)
n-predict = 4096


; ─── OmniCoder-2-9B ──────────────────────────────────────────────────────────
; ~9.4 GB GGUF — fits fully in 12 GB VRAM. Fast generation. Vision-capable:
; the multimodal projector lives at OmniCoder-2-9B.Q8_0/mmproj-Q8_0.gguf and is
; auto-detected by the router from the subdirectory layout.
[OmniCoder-2-9B.Q8_0]

; Full 32 K context fits comfortably alongside 9B weights.
; Default: 0 (read from model metadata)
ctx-size = 32768

; Cap generation at 4096 tokens. Default: -1 (unlimited)
n-predict = 4096


; ─── Qwen3.6-35B-A3B (MoE + MTP) ────────────────────────────────────────────
; 13.6 GB GGUF — ~12 GB on GPU, ~1.6 GB CPU offload on a 12 GB card.
; MoE model: only ~3B parameters active per forward pass despite 35B total.
; MTP (multi-token prediction) heads baked in — uses draft-mtp speculative
; decoding to roughly double throughput vs non-speculative. Requires b9279+.
[Qwen3.6-35B-A3B-IQ3_S-3.06bpw]

; KV cache is small (~31 MiB/1K tokens) due to GQA — 32K context only needs
; ~1 GB KV cache, which pages to CPU gracefully without major throughput loss.
ctx-size = 32768

; Cap generation at 4096 tokens. Default: -1 (unlimited)
n-predict = 4096

; Multi-token prediction speculative decoding.
; spec-type = draft-mtp uses MTP heads built into the model weights.
spec-type = draft-mtp

; Minimum acceptance probability for a speculated draft token (0–1).
; 0.75 = accept tokens the model is 75%+ confident in. Lower = more aggressive
; speculation (faster but slightly more divergence risk).
spec-draft-p-min = 0.75

; Max tokens to speculate per step. 3 is the sweet spot for Qwen3.6 MTP.
spec-draft-n-max = 3


; ─── Qwen3.6-27B ─────────────────────────────────────────────────────────────
; 17 GB GGUF — ~12 GB on GPU, ~5 GB CPU offload on a 12 GB card.
; Slower (~4–8 tok/s) due to CPU↔GPU transfers; best for deep analysis tasks.
[Qwen_Qwen3.6-27B-Q4_K_M]

; Smaller context than 14B to keep the KV cache on-GPU. At 16 K the KV cache
; is roughly half the size, which reduces how much spills to CPU on each
; forward pass — meaningful when every byte of VRAM is already spoken for.
; This overrides the 32768-token context inherited from [*].
; Default: 0 (read from model metadata)
ctx-size = 16384

; Cap generation at 4096 tokens. Default: -1 (unlimited)
n-predict = 4096

[Qwopus3.6-27B-v2-MTP-Q4_K_M]

; Context window and generation cap, both in tokens.
ctx-size = 32768
n-predict = 4096

; Speculative decoding via draft-mtp: drafts need at least 0.75 probability,
; with at most 3 tokens speculated per step.
spec-type = draft-mtp
spec-draft-p-min = 0.75
spec-draft-n-max = 3

[Qwopus3.6-35B-A3B-v1-MTP-Q4_K_M]

; Context window and generation cap, both in tokens.
ctx-size = 32768
n-predict = 4096

; Speculative decoding via draft-mtp: drafts need at least 0.75 probability,
; with at most 3 tokens speculated per step.
spec-type = draft-mtp
spec-draft-p-min = 0.75
spec-draft-n-max = 3

[Qwopus3.5-9B-Coder-MTP-Q8_0]

; 65536-token context: overrides the 32768 inherited from [*].
ctx-size = 65536

; Cap generation at 4096 tokens.
n-predict = 4096

; Speculative decoding via draft-mtp: drafts need at least 0.75 probability,
; with at most 3 tokens speculated per step.
spec-type = draft-mtp
spec-draft-p-min = 0.75
spec-draft-n-max = 3

[agentica-org_DeepCoder-14B-Preview-Q5_K_M]

; Context window and generation cap, both in tokens.
ctx-size = 32768
n-predict = 4096