mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 10:15:27 -04:00
fix(hwfit): serve profiles for sub-8192 context models
Allow serve-profile generation for models whose trained context window is below 8192 while preserving the 8K shrink floor for larger models.
This commit is contained in:
@@ -188,12 +188,18 @@ def compute_serve_profiles(system, model, serve_weights_gb=None, serve_quant=Non
|
|||||||
# Shrink context if even the chosen KV won't fit alongside weights.
|
# Shrink context if even the chosen KV won't fit alongside weights.
|
||||||
# Start from the smaller of the profile's target and the model's limit.
|
# Start from the smaller of the profile's target and the model's limit.
|
||||||
cur_ctx = min(ctx, model_ctx_max)
|
cur_ctx = min(ctx, model_ctx_max)
|
||||||
while cur_ctx >= 8192:
|
# Floor the context-shrink loop at 8192, but never above the model's own
|
||||||
|
# trained limit. A model with a sub-8192 context (e.g. a 2048-token
|
||||||
|
# SmolLM) starts below 8192, so a hard-coded 8192 guard skipped the loop
|
||||||
|
# entirely and produced NO profile — the serve UI then fell back to
|
||||||
|
# manual flags even though the model fits the GPU trivially.
|
||||||
|
ctx_floor = min(8192, model_ctx_max)
|
||||||
|
while cur_ctx >= ctx_floor:
|
||||||
kv = _kv_gb(model, cur_ctx, kv_type)
|
kv = _kv_gb(model, cur_ctx, kv_type)
|
||||||
n_cpu_moe, fits = _cpu_moe_for_budget(model, quant, kv, budget, fixed_gb=serve_weights_gb)
|
n_cpu_moe, fits = _cpu_moe_for_budget(model, quant, kv, budget, fixed_gb=serve_weights_gb)
|
||||||
est = _weights_gb(model, quant, serve_weights_gb) + kv + 0.6
|
est = _weights_gb(model, quant, serve_weights_gb) + kv + 0.6
|
||||||
# If a non-MoE model can't fit even fully offloaded, try less context.
|
# If a non-MoE model can't fit even fully offloaded, try less context.
|
||||||
if model.get("is_moe") or fits or cur_ctx <= 8192:
|
if model.get("is_moe") or fits or cur_ctx <= ctx_floor:
|
||||||
profiles.append({
|
profiles.append({
|
||||||
"key": key,
|
"key": key,
|
||||||
"label": label,
|
"label": label,
|
||||||
|
|||||||
@@ -81,6 +81,18 @@ def test_context_capped_at_model_limit():
|
|||||||
assert p["ctx"] <= 32768, p
|
assert p["ctx"] <= 32768, p
|
||||||
|
|
||||||
|
|
||||||
|
def test_small_context_model_still_gets_profiles():
    """Serve profiles must exist for a model trained below the 8192 shrink floor.

    The context-shrink loop's floor must not exclude such a model outright
    (125 of the catalog models have context_length < 8192); every profile
    returned has to stay within the model's own trained limit.
    """
    trained_limit = 2048
    tiny_model = dict(_DENSE_8B, name="SmolLM-135M", context_length=trained_limit)

    profiles = compute_serve_profiles(_sys(24.0), tiny_model)

    assert profiles, "sub-8192-context model produced no profiles"
    for profile in profiles:
        # The served context may never exceed what the model was trained on.
        assert profile["ctx"] <= trained_limit, profile
        assert profile["ctx"] > 0
|
||||||
|
|
||||||
|
|
||||||
def test_no_gpu_returns_empty():
    """With no VRAM detected there are no GPU profiles (caller falls back to manual flags)."""
    cpu_only_system = {"backend": "cpu_x86", "gpu_vram_gb": 0}
    assert compute_serve_profiles(cpu_only_system, _QWEN_35B_MOE) == []
|
||||||
|
|||||||
Reference in New Issue
Block a user