diff --git a/services/hwfit/profiles.py b/services/hwfit/profiles.py index 87aa147fe..337af7648 100644 --- a/services/hwfit/profiles.py +++ b/services/hwfit/profiles.py @@ -188,12 +188,18 @@ def compute_serve_profiles(system, model, serve_weights_gb=None, serve_quant=Non # Shrink context if even the chosen KV won't fit alongside weights. # Start from the smaller of the profile's target and the model's limit. cur_ctx = min(ctx, model_ctx_max) - while cur_ctx >= 8192: + # Floor the context-shrink loop at 8192, but never above the model's own + # trained limit. A model with a sub-8192 context (e.g. a 2048-token + # SmolLM) starts below 8192, so a hard-coded 8192 guard skipped the loop + # entirely and produced NO profile — the serve UI then fell back to + # manual flags even though the model fits the GPU trivially. + ctx_floor = min(8192, model_ctx_max) + while cur_ctx >= ctx_floor: kv = _kv_gb(model, cur_ctx, kv_type) n_cpu_moe, fits = _cpu_moe_for_budget(model, quant, kv, budget, fixed_gb=serve_weights_gb) est = _weights_gb(model, quant, serve_weights_gb) + kv + 0.6 # If a non-MoE model can't fit even fully offloaded, try less context. - if model.get("is_moe") or fits or cur_ctx <= 8192: + if model.get("is_moe") or fits or cur_ctx <= ctx_floor: profiles.append({ "key": key, "label": label, diff --git a/tests/test_serve_profiles.py b/tests/test_serve_profiles.py index b7b4ef10b..e612a7a83 100644 --- a/tests/test_serve_profiles.py +++ b/tests/test_serve_profiles.py @@ -81,6 +81,18 @@ def test_context_capped_at_model_limit(): assert p["ctx"] <= 32768, p +def test_small_context_model_still_gets_profiles(): + """A model whose trained context is below the 8192 shrink floor must still + produce serve profiles, capped at its own limit — the loop floor must not + exclude it entirely (125 of the catalog models have context_length < 8192).""" + small_ctx_model = dict(_DENSE_8B, name="SmolLM-135M", context_length=2048) + profs = compute_serve_profiles(_sys(24.0), small_ctx_model) + assert profs, "sub-8192-context model produced no profiles" + for p in profs: + assert p["ctx"] <= 2048, p # never exceeds the model's trained limit + assert p["ctx"] > 0 + + def test_no_gpu_returns_empty(): """No VRAM detected → no GPU profiles (caller falls back to manual flags).""" assert compute_serve_profiles({"backend": "cpu_x86", "gpu_vram_gb": 0}, _QWEN_35B_MOE) == []