fix(hwfit): serve profiles for sub-8192 context models

Allow serve-profile generation for models whose trained context window is below 8192 tokens, while preserving the 8192-token shrink floor for larger models.
2026-06-17 10:15:27 -04:00 · 2026-06-15 11:32:22 +05:30
parent a07fe35936
commit b20cea347a
2 changed files with 20 additions and 2 deletions
@@ -188,12 +188,18 @@ def compute_serve_profiles(system, model, serve_weights_gb=None, serve_quant=Non
        # Shrink context if even the chosen KV won't fit alongside weights.
        # Start from the smaller of the profile's target and the model's limit.
        cur_ctx = min(ctx, model_ctx_max)
-        while cur_ctx >= 8192:
+        # Floor the context-shrink loop at 8192, but never above the model's own
+        # trained limit. A model with a sub-8192 context (e.g. a 2048-token
+        # SmolLM) starts below 8192; with a hard-coded 8192 guard the loop body
+        # never ran and NO profile was produced, so the serve UI fell back to
+        # manual flags even though the model fits the GPU trivially.
+        ctx_floor = min(8192, model_ctx_max)
+        while cur_ctx >= ctx_floor:
            kv = _kv_gb(model, cur_ctx, kv_type)
            n_cpu_moe, fits = _cpu_moe_for_budget(model, quant, kv, budget, fixed_gb=serve_weights_gb)
            est = _weights_gb(model, quant, serve_weights_gb) + kv + 0.6
            # If a non-MoE model can't fit even fully offloaded, try less context.
-            if model.get("is_moe") or fits or cur_ctx <= 8192:
+            if model.get("is_moe") or fits or cur_ctx <= ctx_floor:
                profiles.append({
                    "key": key,
                    "label": label,