Polish email and cookbook flows

2026-06-16 17:55:26 -04:00 · 2026-06-02 22:38:55 +09:00
parent 15a2662119
commit ff93a6c63b
22 changed files with 1492 additions and 218 deletions
@@ -4375,7 +4375,14 @@
  "hf_downloads": 51135,
  "hf_likes": 2,
  "release_date": "2025-09-23",
-  "_discovered": true
+  "_discovered": true,
+  "gguf_sources": [
+   {
+    "repo": "typhoon-ai/typhoon2.5-qwen3-4b-gguf",
+    "file": "typhoon2.5-qwen3-4b-q4_k_m.gguf",
+    "quant": "Q4_K_M"
+   }
+  ]
 },
 {
  "name": "JunHowie/Qwen3-4B-Instruct-2507-GPTQ-Int4",
@@ -8994,7 +9001,14 @@
  "num_experts": 128,
  "active_experts": 8,
  "active_parameters": 3339450907,
-  "_discovered": true
+  "_discovered": true,
+  "gguf_sources": [
+   {
+    "repo": "typhoon-ai/typhoon2.5-qwen3-30b-a3b-gguf",
+    "file": "typhoon2.5-qwen3-30b-a3b-q4_k_m.gguf",
+    "quant": "Q4_K_M"
+   }
+  ]
 },
 {
  "name": "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ",
@@ -175,8 +175,15 @@ def _quality_score(model, quant, use_case):
    model_uc = infer_use_case(model)
    if model_uc == "coding" and use_case == "coding":
        base += 6
+    elif model_uc == "coding" and use_case in ("general", "chat"):
+        # Coder-specialized models are still useful generally, but they should
+        # not dominate the default scan. If the user wants code, the Coding
+        # filter gives them the boost above.
+        base -= 10
    if model_uc == "reasoning" and use_case == "reasoning" and pb >= 13:
        base += 5
+    elif model_uc == "reasoning" and use_case == "chat":
+        base -= 4
    if model_uc == "multimodal" and use_case == "multimodal":
        base += 6

@@ -262,7 +269,30 @@ def _quant_bits(q):
    return 0


-def analyze_model(model, system, target_quant=None, scoring_use_case=None):
+def _native_quant(model):
+    """Return the quantization label implied by a model entry.
+
+    The lowercased ``name`` and ``format`` fields are joined with a space and
+    scanned for pre-quantized markers in priority order: NVFP4, FP8, GPTQ,
+    AWQ, MLX, then a bare 8-bit tag (only for entries that are not GGUF and
+    have no ``gguf_sources``). The first hit wins. When nothing matches, the
+    entry's ``quantization`` field is returned, or ``"Q4_K_M"`` if that key
+    is absent.
+
+    Args:
+        model: Mapping describing one model; only the keys ``quantization``,
+            ``name``, ``format``, ``is_gguf`` and ``gguf_sources`` are read.
+
+    Returns:
+        A label such as ``"NVFP4"``, ``"FP8"``, ``"GPTQ-4bit"``, ``"AWQ"``,
+        ``"mlx-4bit"``, ``"INT8"``, or the entry's own quantization string.
+    """
+    native_quant = model.get("quantization", "Q4_K_M")
+    name = (model.get("name") or "").lower()
+    fmt = (model.get("format") or "").lower()
+    text = f"{name} {fmt}"
+
+    def bit_width(tag):
+        # The marker must start a token and the digits must end one, so a
+        # parameter count ("new-7b-gptq") or a group size ("gptq-128g") is
+        # never misread as a bit width. A "w4a16"-style suffix is accepted
+        # and reports the weight width.
+        pattern = (
+            r"(?:^|[-_/.\s])(?:" + tag + r"|int|w)[-_]?"
+            r"(\d{1,2})(?:bits?|a\d{1,2})?(?![0-9a-z])"
+        )
+        found = re.search(pattern, text)
+        return found.group(1) if found else None
+
+    if "nvfp4" in text:
+        return "NVFP4"
+    if re.search(r"(^|[-_/])fp8($|[-_/\s])", text):
+        return "FP8"
+    if "gptq" in text:
+        bits = bit_width("gptq")
+        return f"GPTQ-{bits}bit" if bits else "GPTQ"
+    if "awq" in text:
+        bits = bit_width("awq")
+        return f"AWQ-{bits}bit" if bits else "AWQ"
+    if "mlx" in text:
+        m = re.search(r"mlx[-_]?(\d{1,2})bit", text)
+        # Without an explicit width, keep whatever the entry itself declares.
+        return f"mlx-{m.group(1)}bit" if m else native_quant
+    if not (model.get("is_gguf") or model.get("gguf_sources")) and re.search(r"(^|[-_/])(?:int)?8bit($|[-_/\s])", text):
+        return "INT8"
+    return native_quant
+
+
+def analyze_model(model, system, target_quant=None, scoring_use_case=None, target_context=None):
    pb = params_b(model)
    if pb <= 0:
        return None
@@ -282,11 +312,14 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
    gpu_only = bool(system.get("gpu_only")) and has_gpu and gpu_vram > 0
    eff_ram = 0 if gpu_only else available_ram
    is_moe = model.get("is_moe", False)
-    ctx = model.get("context_length", 4096) or 4096
+    model_ctx = model.get("context_length", 4096) or 4096
+    try:
+        target_context = int(target_context or 0)
+    except (TypeError, ValueError):
+        target_context = 0
+    ctx = min(model_ctx, target_context) if target_context > 0 else model_ctx

-    native_quant = model.get("quantization", "Q4_K_M")
-    if "nvfp4" in (model.get("name") or "").lower():
-        native_quant = "NVFP4"
+    native_quant = _native_quant(model)
    preq = is_prequantized(model)

    # GGUF models can't be sharded across GPUs — use single GPU VRAM
@@ -355,7 +388,8 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
            "score": 0,
            "scores": {"quality": 0, "speed": 0, "fit": 0, "context": 0},
            "gguf_sources": model.get("gguf_sources", []),
-            "context_length": model.get("context_length", 4096),
+            "context_length": model_ctx,
+            "target_context": target_context or None,
        }

    run_mode, quant, fit_ctx, required_gb = result
@@ -413,8 +447,9 @@ def analyze_model(model, system, target_quant=None, scoring_use_case=None):
            "context": round(c_score, 1),
        },
        "gguf_sources": model.get("gguf_sources", []),
-        "context_length": model.get("context_length", 4096),
+        "context_length": model_ctx,
        "release_date": model.get("release_date", ""),
+        "target_context": target_context or None,
    }


@@ -431,7 +466,7 @@ SORT_KEYS = {
 }


-def rank_models(system, use_case=None, limit=50, search=None, sort="score", quant=None):
+def rank_models(system, use_case=None, limit=50, search=None, sort="score", quant=None, target_context=None):
    """Rank all models against detected hardware. Returns sorted list of fit results."""
    models = get_models()
    results = []
@@ -495,9 +530,7 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
    consumer_amd = system_backend == "rocm" and gpu_family == "rdna"

    for m in models:
-        native_q = m.get("quantization", "")
-        if "nvfp4" in (m.get("name") or "").lower():
-            native_q = "NVFP4"
+        native_q = _native_quant(m)

        # MLX needs the mlx_lm runtime, which Odysseus does not generate serve
        # commands for. Hide it on every backend, including Metal.
@@ -548,7 +581,7 @@ def rank_models(system, use_case=None, limit=50, search=None, sort="score", quan
            if search.lower() not in name and search.lower() not in provider:
                continue

-        result = analyze_model(m, system, target_quant=quant, scoring_use_case=(use_case or "general"))
+        result = analyze_model(m, system, target_quant=quant, scoring_use_case=(use_case or "general"), target_context=target_context)
        if result is None:
            continue

@@ -101,7 +101,16 @@ def _normalize_model_entry(model):

 def is_prequantized(model):
    q = model.get("quantization", "")
-    return any(q.startswith(p) for p in PREQUANTIZED_PREFIXES)
+    name = (model.get("name") or "").lower()
+    fmt = (model.get("format") or "").lower()
+    text = f"{name} {fmt}"
+    return (
+        "nvfp4" in text
+        or re.search(r"(^|[-_/])fp8($|[-_/\s])", text) is not None
+        or (not (model.get("is_gguf") or model.get("gguf_sources")) and re.search(r"(^|[-_/])(?:int)?8bit($|[-_/\s])", text) is not None)
+        or any(x in text for x in ("awq", "gptq", "mlx"))
+        or any(q.startswith(p) for p in PREQUANTIZED_PREFIXES)
+    )


 def params_b(model):