mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-17 02:05:22 -04:00
Cookbook: scoring fixes, UI polish, false-finished + stale-state bug fixes
Backend (services/hwfit + routes): - rank_models picks visible set by REQUESTED column, not always score — sorting by Param now shows highest-param models PERIOD (incl. too_tight). - New fit_only param. Multi-GPU rigs filter GGUF Q*/IQ quants (vLLM/SGLang cannot serve them); default non-prequantized to BF16 on 2+ GPUs. - AWQ / GPTQ-8bit get a -1.0 quality penalty (was 0.0, tied with FP8), so FP8 wins when both fit. - Version-aware tiebreaker (parse Mn.n / Vn) — MiniMax-M2.7 ranks above M2.5 on equal composite score; >=100B integers not misread as versions. - /api/cookbook/hf-latest no longer drops models without an "NB" pattern in the repo id (MiniMax-M2.7, DeepSeek-V4-Pro etc. were silently filtered). - Cached-model scan: atexit flushes models JSON even if the script is killed mid-walk; each scan_dir wrapped in try/except; timeout 60s -> 180s. - KB granularity for sub-MB sizes (was "0 MB" for 12 KB shells). New "stalled" status for shells <1 MB with no .incomplete files. - /api/cookbook/state POST guard: rejects "done" download tasks lacking DOWNLOAD_OK / DOWNLOAD_FAILED / /snapshots/ when the last-mentioned shard is N<total — stops stale tabs from poisoning persisted state. - hf_models.json: add zai-org/GLM-5.1; flip zai-org/GLM-5 quantization Q4_K_M -> BF16 (it is the native base, not a quant). Frontend (static/js): - Scan/Download toolbar: quant defaults to All; ctx slider (8k/16k/32k/ 50k/128k/Max) ported from origin/main with sort=fit on drag, sort=score on Max. GPU toggle commits _activeCount to maxGpu on initial render. Fit column header tagged with active budget (RAM / GPU / N GPU). - Foldable Download admin-card: the Download h2 is the chevron trigger; state persists in localStorage. - Download card surfaces destination dir (Dir: <path>). Same dir on running task row, font/color matched to uptime (9px Fira Code muted, opacity .4). - Serve panel ctx text input always resets to model max on open. Sub-MB cached models show with red "download stalled" badge. - Bulk-select Cancel + Delete reset the Select button label on exit. - Cookbook running: false-finished bug fixed — DOWNLOAD_OK or /snapshots/ required; bare "Download complete" no longer marks the task done after the first config file. Clear button now sends tmux kill-session too. True overall % for multi-shard downloads: ((N-1)+frac)/total instead of hf_transfer per-shard aggregate. - Diagnosis card simplified: removed fold toggle, copy button, dismiss X. Suggestion font matches message body (12px). - HF token field flashes green check + "Saved" on save. - Cached scan no longer counts stalled rows as downloaded in Scan/Download. CSS: - dep Install button width pinned to 76px to match Installed split. - task-sub row +1px; task-status badge gets margin-right 8px. - Ctx slider styled like gallery editor sliders (thin pill rail, red thumb). - Bulk-select cancel button top -3px -> -5px.
This commit is contained in:
@@ -434,6 +434,8 @@ def _parse_serve_phase(snapshot: str, task_type: str = "serve") -> dict:
|
||||
}
|
||||
if "Application startup complete" in flat:
|
||||
return {"phase": "ready", "status": "ready"}
|
||||
if re.search(r'Ollama API ready on port\s+\d+', flat, re.I):
|
||||
return {"phase": "ready", "status": "ready"}
|
||||
# HTTP access logs (e.g. GET /v1/models 200 OK) mean the server is up and serving
|
||||
if re.search(r'(?:GET|POST)\s+/[^\s]*\s+HTTP/[\d.]+"\s*\d{3}', flat):
|
||||
return {"phase": "idle", "status": "ready"}
|
||||
|
||||
+62
-25
@@ -905,6 +905,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
# Show whether the HF token reached this server (masked) — a gated
|
||||
# model vLLM has to download will be denied without it.
|
||||
runner_lines.append(_HF_TOKEN_STATUS_SNIPPET)
|
||||
handled_ollama_serve = False
|
||||
# Auto-install inference engine if missing
|
||||
if "llama_cpp" in req.cmd or "llama-server" in req.cmd:
|
||||
# Prefer the NATIVE llama-server binary — its minja templating
|
||||
@@ -978,17 +979,48 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
runner_lines.append(' fi')
|
||||
runner_lines.append('fi')
|
||||
elif "ollama" in req.cmd:
|
||||
# Ollama manages its own model store and HTTP server. Just make
|
||||
# sure the binary exists and the daemon is up before running the
|
||||
# command (the natural serving engine on Apple Silicon / Metal).
|
||||
handled_ollama_serve = True
|
||||
_ollama_port = "11434"
|
||||
_ollama_match = re.search(r"OLLAMA_HOST=[^\s:]+:(\d+)", req.cmd)
|
||||
if _ollama_match:
|
||||
_ollama_port = _ollama_match.group(1)
|
||||
# Ollama can be a host binary, a system service, or a Docker
|
||||
# container. If the HTTP API is already reachable, the model is
|
||||
# already served and we should not require a host `ollama` CLI.
|
||||
runner_lines.append(f'ODYSSEUS_OLLAMA_PORT="{_ollama_port}"')
|
||||
runner_lines.append('ODYSSEUS_OLLAMA_URL=""')
|
||||
runner_lines.append('for _ody_ollama_port in "$ODYSSEUS_OLLAMA_PORT" 11434; do')
|
||||
runner_lines.append(' [ -z "$_ody_ollama_port" ] && continue')
|
||||
runner_lines.append(' for _ody_ollama_host in 127.0.0.1 localhost host.docker.internal; do')
|
||||
runner_lines.append(' _ody_ollama_url="http://${_ody_ollama_host}:${_ody_ollama_port}"')
|
||||
runner_lines.append(' if curl -sf "$_ody_ollama_url/api/tags" >/dev/null 2>&1; then')
|
||||
runner_lines.append(' ODYSSEUS_OLLAMA_URL="$_ody_ollama_url"')
|
||||
runner_lines.append(' ODYSSEUS_OLLAMA_PORT="$_ody_ollama_port"')
|
||||
runner_lines.append(' break 2')
|
||||
runner_lines.append(' fi')
|
||||
runner_lines.append(' done')
|
||||
runner_lines.append('done')
|
||||
runner_lines.append('if [ -n "$ODYSSEUS_OLLAMA_URL" ]; then')
|
||||
runner_lines.append(' if [ "$ODYSSEUS_OLLAMA_PORT" != "' + _ollama_port + '" ]; then')
|
||||
runner_lines.append(' echo "[odysseus] Selected Ollama port ' + _ollama_port + ' was not reachable; using running Ollama on port ${ODYSSEUS_OLLAMA_PORT}."')
|
||||
runner_lines.append(' fi')
|
||||
runner_lines.append(' echo "[odysseus] Ollama API ready on port ${ODYSSEUS_OLLAMA_PORT}: ${ODYSSEUS_OLLAMA_URL}"')
|
||||
runner_lines.append(' echo "[odysseus] This task is monitoring an existing Ollama server; stopping it here will not stop an external Docker/system service."')
|
||||
runner_lines.append(' exec bash -i')
|
||||
runner_lines.append('fi')
|
||||
runner_lines.append('if ! command -v ollama &>/dev/null; then')
|
||||
runner_lines.append(' echo "ERROR: Ollama not found. Install it (macOS: brew install ollama, or https://ollama.com/download), then launch again."')
|
||||
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
|
||||
runner_lines.append('fi')
|
||||
runner_lines.append('if ! curl -sf http://localhost:11434/api/tags >/dev/null 2>&1; then')
|
||||
runner_lines.append(' echo "Starting ollama server..."; (ollama serve >/dev/null 2>&1 &)')
|
||||
runner_lines.append(' for _ in 1 2 3 4 5 6 7 8 9 10; do curl -sf http://localhost:11434/api/tags >/dev/null 2>&1 && break; sleep 1; done')
|
||||
runner_lines.append(' echo "ERROR: Ollama not found and no Ollama API is reachable on 127.0.0.1, localhost, or host.docker.internal (ports ${ODYSSEUS_OLLAMA_PORT}/11434)."')
|
||||
runner_lines.append(' echo "Install Ollama, start an Ollama service/container on this server, or pick the port where it is already listening."')
|
||||
runner_lines.append(' echo')
|
||||
runner_lines.append(' echo "=== Process exited with code 127 ==="')
|
||||
runner_lines.append(' exec bash -i')
|
||||
runner_lines.append('fi')
|
||||
runner_lines.append('echo "Starting ollama server on 0.0.0.0:${ODYSSEUS_OLLAMA_PORT}..."')
|
||||
runner_lines.append('OLLAMA_HOST="0.0.0.0:${ODYSSEUS_OLLAMA_PORT}" ollama serve')
|
||||
runner_lines.append('_ody_exit=$?')
|
||||
runner_lines.append('echo')
|
||||
runner_lines.append('echo "=== Process exited with code ${_ody_exit} ==="')
|
||||
runner_lines.append('exec bash -i')
|
||||
elif "vllm serve" in req.cmd:
|
||||
# vLLM is CUDA/ROCm-only and does not run on macOS at all.
|
||||
runner_lines.append('if [ "$(uname -s)" = "Darwin" ]; then')
|
||||
@@ -1016,18 +1048,19 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
runner_lines.append(' ODYSSEUS_PREFLIGHT_EXIT=127')
|
||||
runner_lines.append('fi')
|
||||
|
||||
_append_serve_preflight_exit_lines(
|
||||
runner_lines,
|
||||
keep_shell_open=not local_windows,
|
||||
)
|
||||
runner_lines.append(req.cmd)
|
||||
if local_windows:
|
||||
# Detached background process — no interactive shell to keep open.
|
||||
# Print the exit marker the status poller looks for, then stop.
|
||||
_append_serve_exit_code_lines(runner_lines, keep_shell_open=False)
|
||||
else:
|
||||
# Keep shell open after exit so user can see errors
|
||||
_append_serve_exit_code_lines(runner_lines, keep_shell_open=True)
|
||||
if not handled_ollama_serve:
|
||||
_append_serve_preflight_exit_lines(
|
||||
runner_lines,
|
||||
keep_shell_open=not local_windows,
|
||||
)
|
||||
runner_lines.append(req.cmd)
|
||||
if local_windows:
|
||||
# Detached background process — no interactive shell to keep open.
|
||||
# Print the exit marker the status poller looks for, then stop.
|
||||
_append_serve_exit_code_lines(runner_lines, keep_shell_open=False)
|
||||
else:
|
||||
# Keep shell open after exit so user can see errors
|
||||
_append_serve_exit_code_lines(runner_lines, keep_shell_open=True)
|
||||
|
||||
runner_path = TMUX_LOG_DIR / f"{session_id}_run.sh"
|
||||
runner_path.write_text("\n".join(runner_lines) + "\n", encoding="utf-8")
|
||||
@@ -1692,10 +1725,14 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
|
||||
if vram_gb > 0 and needed_vram is not None and needed_vram > vram_gb:
|
||||
continue
|
||||
# Skip if no size info — without a size we can't tell if it's a real
|
||||
# full-weight model or a tiny adapter, so we'd rather drop it
|
||||
if est_vram is None:
|
||||
continue
|
||||
# Unknown-size models (e.g. MiniMax-M2.7, DeepSeek-V4-Flash) have no
|
||||
# "NB" in the repo id, so the regex above can't extract their
|
||||
# param count. Previously we dropped them entirely, which made
|
||||
# brand-new flagship releases silently vanish from this list even
|
||||
# on rigs with hundreds of GB of VRAM. Adapters/LoRAs are already
|
||||
# filtered by _is_excluded(), so what falls through here is
|
||||
# overwhelmingly full models — keep them, just without a size
|
||||
# badge (the frontend handles needed_vram_gb=null gracefully).
|
||||
|
||||
out.append({
|
||||
"repo_id": repo_id,
|
||||
|
||||
@@ -90,7 +90,7 @@ def setup_hwfit_routes():
|
||||
return detect_system(host=host, ssh_port=ssh_port, platform=platform, fresh=fresh)
|
||||
|
||||
@router.get("/models")
|
||||
def get_models(use_case: str = "", sort: str = "score", limit: int = 50, search: str = "", host: str = "", quant: str = "", gpu_count: str = "", gpu_group: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, manual_mode: str = "", manual_gpu_count: str = "", manual_vram_gb: str = "", manual_ram_gb: str = "", manual_backend: str = "", ignore_detected_gpu: bool = False, ignore_detected_ram: bool = False):
|
||||
def get_models(use_case: str = "", sort: str = "score", limit: int = 50, search: str = "", host: str = "", quant: str = "", ctx: str = "", gpu_count: str = "", gpu_group: str = "", ssh_port: str = "", platform: str = "", fresh: bool = False, manual_mode: str = "", manual_gpu_count: str = "", manual_vram_gb: str = "", manual_ram_gb: str = "", manual_backend: str = "", ignore_detected_gpu: bool = False, ignore_detected_ram: bool = False, fit_only: bool = False):
|
||||
"""Rank LLM models against detected hardware and return scored results.
|
||||
gpu_count: override GPU count (0 = CPU only, 1-N = simulate N GPUs of the
|
||||
active group). gpu_group: index into system.gpu_groups (the homogeneous
|
||||
@@ -171,7 +171,14 @@ def setup_hwfit_routes():
|
||||
# gpu_only stays off here so the default view still surfaces offload.
|
||||
_apply_group(grp, grp["count"])
|
||||
|
||||
results = rank_models(system, use_case=use_case or None, limit=limit, search=search or None, sort=sort, quant=quant or None)
|
||||
try:
|
||||
target_context = int(ctx) if ctx else None
|
||||
except ValueError:
|
||||
target_context = None
|
||||
if target_context is not None:
|
||||
target_context = max(1024, min(target_context, 1000000))
|
||||
|
||||
results = rank_models(system, use_case=use_case or None, limit=limit, search=search or None, sort=sort, quant=quant or None, target_context=target_context, fit_only=fit_only)
|
||||
return {"system": system, "models": results}
|
||||
|
||||
@router.get("/image-models")
|
||||
|
||||
Reference in New Issue
Block a user