fix(cookbook): report dead finished downloads as completed instead of stopped (#4025)

When a download's tmux pane is gone, the status endpoint trusted only the
HF-cache probe to tell completed from stopped. The probe derives its cache
root from its own environment, but the download runner exports
HF_HOME=<local_dir> (the #2722 fix), so custom-dir downloads land in
<local_dir>/hub where the probe never looks - and ollama pulls don't touch
the HF cache at all. Finished downloads were reported as stopped forever,
and tasks already persisted as completed were demoted back to stopped on
the next poll. This is the backend half of #3897, deliberately left out of
the frontend fix in #4000.

- honor the conclusive runner markers first: DOWNLOAD_OK -> completed
  (keeping the "Fetching 0 files" error guard), DOWNLOAD_FAILED -> error
- pass the task's local_dir through to the cache probes so they check the
  cache the download actually wrote to, keeping the env-var fallback for
  default-cache downloads
- move the probe scripts and marker classification into
  routes/cookbook_output.py (dependency-free) with behavioral tests

Fixes #4017
This commit is contained in:
Ashvin
2026-06-15 11:56:55 +05:30
committed by GitHub
parent b28aa1f2c4
commit 23837f4571
3 changed files with 203 additions and 31 deletions
+23 -31
View File
@@ -30,7 +30,10 @@ from core.platform_compat import (
which_tool,
)
from routes.shell_routes import TMUX_LOG_DIR
from routes.cookbook_output import error_aware_output_tail
from routes.cookbook_output import (
error_aware_output_tail, classify_dead_download,
HF_CACHE_COMPLETE_PROBE, HF_CACHE_INCOMPLETE_PROBE,
)
logger = logging.getLogger(__name__)
@@ -2636,30 +2639,20 @@ def setup_cookbook_routes() -> APIRouter:
def _cookbook_tasks_status_sync():
import subprocess
def _download_cache_complete(repo_id: str, remote_host: str = "", ssh_port: str = "") -> bool:
def _download_cache_complete(repo_id: str, remote_host: str = "", ssh_port: str = "", cache_root: str = "") -> bool:
"""Best-effort check for a completed HF cache entry.
tmux output can stop at a stale progress line if the pane/session
disappears before Cookbook captures the final DOWNLOAD_OK marker.
In that case, trust the cache shape: a snapshot directory with files
and no *.incomplete blobs means HuggingFace finished materializing the
model.
model. cache_root is the task's custom download dir — the runner
pointed HF_HOME there, so the cache lives under <cache_root>/hub,
not wherever this probe's environment says.
"""
if not repo_id or "/" not in repo_id:
return False
py = (
"import os,sys;"
"repo=sys.argv[1];"
"base=os.environ.get('HUGGINGFACE_HUB_CACHE') or os.path.join(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')), 'hub');"
"d=os.path.join(base,'models--'+repo.replace('/','--'));"
"snap=os.path.join(d,'snapshots');"
"ok=os.path.isdir(snap) and any(os.path.isdir(os.path.join(snap,x)) and os.listdir(os.path.join(snap,x)) for x in os.listdir(snap));"
"inc=False;"
"blobs=os.path.join(d,'blobs');"
"inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
"sys.exit(0 if ok and not inc else 1)"
)
cmd = ["python3", "-c", py, repo_id]
cmd = ["python3", "-c", HF_CACHE_COMPLETE_PROBE, repo_id, cache_root or ""]
try:
if remote_host:
ssh_base = ["ssh"]
@@ -2673,7 +2666,7 @@ def setup_cookbook_routes() -> APIRouter:
except Exception:
return False
def _download_cache_incomplete(repo_id: str, remote_host: str = "", ssh_port: str = "") -> bool:
def _download_cache_incomplete(repo_id: str, remote_host: str = "", ssh_port: str = "", cache_root: str = "") -> bool:
"""Best-effort check for resumable HF partial blobs.
A lost SSH/tmux session can leave a real download still incomplete.
@@ -2682,16 +2675,7 @@ def setup_cookbook_routes() -> APIRouter:
"""
if not repo_id or "/" not in repo_id:
return False
py = (
"import os,sys;"
"repo=sys.argv[1];"
"base=os.environ.get('HUGGINGFACE_HUB_CACHE') or os.path.join(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')), 'hub');"
"d=os.path.join(base,'models--'+repo.replace('/','--'));"
"blobs=os.path.join(d,'blobs');"
"inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
"sys.exit(0 if inc else 1)"
)
cmd = ["python3", "-c", py, repo_id]
cmd = ["python3", "-c", HF_CACHE_INCOMPLETE_PROBE, repo_id, cache_root or ""]
try:
if remote_host:
ssh_base = ["ssh"]
@@ -2896,7 +2880,7 @@ def setup_cookbook_routes() -> APIRouter:
and (
".incomplete" in full_snapshot
or bool(re.search(r'model-\d+-of-\d+\.[A-Za-z0-9_.-]+:\s+(?:[0-9]|[1-8][0-9])%', full_snapshot))
or _download_cache_incomplete(_payload.get("repo_id") or model, remote, str(_tport or ""))
or _download_cache_incomplete(_payload.get("repo_id") or model, remote, str(_tport or ""), _payload.get("local_dir") or "")
)
)
if is_alive or (local_win_task and full_snapshot):
@@ -2937,11 +2921,19 @@ def setup_cookbook_routes() -> APIRouter:
else:
status = "running"
else:
# Session is dead — check if it completed or crashed
if (
# Session is dead — check if it completed or crashed. The
# runner markers in the retained output are conclusive
# (DOWNLOAD_OK only prints after exit 0), so check them before
# the cache probe, which can't see ollama pulls at all.
marker = classify_dead_download(full_snapshot) if task_type == "download" else None
if marker is not None:
status, download_zero_files = marker
if status == "completed" and not progress_text:
progress_text = "Download complete"
elif (
task_type == "download"
and not download_has_incomplete_evidence
and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or ""))
and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or ""), _payload.get("local_dir") or "")
):
status = "completed"
if not progress_text: