mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-30 00:22:10 -04:00
fix(cookbook): report dead finished downloads as completed instead of stopped (#4025)
When a download's tmux pane is gone, the status endpoint trusted only the HF-cache probe to tell completed from stopped. The probe derives its cache root from its own environment, but the download runner exports HF_HOME=<local_dir> (the #2722 fix), so custom-dir downloads land in <local_dir>/hub where the probe never looks - and ollama pulls don't touch the HF cache at all. Finished downloads were reported as stopped forever, and tasks already persisted as completed were demoted back to stopped on the next poll. This is the backend half of #3897, deliberately left out of the frontend fix in #4000. - honor the conclusive runner markers first: DOWNLOAD_OK -> completed (keeping the "Fetching 0 files" error guard), DOWNLOAD_FAILED -> error - pass the task's local_dir through to the cache probes so they check the cache the download actually wrote to, keeping the env-var fallback for default-cache downloads - move the probe scripts and marker classification into routes/cookbook_output.py (dependency-free) with behavioral tests Fixes #4017
This commit is contained in:
+23
-31
@@ -30,7 +30,10 @@ from core.platform_compat import (
|
||||
which_tool,
|
||||
)
|
||||
from routes.shell_routes import TMUX_LOG_DIR
|
||||
from routes.cookbook_output import error_aware_output_tail
|
||||
from routes.cookbook_output import (
|
||||
error_aware_output_tail, classify_dead_download,
|
||||
HF_CACHE_COMPLETE_PROBE, HF_CACHE_INCOMPLETE_PROBE,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -2636,30 +2639,20 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
def _cookbook_tasks_status_sync():
|
||||
import subprocess
|
||||
|
||||
def _download_cache_complete(repo_id: str, remote_host: str = "", ssh_port: str = "") -> bool:
|
||||
def _download_cache_complete(repo_id: str, remote_host: str = "", ssh_port: str = "", cache_root: str = "") -> bool:
|
||||
"""Best-effort check for a completed HF cache entry.
|
||||
|
||||
tmux output can stop at a stale progress line if the pane/session
|
||||
disappears before Cookbook captures the final DOWNLOAD_OK marker.
|
||||
In that case, trust the cache shape: a snapshot directory with files
|
||||
and no *.incomplete blobs means HuggingFace finished materializing the
|
||||
model.
|
||||
model. cache_root is the task's custom download dir — the runner
|
||||
pointed HF_HOME there, so the cache lives under <cache_root>/hub,
|
||||
not wherever this probe's environment says.
|
||||
"""
|
||||
if not repo_id or "/" not in repo_id:
|
||||
return False
|
||||
py = (
|
||||
"import os,sys;"
|
||||
"repo=sys.argv[1];"
|
||||
"base=os.environ.get('HUGGINGFACE_HUB_CACHE') or os.path.join(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')), 'hub');"
|
||||
"d=os.path.join(base,'models--'+repo.replace('/','--'));"
|
||||
"snap=os.path.join(d,'snapshots');"
|
||||
"ok=os.path.isdir(snap) and any(os.path.isdir(os.path.join(snap,x)) and os.listdir(os.path.join(snap,x)) for x in os.listdir(snap));"
|
||||
"inc=False;"
|
||||
"blobs=os.path.join(d,'blobs');"
|
||||
"inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
|
||||
"sys.exit(0 if ok and not inc else 1)"
|
||||
)
|
||||
cmd = ["python3", "-c", py, repo_id]
|
||||
cmd = ["python3", "-c", HF_CACHE_COMPLETE_PROBE, repo_id, cache_root or ""]
|
||||
try:
|
||||
if remote_host:
|
||||
ssh_base = ["ssh"]
|
||||
@@ -2673,7 +2666,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _download_cache_incomplete(repo_id: str, remote_host: str = "", ssh_port: str = "") -> bool:
|
||||
def _download_cache_incomplete(repo_id: str, remote_host: str = "", ssh_port: str = "", cache_root: str = "") -> bool:
|
||||
"""Best-effort check for resumable HF partial blobs.
|
||||
|
||||
A lost SSH/tmux session can leave a real download still incomplete.
|
||||
@@ -2682,16 +2675,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
"""
|
||||
if not repo_id or "/" not in repo_id:
|
||||
return False
|
||||
py = (
|
||||
"import os,sys;"
|
||||
"repo=sys.argv[1];"
|
||||
"base=os.environ.get('HUGGINGFACE_HUB_CACHE') or os.path.join(os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface')), 'hub');"
|
||||
"d=os.path.join(base,'models--'+repo.replace('/','--'));"
|
||||
"blobs=os.path.join(d,'blobs');"
|
||||
"inc=os.path.isdir(blobs) and any(x.endswith('.incomplete') for x in os.listdir(blobs));"
|
||||
"sys.exit(0 if inc else 1)"
|
||||
)
|
||||
cmd = ["python3", "-c", py, repo_id]
|
||||
cmd = ["python3", "-c", HF_CACHE_INCOMPLETE_PROBE, repo_id, cache_root or ""]
|
||||
try:
|
||||
if remote_host:
|
||||
ssh_base = ["ssh"]
|
||||
@@ -2896,7 +2880,7 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
and (
|
||||
".incomplete" in full_snapshot
|
||||
or bool(re.search(r'model-\d+-of-\d+\.[A-Za-z0-9_.-]+:\s+(?:[0-9]|[1-8][0-9])%', full_snapshot))
|
||||
or _download_cache_incomplete(_payload.get("repo_id") or model, remote, str(_tport or ""))
|
||||
or _download_cache_incomplete(_payload.get("repo_id") or model, remote, str(_tport or ""), _payload.get("local_dir") or "")
|
||||
)
|
||||
)
|
||||
if is_alive or (local_win_task and full_snapshot):
|
||||
@@ -2937,11 +2921,19 @@ def setup_cookbook_routes() -> APIRouter:
|
||||
else:
|
||||
status = "running"
|
||||
else:
|
||||
# Session is dead — check if it completed or crashed
|
||||
if (
|
||||
# Session is dead — check if it completed or crashed. The
|
||||
# runner markers in the retained output are conclusive
|
||||
# (DOWNLOAD_OK only prints after exit 0), so check them before
|
||||
# the cache probe, which can't see ollama pulls at all.
|
||||
marker = classify_dead_download(full_snapshot) if task_type == "download" else None
|
||||
if marker is not None:
|
||||
status, download_zero_files = marker
|
||||
if status == "completed" and not progress_text:
|
||||
progress_text = "Download complete"
|
||||
elif (
|
||||
task_type == "download"
|
||||
and not download_has_incomplete_evidence
|
||||
and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or ""))
|
||||
and _download_cache_complete(_payload.get("repo_id") or model, remote, str(_tport or ""), _payload.get("local_dir") or "")
|
||||
):
|
||||
status = "completed"
|
||||
if not progress_text:
|
||||
|
||||
Reference in New Issue
Block a user