Files
odysseus/scripts/agent_migration_manifest.py
T
spooky f23e2e6ffb docs: add agent migration manifest helper (#3028)
* docs: add agent migration manifest helper

* fix: use stat+streamed hash for metadata-only archive scans

When include_content is false, skip reading full file content and
only stat+stream-hash for size and sha256. Avoids spurious skipped-
content warnings and keeps large-export previews fast and clean.

Closes review feedback on PR #3028.

* fix: skip symlinked migration inputs

* fix: stream archive traversal warnings

* feat: stage conversation threads in agent migration manifests
2026-06-15 15:57:33 +09:00

636 lines
22 KiB
Python
Executable File

#!/usr/bin/env python3
"""Build a neutral agent migration manifest.
This helper is intentionally read-only. It does not import the Odysseus
application package, write to data/, call an LLM, or apply anything. It turns
common agent export shapes into a portable JSON manifest that Odysseus can
preview or import later.
"""
from __future__ import annotations
import argparse
import hashlib
import json
import mimetypes
import sys
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable
# Identifier emitted as the manifest's "schema_version" field.
SCHEMA_VERSION = "agent-migration.v1"
# Lower-case file suffixes that looks_textual() accepts as text without
# falling back to a mimetypes guess.
TEXT_EXTENSIONS = {
".cfg",
".conf",
".csv",
".json",
".log",
".md",
".markdown",
".py",
".rst",
".toml",
".txt",
".yaml",
".yml",
}
@dataclass(frozen=True)
class InputWarning:
    """Non-fatal problem recorded while reading a migration input."""

    # Path of the input the warning refers to, stored as a string.
    path: str
    # Human-readable description of what was skipped or could not be read.
    message: str
def utc_now_iso() -> str:
    """Return the current UTC time, truncated to seconds, as ISO-8601 with a ``Z`` suffix."""
    now = datetime.now(timezone.utc)
    stamp = now.replace(microsecond=0).isoformat()
    # isoformat() renders UTC as "+00:00"; the manifest uses the shorter "Z".
    return stamp.replace("+00:00", "Z")
def sha256_text(text: str) -> str:
    """Return the hexadecimal SHA-256 digest of *text* encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
def sha256_bytes(data: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of *data*."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
def sha256_path(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 64 KiB chunks so large files are never held in memory
    in full.
    """
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                break
            hasher.update(block)
    return hasher.hexdigest()
def stable_id(kind: str, source_name: str, *parts: Any) -> str:
    """Return a deterministic ``"<kind>:<16 hex chars>"`` identifier.

    The hex part is the first 16 characters of the SHA-256 of *kind*,
    *source_name* and the stringified *parts*, joined with the ASCII unit
    separator (``\\x1f``).
    """
    fields = [kind, source_name]
    fields.extend(str(part) for part in parts)
    digest = hashlib.sha256("\x1f".join(fields).encode("utf-8")).hexdigest()
    return f"{kind}:{digest[:16]}"
def read_json(path: Path) -> Any:
    """Parse the UTF-8 JSON file at *path* and return the decoded value."""
    document = path.read_text(encoding="utf-8")
    return json.loads(document)
def normalize_category(value: Any) -> str:
    """Return *value* as a trimmed, lower-case category, defaulting to ``"fact"``.

    Falsy values and values that are blank after trimming both map to
    ``"fact"``.
    """
    raw = value if value else "fact"
    cleaned = str(raw).strip().lower()
    return cleaned if cleaned else "fact"
def normalize_memory_text(item: Any) -> str:
    """Extract the trimmed text of a memory entry, or ``""`` when there is none.

    A string entry is returned trimmed. For a dict, the first of ``text``,
    ``content``, ``memory`` and ``value`` that holds a non-blank string is
    returned. Any other type yields ``""``.
    """
    if isinstance(item, str):
        return item.strip()
    if not isinstance(item, dict):
        return ""
    for field in ("text", "content", "memory", "value"):
        candidate = item.get(field)
        if not isinstance(candidate, str):
            continue
        trimmed = candidate.strip()
        if trimmed:
            return trimmed
    return ""
def memory_metadata(item: Any, source_path: Path, index: int) -> dict[str, Any]:
    """Build the provenance metadata dict for one memory entry.

    The result always contains ``source_path`` and ``source_index``. When
    *item* is a dict, selected keys present on it are copied over under a
    ``source_`` prefix.
    """
    metadata: dict[str, Any] = {"source_path": str(source_path), "source_index": index}
    if not isinstance(item, dict):
        return metadata
    carried = ("id", "timestamp", "created_at", "updated_at", "source", "tags", "pinned")
    metadata.update({f"source_{key}": item.get(key) for key in carried if key in item})
    return metadata
def payload_items(payload: Any, keys: tuple[str, ...]) -> Any:
    """Unwrap a list stored under one of *keys* in a dict payload.

    Returns the first list found, trying *keys* in order. A payload that is
    not a dict, or a dict with no list under any key, is returned unchanged.
    """
    if not isinstance(payload, dict):
        return payload
    for key in keys:
        candidate = payload.get(key)
        if isinstance(candidate, list):
            return candidate
    return payload
def collect_memory_json(path: Path, source_name: str) -> tuple[list[dict[str, Any]], list[InputWarning]]:
    """Read a JSON memory export and return ``(items, warnings)``.

    The file may be a JSON list, or an object holding the list under
    ``memories``, ``memory``, ``items`` or ``data``. Entries without text are
    skipped with a warning, as are entries whose case-insensitive text repeats
    an earlier entry in the same file.
    """
    try:
        payload = read_json(path)
    except Exception as exc:
        return [], [InputWarning(str(path), f"could not read JSON: {exc}")]

    entries = payload_items(payload, ("memories", "memory", "items", "data"))
    if not isinstance(entries, list):
        return [], [InputWarning(str(path), "expected a JSON list or an object containing a memory list")]

    location = str(path)
    problems: list[InputWarning] = []
    collected: list[dict[str, Any]] = []
    known_digests: set[str] = set()
    for position, entry in enumerate(entries):
        text = normalize_memory_text(entry)
        if not text:
            problems.append(InputWarning(location, f"skipped memory at index {position}: missing text"))
            continue

        # Deduplicate on the trimmed, lower-cased text.
        digest = sha256_text(text.strip().lower())
        if digest in known_digests:
            problems.append(InputWarning(location, f"skipped duplicate memory at index {position}"))
            continue
        known_digests.add(digest)

        if isinstance(entry, dict):
            category = normalize_category(entry.get("category"))
            origin = str(entry.get("source") or source_name)
        else:
            category = normalize_category("fact")
            origin = source_name

        collected.append(
            {
                "id": stable_id("memory", source_name, path, position, digest),
                "kind": "memory",
                "text": text,
                "category": category,
                "source": origin,
                "metadata": memory_metadata(entry, path, position),
            }
        )
    return collected, problems
def normalize_timestamp(value: Any) -> str | None:
    """Normalize a timestamp-like value to a string, or ``None`` when absent.

    ``None`` and ``""`` yield ``None``. Numbers are treated as Unix epoch
    seconds and rendered as second-precision UTC ISO-8601 with a ``Z`` suffix.
    Numbers that cannot be converted, and all other values, are returned via
    ``str()``.
    """
    if value is None or value == "":
        return None
    if not isinstance(value, (int, float)):
        return str(value)
    try:
        moment = datetime.fromtimestamp(float(value), timezone.utc)
    except (OverflowError, OSError, ValueError):
        # Out-of-range or non-finite epoch values are kept as their raw text.
        return str(value)
    return moment.replace(microsecond=0).isoformat().replace("+00:00", "Z")
def normalize_role(value: Any) -> str:
    """Map a raw speaker label onto a normalized lower-case role.

    ``human``/``user`` become ``"user"``; ``assistant``/``ai``/``bot``/``model``
    become ``"assistant"``. Any other non-blank label is returned trimmed and
    lower-cased. Falsy or blank input yields ``"unknown"``.
    """
    aliases = {
        "human": "user",
        "user": "user",
        "assistant": "assistant",
        "ai": "assistant",
        "bot": "assistant",
        "model": "assistant",
    }
    label = str(value or "unknown").strip().lower()
    canonical = aliases.get(label, label)
    return canonical if canonical else "unknown"
def content_part_text(part: Any) -> str:
    """Return the text carried by one message content part, or ``""``.

    A string part is returned as-is. For a dict part, the first of ``text``,
    ``content`` and ``value`` that holds a string is returned. If none does,
    dict values under those keys are searched the same way, so nested text
    objects such as ``{"type": "text", "text": {"value": "..."}}`` are
    supported. Any other input yields ``""``.
    """
    if isinstance(part, str):
        return part
    if not isinstance(part, dict):
        return ""
    keys = ("text", "content", "value")
    # Direct string values win, in key order.
    for key in keys:
        value = part.get(key)
        if isinstance(value, str):
            return value
    # Only when no direct string exists, look one level deeper (recursively)
    # for parts that wrap their text in a nested object.
    for key in keys:
        value = part.get(key)
        if isinstance(value, dict):
            nested = content_part_text(value)
            if nested:
                return nested
    return ""
def normalize_message_text(message: dict[str, Any]) -> str:
    """Extract the raw text of a message dict, or ``""`` when none is found.

    ``content`` may be a string, a list of parts, or a dict that holds either
    a ``parts`` list or a string under ``text``/``content``/``value``. Lists of
    parts are joined with newlines after trimming each part and dropping
    blanks. When ``content`` yields nothing, the message-level ``text``,
    ``body`` and ``message`` keys are tried.
    """

    def joined(parts: list[Any]) -> str:
        trimmed = (content_part_text(part).strip() for part in parts)
        return "\n".join(piece for piece in trimmed if piece)

    content = message.get("content")
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        return joined(content)
    if isinstance(content, dict):
        parts = content.get("parts")
        if isinstance(parts, list):
            return joined(parts)
        for field in ("text", "content", "value"):
            nested = content.get(field)
            if isinstance(nested, str):
                return nested

    for field in ("text", "body", "message"):
        fallback = message.get(field)
        if isinstance(fallback, str):
            return fallback
    return ""
def normalize_message(message: dict[str, Any]) -> dict[str, Any] | None:
    """Convert a raw message dict into the manifest's message shape.

    Returns ``None`` when the message has no non-blank text. Otherwise the
    result has ``role`` and ``text``, plus ``created_at`` when a timestamp is
    present and ``source_id`` when the message has an ``id``.
    """
    text = normalize_message_text(message).strip()
    if not text:
        return None

    author = message.get("author")
    if not isinstance(author, dict):
        author = {}
    raw_role = (
        message.get("role")
        or message.get("sender")
        or message.get("speaker")
        or author.get("role")
        or author.get("name")
    )
    normalized: dict[str, Any] = {"role": normalize_role(raw_role), "text": text}

    raw_time = message.get("created_at") or message.get("create_time") or message.get("timestamp")
    timestamp = normalize_timestamp(raw_time)
    if timestamp:
        normalized["created_at"] = timestamp

    message_id = message.get("id")
    if message_id is not None:
        normalized["source_id"] = str(message_id)
    return normalized
def chatgpt_mapping_messages(conversation: dict[str, Any]) -> list[dict[str, Any]]:
    """Flatten a ChatGPT-style ``mapping`` of nodes into ordered messages.

    Returns ``[]`` when ``mapping`` is not a dict. Nodes without a dict
    ``message`` are ignored, as are messages that normalize to nothing. The
    result is ordered by ``create_time`` when it converts to a float, else by
    the node's position in the mapping; position also breaks ties.
    """
    mapping = conversation.get("mapping")
    if not isinstance(mapping, dict):
        return []

    keyed: list[tuple[float, int, dict[str, Any]]] = []
    for position, node in enumerate(mapping.values()):
        message = node.get("message") if isinstance(node, dict) else None
        if not isinstance(message, dict):
            continue
        try:
            order = float(message.get("create_time"))
        except (TypeError, ValueError):
            order = float(position)
        normalized = normalize_message(message)
        if normalized:
            keyed.append((order, position, normalized))

    keyed.sort(key=lambda row: row[:2])
    return [row[2] for row in keyed]
def conversation_messages(conversation: dict[str, Any]) -> tuple[list[dict[str, Any]], str]:
    """Return ``(messages, format_hint)`` for one conversation object.

    A ChatGPT-style mapping is preferred when it yields messages (hint
    ``"chatgpt_mapping"``). Otherwise the first of ``messages``,
    ``chat_messages`` and ``turns`` that holds a list is used, and that key is
    the hint. With neither, the result is ``([], "unknown")``.
    """
    mapped = chatgpt_mapping_messages(conversation)
    if mapped:
        return mapped, "chatgpt_mapping"

    for key in ("messages", "chat_messages", "turns"):
        raw_messages = conversation.get(key)
        if not isinstance(raw_messages, list):
            continue
        messages: list[dict[str, Any]] = []
        for raw in raw_messages:
            if not isinstance(raw, dict):
                continue
            normalized = normalize_message(raw)
            if normalized:
                messages.append(normalized)
        # The first list-valued key decides the result, even if it is empty.
        return messages, key
    return [], "unknown"
def conversation_title(conversation: dict[str, Any], index: int) -> str:
    """Return a display title for a conversation.

    Uses the first non-blank string among ``title``, ``name`` and ``summary``
    (trimmed), else ``"Conversation <index + 1>"``.
    """
    for field in ("title", "name", "summary"):
        candidate = conversation.get(field)
        if not isinstance(candidate, str):
            continue
        trimmed = candidate.strip()
        if trimmed:
            return trimmed
    return f"Conversation {index + 1}"
def collect_conversation_json(
    path: Path,
    source_name: str,
    *,
    include_content: bool = False,
    max_messages: int = 2000,
) -> tuple[list[dict[str, Any]], list[InputWarning]]:
    """Read a conversation export and return ``(items, warnings)``.

    The file may be a list of conversations, a single conversation object, or
    an object holding the list under ``conversations``, ``conversation``,
    ``items`` or ``data``. Each conversation with text messages becomes one
    ``conversation_thread`` item. Messages are embedded only when
    *include_content* is true and the thread has at most *max_messages*
    messages; otherwise only metadata is emitted.
    """
    try:
        payload = read_json(path)
    except Exception as exc:
        return [], [InputWarning(str(path), f"could not read JSON: {exc}")]

    threads = payload_items(payload, ("conversations", "conversation", "items", "data"))
    if isinstance(threads, dict):
        # A bare conversation object is treated as a one-element list.
        threads = [threads]
    if not isinstance(threads, list):
        return [], [InputWarning(str(path), "expected a JSON list or an object containing a conversation list")]

    location = str(path)
    problems: list[InputWarning] = []
    collected: list[dict[str, Any]] = []
    for position, thread in enumerate(threads):
        if not isinstance(thread, dict):
            problems.append(InputWarning(location, f"skipped conversation at index {position}: expected object"))
            continue
        messages, format_hint = conversation_messages(thread)
        if not messages:
            problems.append(
                InputWarning(location, f"skipped conversation at index {position}: no text messages found")
            )
            continue

        title = conversation_title(thread, position)
        source_id = thread.get("id") or thread.get("uuid") or thread.get("conversation_id")
        transcript = "\n".join(f"{msg['role']}:{msg['text']}" for msg in messages)
        text_digest = sha256_text(transcript)

        metadata: dict[str, Any] = {
            "source_path": location,
            "source_index": position,
            "source_format": format_hint,
            "message_count": len(messages),
            "text_sha256": text_digest,
            "content_included": False,
        }
        if source_id is not None:
            metadata["source_id"] = str(source_id)
        for key in ("create_time", "created_at", "update_time", "updated_at"):
            timestamp = normalize_timestamp(thread.get(key))
            if timestamp:
                metadata[f"source_{key}"] = timestamp

        item: dict[str, Any] = {
            "id": stable_id("conversation", source_name, path, source_id or position, text_digest),
            "kind": "conversation_thread",
            "title": title,
            "source": source_name,
            "metadata": metadata,
        }
        if include_content and len(messages) > max_messages:
            problems.append(
                InputWarning(
                    location,
                    f"skipped conversation content at index {position}: over {max_messages} messages",
                )
            )
        elif include_content:
            item["messages"] = messages
            metadata["content_included"] = True
        collected.append(item)
    return collected, problems
def parse_skill_frontmatter(text: str) -> dict[str, Any]:
    """Parse the simple ``key: value`` frontmatter at the top of a SKILL.md.

    This is a deliberately small line-based reader, not a YAML parser. The
    frontmatter must open with ``---`` at the very start of *text* (an
    optional UTF-8 byte-order mark is tolerated) and close with a line
    beginning with ``---``. Blank lines, ``#`` comments and lines without a
    colon are ignored. Each remaining line is split on its first colon.

    One pair of matching surrounding quotes (``"..."`` or ``'...'``) is
    removed from a value. Quotes inside the value, or unbalanced quotes, are
    preserved, so ``"He said 'hi'"`` yields ``He said 'hi'``.

    Returns an empty dict when no well-formed frontmatter block is found.
    """
    # A BOM survives read_text(encoding="utf-8") and would hide the opener.
    if text.startswith("\ufeff"):
        text = text[1:]
    if not text.startswith("---"):
        return {}
    end = text.find("\n---", 3)
    if end < 0:
        return {}

    frontmatter: dict[str, Any] = {}
    for line in text[3:end].strip().splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#") or ":" not in line:
            continue
        key, _, value = line.partition(":")
        key = key.strip()
        value = value.strip()
        # Strip exactly one matching pair; chained .strip('"').strip("'")
        # would also eat apostrophes that belong to the value.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        if key:
            frontmatter[key] = value
    return frontmatter
def collect_skill_dir(path: Path, source_name: str) -> tuple[list[dict[str, Any]], list[InputWarning]]:
    """Collect every ``SKILL.md`` under *path* and return ``(items, warnings)``.

    *path* must be a real (non-symlink) directory; otherwise a single warning
    is returned and nothing is read. The directory is scanned recursively in
    sorted order. Symlinked or unreadable skill files are skipped with a
    warning. A failure while scanning the tree is reported as a warning
    rather than raised, matching how archive directory scans are handled.
    """
    warnings: list[InputWarning] = []
    if path.is_symlink():
        return [], [InputWarning(str(path), "skills path is a symlink; skipped")]
    if not path.exists():
        return [], [InputWarning(str(path), "skills directory does not exist")]
    if not path.is_dir():
        return [], [InputWarning(str(path), "skills path is not a directory")]

    try:
        skill_paths = sorted(path.rglob("SKILL.md"))
    except OSError as exc:
        return [], [InputWarning(str(path), f"could not scan skills directory: {exc}")]

    items: list[dict[str, Any]] = []
    for skill_path in skill_paths:
        if skill_path.is_symlink():
            warnings.append(InputWarning(str(skill_path), "skipped symlinked skill file"))
            continue
        try:
            text = skill_path.read_text(encoding="utf-8")
        except Exception as exc:
            warnings.append(InputWarning(str(skill_path), f"could not read skill: {exc}"))
            continue

        frontmatter = parse_skill_frontmatter(text)
        fallback_name = skill_path.parent.name
        name = str(frontmatter.get("name") or fallback_name).strip() or fallback_name
        # Hash once; the digest feeds both the stable id and the metadata.
        digest = sha256_text(text)
        items.append(
            {
                "id": stable_id("skill", source_name, skill_path, digest),
                "kind": "skill",
                "name": name,
                "category": str(frontmatter.get("category") or "general"),
                "source": source_name,
                "format": "SKILL.md",
                "content": text,
                "metadata": {
                    "source_path": str(skill_path),
                    "sha256": digest,
                    "frontmatter": frontmatter,
                },
            }
        )
    return items, warnings
def looks_textual(path: Path) -> bool:
    """Return True when *path* looks like a text file, judged by its name only.

    A suffix listed in ``TEXT_EXTENSIONS`` (case-insensitive) is accepted
    directly. Otherwise the guessed MIME type must be ``text/*`` or
    ``application/json``. File content is never inspected.
    """
    if path.suffix.lower() in TEXT_EXTENSIONS:
        return True
    mime_type = mimetypes.guess_type(str(path))[0]
    if not mime_type:
        return False
    return mime_type.startswith("text/") or mime_type in {"application/json"}
def iter_archive_dir(path: Path) -> Iterable[Path | InputWarning]:
    """Yield regular files under directory *path*, depth-first in sorted order.

    Symlinked children are not followed; an ``InputWarning`` is yielded in
    their place. If the directory cannot be listed, a single warning is
    yielded and iteration of that directory stops.
    """
    try:
        entries = sorted(path.iterdir())
    except Exception as exc:
        yield InputWarning(str(path), f"could not scan archive directory: {exc}")
        return

    for entry in entries:
        if entry.is_symlink():
            yield InputWarning(str(entry), "skipped symlinked archive path")
        elif entry.is_file():
            yield entry
        elif entry.is_dir():
            yield from iter_archive_dir(entry)
def iter_archive_files(paths: Iterable[Path]) -> Iterable[Path | InputWarning]:
    """Yield the regular files reachable from *paths*, in the given order.

    Files are yielded directly and directories are expanded through
    ``iter_archive_dir``. A symlinked path yields an ``InputWarning`` instead.
    Paths that are neither file nor directory are silently ignored.
    """
    for candidate in paths:
        if candidate.is_symlink():
            yield InputWarning(str(candidate), "skipped symlinked archive path")
        elif candidate.is_file():
            yield candidate
        elif candidate.is_dir():
            yield from iter_archive_dir(candidate)
def _read_archive_text(path: Path) -> tuple[str, bool]:
    """Read *path* as UTF-8 and return ``(text, decoded_with_replacement)``.

    Falls back to ``errors="replace"`` when strict decoding fails. ``OSError``
    is left to propagate so the caller can report it.
    """
    try:
        return path.read_text(encoding="utf-8"), False
    except UnicodeDecodeError:
        return path.read_text(encoding="utf-8", errors="replace"), True


def collect_archive_paths(
    paths: list[Path],
    source_name: str,
    *,
    include_content: bool = False,
    max_bytes: int = 256_000,
) -> tuple[list[dict[str, Any]], list[InputWarning]]:
    """Turn archive files and directories into ``archive_document`` items.

    Symlinked, missing, or special top-level paths are skipped with a warning.
    Every remaining text-looking file produces an item carrying its size and
    a streamed SHA-256. Content is embedded only when *include_content* is
    true and the file is at most *max_bytes* bytes. A file whose content
    cannot be read still yields a metadata-only item plus a warning, so one
    unreadable file never aborts the whole manifest.

    Returns ``(items, warnings)``.
    """
    warnings: list[InputWarning] = []
    items: list[dict[str, Any]] = []

    existing_paths: list[Path] = []
    for path in paths:
        if path.is_symlink():
            warnings.append(InputWarning(str(path), "archive path is a symlink; skipped"))
            continue
        if not path.exists():
            warnings.append(InputWarning(str(path), "archive path does not exist"))
            continue
        if not path.is_file() and not path.is_dir():
            warnings.append(InputWarning(str(path), "archive path is not a file or directory"))
            continue
        existing_paths.append(path)

    for entry in iter_archive_files(existing_paths):
        if isinstance(entry, InputWarning):
            warnings.append(entry)
            continue
        path = entry
        if not looks_textual(path):
            warnings.append(InputWarning(str(path), "skipped non-text archive file"))
            continue
        try:
            size = path.stat().st_size
        except Exception as exc:
            warnings.append(InputWarning(str(path), f"could not stat archive file: {exc}"))
            continue
        try:
            file_hash = sha256_path(path)
        except Exception as exc:
            warnings.append(InputWarning(str(path), f"could not hash archive file: {exc}"))
            continue

        embed_content = include_content and size <= max_bytes
        if include_content and not embed_content:
            warnings.append(InputWarning(str(path), f"skipped archive content over {max_bytes} bytes"))

        archive_item: dict[str, Any] = {
            "id": stable_id("archive", source_name, path, file_hash),
            "kind": "archive_document",
            "title": path.name,
            "source": source_name,
            "metadata": {
                "source_path": str(path),
                "size_bytes": size,
                "sha256": file_hash,
            },
        }
        if embed_content:
            try:
                content, replaced = _read_archive_text(path)
            except OSError as exc:
                # The file may have vanished or become unreadable since it was
                # hashed; keep the metadata and report the failed read.
                warnings.append(InputWarning(str(path), f"could not read archive content: {exc}"))
            else:
                archive_item["content"] = content
                if replaced:
                    archive_item["metadata"]["decoded_with_replacement"] = True
        items.append(archive_item)
    return items, warnings
def build_manifest(args) -> dict[str, Any]:
    """Assemble the full manifest dict from parsed command-line *args*.

    Inputs are processed in a fixed order: memory JSON files, skill
    directories, conversation JSON files, then archive paths. The manifest
    holds the schema version, generation time, source description, summary
    counts, all items and all warnings.
    """
    items: list[dict[str, Any]] = []
    warnings: list[InputWarning] = []

    def absorb(result: tuple[list[dict[str, Any]], list[InputWarning]]) -> None:
        collected, problems = result
        items.extend(collected)
        warnings.extend(problems)

    for memory_path in args.memory_json:
        absorb(collect_memory_json(memory_path, args.source_name))
    for skills_path in args.skills_dir:
        absorb(collect_skill_dir(skills_path, args.source_name))
    for conversation_path in args.conversation_json:
        absorb(
            collect_conversation_json(
                conversation_path,
                args.source_name,
                include_content=args.include_conversation_content,
                max_messages=args.max_conversation_messages,
            )
        )
    if args.archive:
        absorb(
            collect_archive_paths(
                args.archive,
                args.source_name,
                include_content=args.include_archive_content,
                max_bytes=args.max_archive_bytes,
            )
        )

    counts: dict[str, int] = {}
    for item in items:
        kind = item["kind"]
        counts.setdefault(kind, 0)
        counts[kind] += 1

    source = {"name": args.source_name, "kind": args.source_kind}
    summary = {
        "item_count": len(items),
        "counts_by_kind": counts,
        "warning_count": len(warnings),
    }
    return {
        "schema_version": SCHEMA_VERSION,
        "generated_at": utc_now_iso(),
        "source": source,
        "summary": summary,
        "items": items,
        "warnings": [{"path": warning.path, "message": warning.message} for warning in warnings],
    }
def parse_args(argv: list[str] | None = None):
    """Parse command-line options and return the argparse namespace.

    *argv* defaults to ``sys.argv[1:]`` when ``None``. The four input options
    (``--memory-json``, ``--skills-dir``, ``--archive``,
    ``--conversation-json``) may each be repeated and collect ``Path`` values.
    """
    parser = argparse.ArgumentParser(description="Build a neutral Odysseus agent migration manifest.")
    add = parser.add_argument

    add("--source-name", default="agent-export", help="Human-readable source name.")
    add("--source-kind", default="generic", help="Source adapter kind, e.g. generic, openclaw, hermes.")

    repeatable_paths = (
        ("--memory-json", "JSON memory export. May be a list, or an object containing memories/items/data."),
        ("--skills-dir", "Directory containing SKILL.md files. Scanned recursively."),
        ("--archive", "Text/Markdown/JSON file or directory to preserve as archive documents."),
        (
            "--conversation-json",
            "Conversation export JSON. Supports generic message lists and ChatGPT-style conversations.json.",
        ),
    )
    for flag, description in repeatable_paths:
        # A fresh list per option keeps the defaults independent of each other.
        add(flag, action="append", type=Path, default=[], help=description)

    add(
        "--include-archive-content",
        action="store_true",
        help="Embed archive document content in the manifest. By default only metadata is included.",
    )
    add(
        "--max-archive-bytes",
        type=int,
        default=256_000,
        help="Maximum bytes to embed per archive file when --include-archive-content is used.",
    )
    add(
        "--include-conversation-content",
        action="store_true",
        help="Embed normalized conversation messages. By default only thread metadata is included.",
    )
    add(
        "--max-conversation-messages",
        type=int,
        default=2000,
        help="Maximum messages to embed per conversation when --include-conversation-content is used.",
    )
    add("--output", type=Path, help="Write manifest JSON to this path instead of stdout.")
    add("--compact", action="store_true", help="Write compact JSON without indentation.")
    return parser.parse_args(argv)
def main(argv: list[str] | None = None) -> int:
    """Build the manifest and write it to ``--output`` or stdout; return 0.

    With ``--compact`` the JSON is written on one line with no trailing
    newline. Otherwise it is indented by two spaces and newline-terminated.
    Keys are always sorted. Missing parent directories of the output path are
    created.
    """
    args = parse_args(argv)
    manifest = build_manifest(args)

    if args.compact:
        text = json.dumps(manifest, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
    else:
        text = json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True) + "\n"

    if not args.output:
        sys.stdout.write(text)
        return 0

    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(text, encoding="utf-8")
    return 0
# Script entry point: exit with main()'s return value as the process status.
if __name__ == "__main__":
    sys.exit(main())