mirror of
https://github.com/pewdiepie-archdaemon/odysseus.git
synced 2026-06-15 17:25:26 -04:00
2e37d72155
Render streamed markdown incrementally (freeze finalized blocks, re-render only the growing tail) instead of re-rendering the whole message every token, which recreated every <pre> and dropped CSS :hover.
191 lines
7.4 KiB
JavaScript
191 lines
7.4 KiB
JavaScript
// streamingSegmenter.js
|
|
//
|
|
// Pure logic for incremental ("block-at-a-time") streaming markdown rendering.
|
|
//
|
|
// While an assistant message streams in, re-rendering the whole accumulated
|
|
// markdown on every token is wasteful (O(N^2)) and recreates DOM nodes, which
|
|
// makes code-block hover buttons flicker. The fix is to FREEZE the leading part
|
|
// of the message that can no longer change, and only re-render the growing tail.
|
|
//
|
|
// This module answers the one hard question that makes freezing safe:
|
|
//
|
|
// Given the full markdown received so far, how many leading characters can
|
|
// be finalized without changing the rendered output?
|
|
//
|
|
// The contract callers rely on (`render` is the canonical markdown renderer):
|
|
//
|
|
// const n = splitFinalized(text, render);
|
|
// render(text.slice(0, n)) + render(text.slice(n)) === render(text)
|
|
//
|
|
// The module is intentionally DOM-free and renderer-agnostic so it can be unit
|
|
// tested in isolation and reused for any markdown renderer with no long-range
|
|
// cross-block dependencies (no reference-style links / footnotes).
|
|
//
|
|
// Known limitations (both bounded by the same mitigation):
|
|
// - cutIsRenderSafe proves only PRESENT-tense equivalence. If the renderer pairs
|
|
// an inline delimiter across a blank line (e.g. markdown.js will turn
|
|
// `*a\n\nb*` into emphasis spanning two paragraphs), a block frozen before the
|
|
// closing delimiter arrives can disagree with the final full render.
|
|
// - afterClosedFence boundaries are trusted without the equivalence check, so a
|
|
// fence the real renderer parses differently (e.g. a stray 4-backtick line) can
|
|
// be mis-detected as a close.
|
|
// Both only occur for input the renderer itself handles oddly, and both are
|
|
// transient: chat.js re-renders the finished message from source, so the settled
|
|
// output is always canonical.
|
|
|
|
// A fenced-code delimiter line: up to 3 leading spaces, then >=3 backticks or
|
|
// tildes, then an optional info string.
|
|
const FENCE_RE = /^ {0,3}(`{3,}|~{3,})(.*)$/;
|
|
|
|
/**
|
|
* Scan `text` starting at `fromOffset` — which MUST be at top level (callers only
|
|
* ever advance to a finalized boundary, never into a fence) — and collect the
|
|
* candidate cut points.
|
|
*
|
|
* @returns {{ boundaries: Array<{offset:number, afterClosedFence:boolean}>, inFence:boolean }}
|
|
* - A blank-line run at top level yields a boundary at the start of the next
|
|
* non-blank line (`afterClosedFence: false`).
|
|
* - A fence close yields a boundary just past the closing fence line
|
|
* (`afterClosedFence: true`) — such a cut is unconditionally safe, since
|
|
* nothing can ever merge into a completed code block.
|
|
*/
|
|
function findBoundaries(text, fromOffset) {
|
|
const boundaries = [];
|
|
const n = text.length;
|
|
let inFence = false;
|
|
let fenceMarker = '';
|
|
let i = fromOffset;
|
|
|
|
while (i < n) {
|
|
const nl = text.indexOf('\n', i);
|
|
const lineEnd = nl === -1 ? n : nl;
|
|
const afterNl = nl === -1 ? n : nl + 1;
|
|
const line = text.slice(i, lineEnd);
|
|
const fence = line.match(FENCE_RE);
|
|
|
|
if (fence) {
|
|
const marker = fence[1];
|
|
if (!inFence) {
|
|
inFence = true;
|
|
fenceMarker = marker;
|
|
} else if (
|
|
marker[0] === fenceMarker[0] &&
|
|
marker.length >= fenceMarker.length &&
|
|
fence[2].trim() === '' // a closing fence carries no info string
|
|
) {
|
|
inFence = false;
|
|
fenceMarker = '';
|
|
boundaries.push({ offset: afterNl, afterClosedFence: true });
|
|
}
|
|
i = afterNl;
|
|
} else if (!inFence && line.trim() === '') {
|
|
// Consume the entire run of blank lines; the boundary is the start of the
|
|
// next non-blank line so the finalized side owns the separator and the tail
|
|
// starts clean.
|
|
let j = afterNl;
|
|
while (j < n) {
|
|
const nl2 = text.indexOf('\n', j);
|
|
const lineEnd2 = nl2 === -1 ? n : nl2;
|
|
if (text.slice(j, lineEnd2).trim() !== '') break;
|
|
if (nl2 === -1) {
|
|
j = n;
|
|
break;
|
|
}
|
|
j = nl2 + 1;
|
|
}
|
|
boundaries.push({ offset: j, afterClosedFence: false });
|
|
i = j;
|
|
} else {
|
|
i = afterNl;
|
|
}
|
|
}
|
|
|
|
return { boundaries, inFence };
|
|
}
|
|
|
|
/**
|
|
* Does cutting between `before` and `after` leave the rendered output unchanged?
|
|
* This is the self-verifying safety check: it directly compares rendering the two
|
|
* sides separately against rendering them joined, so constructs that span the cut
|
|
* (loose lists, setext headings, lazy blockquote continuations, tables) are caught
|
|
* with no hand-coded grammar rules.
|
|
*
|
|
* Renderer non-determinism (e.g. mermaid ids seeded with Date.now()) can only make
|
|
* this return a false negative, never a false positive — so the bias is always
|
|
* toward under-finalizing, which is the safe direction.
|
|
*/
|
|
function cutIsRenderSafe(before, after, render) {
|
|
return render(before) + render(after) === render(before + after);
|
|
}
|
|
|
|
/**
|
|
* Return how many leading characters of `text` can be safely finalized, scanning
|
|
* forward from `committedLen` (the amount already finalized).
|
|
*
|
|
* Guarantees `render(text.slice(0, n)) + render(text.slice(n)) === render(text)`,
|
|
* and `committedLen <= n <= text.length`.
|
|
*
|
|
* @param {string} text Full markdown accumulated so far.
|
|
* @param {(src:string)=>string} render Canonical markdown renderer.
|
|
* @param {number} [committedLen=0] Characters already finalized (always a prior boundary).
|
|
* @returns {number}
|
|
*/
|
|
export function splitFinalized(text, render, committedLen = 0) {
|
|
const { boundaries } = findBoundaries(text, committedLen);
|
|
|
|
let best = committedLen;
|
|
let segStart = committedLen;
|
|
|
|
for (let k = 0; k < boundaries.length; k++) {
|
|
const { offset, afterClosedFence } = boundaries[k];
|
|
|
|
if (afterClosedFence) {
|
|
// A completed code block — always safe to freeze through here.
|
|
best = offset;
|
|
} else {
|
|
// A prose/list/table boundary. We need a following block to compare
|
|
// against (the last block must stay live, it can still grow), and the cut
|
|
// must be render-equivalent locally.
|
|
const nextOffset = k + 1 < boundaries.length ? boundaries[k + 1].offset : text.length;
|
|
const before = text.slice(segStart, offset);
|
|
const after = text.slice(offset, nextOffset);
|
|
if (after.trim() !== '' && cutIsRenderSafe(before, after, render)) {
|
|
best = offset;
|
|
}
|
|
}
|
|
segStart = offset;
|
|
}
|
|
|
|
return best;
|
|
}
|
|
|
|
/**
|
|
* If `text` begins with a fenced-code opener whose fence never closes, describe it
|
|
* so the renderer can stream the code in append-mode instead of re-rendering it.
|
|
* Returns `{ lang, contentStart }` (contentStart = offset of the first code char),
|
|
* or null when `text` does not start with a still-open fence.
|
|
*
|
|
* The opener line must be complete (terminated by a newline) so the info string /
|
|
* language is known before append-mode begins.
|
|
*/
|
|
export function describeOpenFence(text) {
|
|
const open = text.match(/^( {0,3})(`{3,}|~{3,})([^\n]*)\n/);
|
|
if (!open) return null;
|
|
const marker = open[2];
|
|
const contentStart = open[0].length;
|
|
|
|
for (let i = contentStart; i < text.length; ) {
|
|
const nl = text.indexOf('\n', i);
|
|
const line = text.slice(i, nl === -1 ? text.length : nl);
|
|
const close = line.match(/^ {0,3}(`{3,}|~{3,})\s*$/);
|
|
if (close && close[1][0] === marker[0] && close[1].length >= marker.length) {
|
|
return null; // the fence closes — let the normal finalize path handle it
|
|
}
|
|
if (nl === -1) break;
|
|
i = nl + 1;
|
|
}
|
|
|
|
const lang = (open[3] || '').trim().split(/\s+/)[0] || '';
|
|
return { lang, contentStart };
|
|
}
|