Files
odysseus/static/js/streamingSegmenter.js
Merajul Arefin 2e37d72155 fix(chat): stop code-block button flicker during streaming (#3023)
Render streamed markdown incrementally (freeze finalized blocks,
re-render only the growing tail) instead of re-rendering the whole
message every token, which recreated every <pre> and dropped CSS :hover.
2026-06-06 04:08:54 -06:00

191 lines
7.4 KiB
JavaScript

// streamingSegmenter.js
//
// Pure logic for incremental ("block-at-a-time") streaming markdown rendering.
//
// While an assistant message streams in, re-rendering the whole accumulated
// markdown on every token is wasteful (O(N^2)) and recreates DOM nodes, which
// makes code-block hover buttons flicker. The fix is to FREEZE the leading part
// of the message that can no longer change, and only re-render the growing tail.
//
// This module answers the one hard question that makes freezing safe:
//
// Given the full markdown received so far, how many leading characters can
// be finalized without changing the rendered output?
//
// The contract callers rely on (`render` is the canonical markdown renderer):
//
// const n = splitFinalized(text, render);
// render(text.slice(0, n)) + render(text.slice(n)) === render(text)
//
// The module is intentionally DOM-free and renderer-agnostic so it can be unit
// tested in isolation and reused for any markdown renderer with no long-range
// cross-block dependencies (no reference-style links / footnotes).
//
// Known limitations (both bounded by the same mitigation):
// - cutIsRenderSafe proves only PRESENT-tense equivalence. If the renderer pairs
// an inline delimiter across a blank line (e.g. markdown.js will turn
// `*a\n\nb*` into emphasis spanning two paragraphs), a block frozen before the
// closing delimiter arrives can disagree with the final full render.
// - afterClosedFence boundaries are trusted without the equivalence check, so a
// fence the real renderer parses differently (e.g. a stray 4-backtick line) can
// be mis-detected as a close.
// Both only occur for input the renderer itself handles oddly, and both are
// transient: chat.js re-renders the finished message from source, so the settled
// output is always canonical.
// A fenced-code delimiter line: up to 3 leading spaces, then >=3 backticks or
// tildes, then an optional info string.
const FENCE_RE = /^ {0,3}(`{3,}|~{3,})(.*)$/;

/**
 * Scan `text` line by line starting at `fromOffset` and collect the candidate
 * cut points. `fromOffset` MUST be at top level, at the start of a line (callers
 * only ever advance to a finalized boundary, never into a fence).
 *
 * Two kinds of boundary are produced:
 *   - A run of blank lines at top level yields one boundary at the start of the
 *     next non-blank line (`afterClosedFence: false`), so the finalized side owns
 *     the separator and the tail starts clean.
 *   - A fence close yields a boundary just past the closing fence line
 *     (`afterClosedFence: true`).
 *
 * A closing fence is only recognised once its line is terminated by a newline.
 * While the last line is still being streamed, "```" may yet grow into
 * "```python" — which is fence CONTENT, not a close — so reporting it early
 * would let the caller freeze a block in the middle of a line that is not
 * finished. Until the newline arrives the fence is reported as still open.
 *
 * @param {string} text Full markdown accumulated so far.
 * @param {number} fromOffset Offset to start scanning from.
 * @returns {{ boundaries: Array<{offset:number, afterClosedFence:boolean}>, inFence:boolean }}
 *   `boundaries` in ascending offset order; `inFence` is true when the scan ended
 *   inside a fence that has not (yet) been closed by a complete line.
 */
function findBoundaries(text, fromOffset) {
  const boundaries = [];
  const n = text.length;
  let inFence = false;
  let fenceMarker = '';
  let i = fromOffset;

  while (i < n) {
    const nl = text.indexOf('\n', i);
    // The final line of a streaming message has no newline yet and can still grow.
    const isTerminated = nl !== -1;
    const lineEnd = isTerminated ? nl : n;
    const afterNl = isTerminated ? nl + 1 : n;
    const line = text.slice(i, lineEnd);
    const fence = line.match(FENCE_RE);

    if (fence) {
      const marker = fence[1];
      if (!inFence) {
        inFence = true;
        fenceMarker = marker;
      } else if (
        isTerminated && // an unfinished line may still acquire an info string
        marker[0] === fenceMarker[0] &&
        marker.length >= fenceMarker.length &&
        fence[2].trim() === '' // a closing fence carries no info string
      ) {
        inFence = false;
        fenceMarker = '';
        boundaries.push({ offset: afterNl, afterClosedFence: true });
      }
      i = afterNl;
    } else if (!inFence && line.trim() === '') {
      // Consume the entire run of blank lines; the boundary is the start of the
      // next non-blank line (or the end of the text when the run reaches it).
      let j = afterNl;
      while (j < n) {
        const nl2 = text.indexOf('\n', j);
        const lineEnd2 = nl2 === -1 ? n : nl2;
        if (text.slice(j, lineEnd2).trim() !== '') break;
        if (nl2 === -1) {
          j = n;
          break;
        }
        j = nl2 + 1;
      }
      boundaries.push({ offset: j, afterClosedFence: false });
      i = j;
    } else {
      i = afterNl;
    }
  }

  return { boundaries, inFence };
}
/**
 * Self-verifying safety check for a candidate cut: render the two sides on their
 * own, render them joined, and report whether the outputs agree.
 *
 * Because the comparison is made on actual renderer output, constructs that span
 * the cut (loose lists, setext headings, lazy blockquote continuations, tables)
 * are detected without any hand-written grammar rules.
 *
 * If the renderer is not deterministic (e.g. ids seeded with Date.now()), the
 * comparison is expected to fail rather than pass by accident, so the bias is
 * toward under-finalizing — the safe direction.
 *
 * @param {string} before Text on the finalized side of the cut.
 * @param {string} after Text on the live side of the cut.
 * @param {(src:string)=>string} render Canonical markdown renderer.
 * @returns {boolean} True when cutting here leaves the rendered output unchanged.
 */
function cutIsRenderSafe(before, after, render) {
  // Pieces first, joined text last — the renderer is invoked exactly three times.
  const renderedBefore = render(before);
  const renderedAfter = render(after);
  const renderedJoined = render(before + after);
  return renderedBefore + renderedAfter === renderedJoined;
}
/**
 * Return how many leading characters of `text` can be safely finalized, scanning
 * forward from `committedLen` (the amount already finalized).
 *
 * Intended contract:
 *   `render(text.slice(0, n)) + render(text.slice(n)) === render(text)` and
 *   `committedLen <= n <= text.length`.
 *
 * Candidate cuts come from `findBoundaries`. Cuts just past a closed code fence
 * are accepted as-is; blank-line cuts are accepted only when `cutIsRenderSafe`
 * confirms that rendering both sides separately matches rendering them joined.
 *
 * The "before" side of that check always starts at the last ACCEPTED cut, not at
 * the previous candidate. A rejected candidate sits in the middle of a larger
 * structure (e.g. between paragraphs of one list item); starting the next check
 * there would render a fragment out of context, and a cut that splits the
 * structure could look safe. For `- a\n\n  b\n\n  c`, the pieces `  b` and `  c`
 * render as two plain paragraphs either way, yet cutting between them tears the
 * list item apart. Anchoring at the last accepted cut keeps the whole structure
 * on the "before" side, at the cost of re-rendering it for each candidate inside.
 *
 * @param {string} text Full markdown accumulated so far.
 * @param {(src:string)=>string} render Canonical markdown renderer.
 * @param {number} [committedLen=0] Characters already finalized (always a prior boundary).
 * @returns {number} Offset of the furthest cut that can be finalized.
 */
export function splitFinalized(text, render, committedLen = 0) {
  const { boundaries } = findBoundaries(text, committedLen);
  let best = committedLen;

  for (let k = 0; k < boundaries.length; k++) {
    const { offset, afterClosedFence } = boundaries[k];

    if (afterClosedFence) {
      // A completed code block — trusted without a render comparison.
      best = offset;
      continue;
    }

    // A prose/list/table boundary. We need a following block to compare
    // against (the last block must stay live, it can still grow), and the cut
    // must be render-equivalent relative to the last accepted cut.
    const nextOffset = k + 1 < boundaries.length ? boundaries[k + 1].offset : text.length;
    const before = text.slice(best, offset);
    const after = text.slice(offset, nextOffset);
    if (after.trim() !== '' && cutIsRenderSafe(before, after, render)) {
      best = offset;
    }
  }

  return best;
}
/**
 * Describe the fenced code block that `text` starts with, provided that fence is
 * still open, so the caller can stream the code in append-mode instead of
 * re-rendering it.
 *
 * The opener line has to be complete (newline-terminated) so the info string /
 * language is known before append-mode begins. A later line closes the fence
 * when it consists only of a marker made of the same character, at least as long
 * as the opener's, optionally followed by whitespace.
 *
 * @param {string} text Markdown that may begin with a fence opener.
 * @returns {{ lang: string, contentStart: number } | null}
 *   `lang` is the first word of the info string ('' when absent) and
 *   `contentStart` the offset of the first code character; null when `text`
 *   does not start with a complete opener or the fence is already closed.
 */
export function describeOpenFence(text) {
  const opener = text.match(/^( {0,3})(`{3,}|~{3,})([^\n]*)\n/);
  if (opener === null) return null;

  const [openerLine, , openMarker, info] = opener;
  const contentStart = openerLine.length;

  const closesFence = (line) => {
    const candidate = line.match(/^ {0,3}(`{3,}|~{3,})\s*$/);
    if (candidate === null) return false;
    const closeMarker = candidate[1];
    return closeMarker[0] === openMarker[0] && closeMarker.length >= openMarker.length;
  };

  // The fence closes — let the normal finalize path handle it.
  const bodyLines = text.slice(contentStart).split('\n');
  if (bodyLines.some((line) => closesFence(line))) return null;

  const [firstWord] = info.trim().split(/\s+/);
  return { lang: firstWord || '', contentStart };
}