document live api stabilization rollout

This commit is contained in:
dirtydishes 2026-05-22 21:50:35 -04:00
parent 20397fdef3
commit 5a68a3e38e
2 changed files with 811 additions and 1 deletions

View file

@ -0,0 +1,810 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Turn Record: Stabilize Live API Memory</title>
<style>
/*
 * Design tokens for the turn-record page (light scheme only).
 * Every colour used by the rules below is meant to come from here.
 * Note: card/hero borders currently hard-code rgba(214, 200, 180, 0.9),
 * which is the same RGB as --line, rather than referencing the variable.
 */
:root {
color-scheme: light;
--bg: #f3f1eb; /* end stop of the body background gradient */
--surface: #fffdf8; /* .hero and .card background */
--surface-strong: #f7f2e9; /* .meta-card and .metric background */
--ink: #1f1b16; /* body text colour */
--muted: #62584c; /* secondary text: .lede, .meta-label, .metric span, .small */
--line: #d6c8b4; /* border colour; not referenced by any rule in this stylesheet */
--accent: #8d5a2b; /* .eyebrow text, summary toggles, and links */
--accent-soft: rgba(141, 90, 43, 0.12); /* .eyebrow pill background (--accent at 12% alpha) */
--good: #245c3b; /* emphasised text inside .callout */
--warn: #8a4b15; /* defined but not referenced by any rule in this stylesheet */
--shadow: 0 24px 60px rgba(61, 44, 21, 0.08); /* shared box-shadow for .hero and .card */
}
* {
box-sizing: border-box;
}
body {
margin: 0;
background:
radial-gradient(circle at top left, rgba(141, 90, 43, 0.12), transparent 32%),
linear-gradient(180deg, #f7f3ec 0%, var(--bg) 100%);
color: var(--ink);
font: 16px/1.6 "Iowan Old Style", "Palatino Linotype", "Book Antiqua", Palatino, serif;
}
main {
width: min(1120px, calc(100vw - 40px));
margin: 32px auto 56px;
}
.hero {
background: var(--surface);
border: 1px solid rgba(214, 200, 180, 0.9);
border-radius: 28px;
padding: 32px;
box-shadow: var(--shadow);
}
.eyebrow {
display: inline-flex;
align-items: center;
gap: 8px;
padding: 8px 12px;
border-radius: 999px;
background: var(--accent-soft);
color: var(--accent);
font: 600 12px/1.2 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
letter-spacing: 0.08em;
text-transform: uppercase;
}
h1,
h2,
h3 {
margin: 0;
font-family: "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
line-height: 1.1;
}
h1 {
margin-top: 16px;
font-size: clamp(2.4rem, 5vw, 4rem);
letter-spacing: -0.04em;
}
.lede {
max-width: 72ch;
margin-top: 18px;
color: var(--muted);
font-size: 1.08rem;
}
.hero-meta {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
gap: 14px;
margin-top: 28px;
}
.meta-card {
padding: 16px 18px;
border-radius: 18px;
background: var(--surface-strong);
border: 1px solid rgba(214, 200, 180, 0.9);
}
.meta-label {
display: block;
color: var(--muted);
font: 600 0.76rem/1.2 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
letter-spacing: 0.06em;
text-transform: uppercase;
}
.meta-value {
display: block;
margin-top: 8px;
font: 700 1.05rem/1.3 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
}
.grid {
display: grid;
gap: 20px;
margin-top: 20px;
}
.card {
background: var(--surface);
border: 1px solid rgba(214, 200, 180, 0.9);
border-radius: 24px;
padding: 24px;
box-shadow: var(--shadow);
}
.card h2 {
font-size: 1.35rem;
margin-bottom: 14px;
}
p {
margin: 0;
}
p + p,
ul + p,
p + ul,
ul + ul {
margin-top: 12px;
}
ul {
margin: 0;
padding-left: 20px;
}
li + li {
margin-top: 8px;
}
.callout {
padding: 16px 18px;
border-radius: 18px;
background: rgba(36, 92, 59, 0.08);
border: 1px solid rgba(36, 92, 59, 0.16);
}
.callout strong {
color: var(--good);
font-family: "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
}
.metrics {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 12px;
margin-top: 16px;
}
.metric {
padding: 14px 16px;
border-radius: 18px;
background: var(--surface-strong);
border: 1px solid rgba(214, 200, 180, 0.9);
}
.metric strong {
display: block;
font: 700 1.2rem/1.2 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
}
.metric span {
color: var(--muted);
font: 500 0.9rem/1.4 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
}
code,
pre {
font-family: "IBM Plex Mono", "SFMono-Regular", Consolas, "Liberation Mono", Menlo, monospace;
}
code {
padding: 0.08rem 0.35rem;
border-radius: 0.45rem;
background: rgba(99, 86, 67, 0.08);
}
pre {
overflow-x: auto;
margin: 12px 0 0;
padding: 16px;
border-radius: 16px;
background: #1f1b16;
color: #f7f2e9;
font-size: 0.88rem;
line-height: 1.5;
}
.diff-grid {
display: grid;
gap: 18px;
}
.diff-shell {
border: 1px solid rgba(214, 200, 180, 0.9);
border-radius: 20px;
padding: 16px;
background: linear-gradient(180deg, #fffdf9 0%, #f7f2ea 100%);
}
.diff-shell h3 {
font-size: 1rem;
margin-bottom: 10px;
}
.diff-render {
min-height: 120px;
}
details {
margin-top: 12px;
}
summary {
cursor: pointer;
color: var(--accent);
font: 600 0.88rem/1.3 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
}
.two-up {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
gap: 20px;
}
.small {
color: var(--muted);
font-size: 0.92rem;
}
a {
color: var(--accent);
}
</style>
</head>
<body>
<main>
<section class="hero">
<span class="eyebrow">Turn Record · May 22, 2026</span>
<h1>Stabilize Live API Memory and Internal Traffic</h1>
<p class="lede">
The Islandflow live API was repeatedly getting OOM-killed on the VPS because the hot live
cache could retain oversized channel windows and rewrite whole Redis lists at high
frequency. This turn applied an immediate server-side mitigation, hardened the API cache
path in code, and rolled the changes onto the native systemd deployment.
</p>
<div class="hero-meta">
<div class="meta-card">
<span class="meta-label">Branch</span>
<span class="meta-value"><code>stabilize-live-api-memory</code></span>
</div>
<div class="meta-card">
<span class="meta-label">Beads</span>
<span class="meta-value"><code>islandflow-thp</code></span>
</div>
<div class="meta-card">
<span class="meta-label">Deployment</span>
<span class="meta-value">Native systemd user services on the VPS</span>
</div>
<div class="meta-card">
<span class="meta-label">Primary Outcome</span>
<span class="meta-value">API RSS returned to roughly 115-130 MB after rollout</span>
</div>
</div>
</section>
<div class="grid">
<section class="card">
<h2>Summary</h2>
<p>
The live API is now bounded in three layers instead of trusting environment values and
reconnect behavior. First, the VPS <code>.env</code> was reset to safer live-window
values and the oversized Redis hot-cache keys were cleared. Second, the API now clamps
generic live cache limits per channel in code. Third, generic live feed persistence now
appends deltas into Redis instead of cloning and rewriting entire lists on every flush.
</p>
<div class="callout" style="margin-top: 16px">
<strong>Observed on the VPS after rollout:</strong>
the API stayed healthy through restart, minute metrics showed much smaller cache depths,
and the kernel did not log any new Bun OOM kill after the hardened restart.
</div>
</section>
<section class="card">
<h2>Changes Made</h2>
<ul>
<li>
Added channel-specific hard caps in
<code>services/api/src/live.ts</code> so oversized
<code>LIVE_LIMIT_*</code> values are clamped before use.
</li>
<li>
Changed generic live Redis persistence from full-list rewrite behavior to append-plus-trim,
with rewrite fallback only when the in-memory ordering has to be rebuilt.
</li>
<li>
Serialized Redis flushes during shutdown so service restarts do not race with a closing
Redis client.
</li>
<li>
Added API minute-log visibility for live subscription counts, Redis flush deltas,
payload bytes, snapshot sizes, and process memory usage.
</li>
<li>
Tightened the browser-exposed live window caps in
<code>apps/web/app/terminal.tsx</code> and aligned the tracked env examples with the safer
production defaults, including <code>LIVE_LIMIT_NEWS</code>.
</li>
<li>
Applied the emergency mitigation directly on the VPS:
updated <code>/home/delta/islandflow/.env</code>, created
<code>/home/delta/islandflow/.env.backup-2026-05-22-2131</code>, deleted stale
<code>live:*</code> Redis keys, rebuilt the web app, and restarted
<code>islandflow-api.service</code> and <code>islandflow-web.service</code>.
</li>
</ul>
</section>
<section class="card">
<h2>Context</h2>
<p>
The VPS was killing <code>islandflow-api.service</code> several times on May 22, 2026.
Kernel logs showed Bun reaching roughly 8-9 GiB RSS inside the API service cgroup before
the OOM killer stepped in. The API minute logs also showed channel depths pinned at
<code>10000</code> for multiple feeds, plus massive cumulative Redis rewrite churn.
</p>
<p>
Most of the “huge bandwidth” in <code>btop</code> was local loopback traffic: Bun talking
to Redis, NATS, and ClickHouse on <code>127.0.0.1</code>. That meant the problem was not a
public-edge flood; it was the live cache architecture multiplying internal work on the box.
</p>
</section>
<section class="card">
<h2>Important Implementation Details</h2>
<div class="two-up">
<div>
<h3 style="margin-bottom: 10px">API hardening</h3>
<ul>
<li>
Hard caps now bound generic channel windows even if env values drift upward.
</li>
<li>
<code>snapshot_limit</code> is still honored, but only up to the lower of the request,
the configured limit, and the safe channel cap.
</li>
<li>
Generic feeds use incremental Redis appends; scoped candle and overlay caches still
use full rewrites because they are much smaller and keyed differently.
</li>
</ul>
</div>
<div>
<h3 style="margin-bottom: 10px">Operational changes</h3>
<ul>
<li>
The VPS now runs with a much smaller hot live footprint:
options <code>100</code>, flow <code>500</code>, alerts <code>300</code>,
news <code>100</code>.
</li>
<li>
Old Redis hot-cache keys were deleted so the API did not rehydrate oversized lists on boot.
</li>
<li>
The web app was rebuilt on the VPS checkout after switching that checkout onto
<code>stabilize-live-api-memory</code>.
</li>
</ul>
</div>
</div>
</section>
<section class="card">
<h2>Relevant Diff Snippets</h2>
<p class="small">
These snippets are rendered with the Diffs library (<code>@pierre/diffs</code>, documented at
<a href="https://diffs.com/docs">diffs.com</a>), which is loaded from esm.sh when the page is viewed; a plain-text fallback is kept inline in the file.
</p>
<div class="diff-grid" style="margin-top: 18px">
<article class="diff-shell">
<h3><code>services/api/src/live.ts</code>: hard caps and append-based generic Redis flushes</h3>
<div class="diff-render" id="diff-live"></div>
<details>
<summary>Plain-text fallback</summary>
<pre>Added LIVE_GENERIC_LIMIT_CAPS, clamped env/configured limits, changed generic writes from
queueRedisWrite(items:[...items]) to queueGenericRedisWrite(item, items, forceRewrite), and split
Redis persistence into rewrite and append paths with shutdown-safe flush serialization.</pre>
</details>
</article>
<article class="diff-shell">
<h3><code>services/api/src/index.ts</code>: minute metrics now include memory and live subscription visibility</h3>
<div class="diff-render" id="diff-index"></div>
<details>
<summary>Plain-text fallback</summary>
<pre>Added buildLiveSubscriptionMetrics(), previous snapshot tracking, flush delta logging,
memory snapshots, and gauges for RSS, heap used, active sockets, and per-channel subscriptions.</pre>
</details>
</article>
<article class="diff-shell">
<h3><code>.env.example</code> and <code>apps/web/app/terminal.tsx</code>: safer default windows</h3>
<div class="diff-render" id="diff-config"></div>
<details>
<summary>Plain-text fallback</summary>
<pre>Reduced LIVE_LIMIT_OPTIONS in tracked examples to 100, added LIVE_LIMIT_NEWS=100,
and lowered the client-exposed maximum live hot windows from 100000 to 2000.</pre>
</details>
</article>
</div>
</section>
<section class="card">
<h2>Expected Impact for End-Users</h2>
<ul>
<li>
The hosted app should stop disappearing behind API restarts caused by the kernel OOM killer.
</li>
<li>
Live feeds should still feel current, but the server will retain a tighter hot window instead of
hoarding oversized in-memory histories.
</li>
<li>
The operator experience on the VPS should improve because internal loopback churn is materially lower.
</li>
</ul>
</section>
<section class="card">
<h2>Validation</h2>
<ul>
<li>
Local API test gate passed:
<code>bun test services/api/tests/live.test.ts</code>
</li>
<li>
Local web production build passed:
<code>bun --cwd=apps/web run build</code>
</li>
<li>
VPS mitigation applied successfully. Redis reported <code>1524</code> live keys removed before restart.
</li>
<li>
After mitigation restart, <code>systemctl --user status islandflow-api.service</code> showed the
API at about <code>84 MB</code> RSS instead of multi-GB startup drift.
</li>
<li>
After rolling the hardened branch onto the VPS, the API minute log at
<code>2026-05-22 21:44:11 EDT</code> showed:
</li>
</ul>
<div class="metrics">
<div class="metric">
<strong>119.6 MB</strong>
<span>API RSS from the minute memory snapshot</span>
</div>
<div class="metric">
<strong>100</strong>
<span><code>live:options</code> depth</span>
</div>
<div class="metric">
<strong>500</strong>
<span><code>live:flow</code> and <code>live:equity-quotes</code> caps held (<code>live:alerts</code> is capped at 300)</span>
</div>
<div class="metric">
<strong>34,559</strong>
<span>Redis flush items in that minute delta</span>
</div>
<div class="metric">
<strong>9.18 MB</strong>
<span>Redis flush payload bytes in that minute delta</span>
</div>
<div class="metric">
<strong>No new OOM</strong>
<span>Kernel logs after the hardened restart</span>
</div>
</div>
</section>
<section class="card">
<h2>Issues, Limitations, and Mitigations</h2>
<ul>
<li>
The new minute metrics are cumulative plus delta-based. They are much more useful than the old
absolute counters, but they still reset on process restart.
</li>
<li>
<code>snapshotItemsByChannel</code> remains empty when no live websocket clients are connected.
That is expected because snapshots are only recorded when a snapshot is actually served.
</li>
<li>
Quiet feeds such as news and inferred-dark can still show very old freshness ages in logs.
That reflects inactivity, not a broken hot path.
</li>
<li>
The append-based Redis path deliberately falls back to a rewrite when out-of-order live events
require the in-memory ordering to be rebuilt. That keeps correctness ahead of theoretical write minimization.
</li>
</ul>
</section>
<section class="card">
<h2>Follow-up Work</h2>
<ul>
<li>
Add explicit alerting for repeated API RSS growth and for minute-level flush deltas that jump far above the new baseline.
</li>
<li>
Decide whether quiet-channel freshness logs should suppress extremely stale values for feeds like news to reduce operator noise.
</li>
<li>
Consider moving the live cache metrics into a dashboard view so operators do not need to parse journal lines manually.
</li>
</ul>
</section>
</div>
</main>
<script type="module">
// Snippet descriptors consumed by the render loop at the bottom of this script.
// Each entry carries:
//   id          - id of the matching <div class="diff-render"> placeholder in the markup
//   name        - file label handed to the renderer for both the old and new side
//   oldContents - "before" text of the snippet
//   newContents - "after" text of the snippet
// The contents are template literals that are displayed verbatim; none of them uses
// ${...} interpolation or a backtick, so any future snippet containing either must
// escape it. NOTE(review): the snippets read as condensed excerpts rather than
// complete files — confirm against the real sources before copying from them.
const diffs = [
// services/api/src/live.ts: limit clamping and rewrite-vs-append Redis persistence.
{
id: "diff-live",
name: "services/api/src/live.ts",
oldContents: `const DEFAULT_LIVE_LIMITS: GenericLiveLimits = {
options: 100,
nbbo: 1000,
equities: 1000,
"equity-quotes": 500,
"equity-joins": 500,
flow: 500,
"smart-money": 300,
"classifier-hits": 300,
alerts: 300,
"inferred-dark": 300,
news: 100
};
const parseGenericLimit = (env, channel, fallback) => {
const key = GENERIC_LIMIT_ENV_KEYS[channel];
const raw = env[key];
if (!raw || raw.trim().length === 0) {
return fallback;
}
const parsed = Number(raw);
const bounded = Math.max(MIN_GENERIC_LIMIT, Math.min(MAX_GENERIC_LIMIT, Math.floor(parsed)));
return bounded;
};
type BufferedRedisWrite = {
listKey: string;
cursorField: string;
items: unknown[];
limit: number;
cursor: Cursor | null;
updates: number;
};
private queueRedisWrite(listKey, cursorField, items, limit, cursor) {
const existing = this.pendingRedisWrites.get(listKey);
const write: BufferedRedisWrite = {
listKey,
cursorField,
items: [...items],
limit,
cursor,
updates: (existing?.updates ?? 0) + 1
};
this.pendingRedisWrites.set(listKey, write);
}
private async persistList(listKey, cursorField, items, limit, cursor) {
const payloads = items.map((entry) => JSON.stringify(entry));
await this.redis.lTrim(listKey, 1, 0);
if (payloads.length > 0) {
for (let idx = payloads.length - 1; idx >= 0; idx -= 1) {
await this.redis.lPush(listKey, payloads[idx]);
}
await this.redis.lTrim(listKey, 0, limit - 1);
}
await this.redis.hSet(CURSOR_HASH_KEY, cursorField, JSON.stringify(cursor));
}`,
newContents: `export const LIVE_GENERIC_LIMIT_CAPS: GenericLiveLimits = {
options: 100,
nbbo: 1000,
equities: 1000,
"equity-quotes": 500,
"equity-joins": 500,
flow: 500,
"smart-money": 300,
"classifier-hits": 300,
alerts: 300,
"inferred-dark": 300,
news: 100
};
const clampConfiguredLimit = (channel: LiveGenericChannel, value: number): number =>
Math.max(MIN_GENERIC_LIMIT, Math.min(LIVE_GENERIC_LIMIT_CAPS[channel], Math.floor(value)));
const parseGenericLimit = (env, channel, fallback) => {
const key = GENERIC_LIMIT_ENV_KEYS[channel];
const raw = env[key];
if (!raw || raw.trim().length === 0) {
return clampConfiguredLimit(channel, fallback);
}
const parsed = Number(raw);
const bounded = clampConfiguredLimit(channel, Math.min(MAX_GENERIC_LIMIT, parsed));
return bounded;
};
type BufferedRedisRewrite = {
mode: "rewrite";
listKey: string;
cursorField: string;
items: unknown[];
limit: number;
cursor: Cursor | null;
updates: number;
};
type BufferedRedisAppend = {
mode: "append";
listKey: string;
cursorField: string;
payloads: string[];
limit: number;
cursor: Cursor | null;
updates: number;
};
private queueGenericRedisWrite(listKey, cursorField, item, items, limit, cursor, forceRewrite = false) {
const existing = this.pendingRedisWrites.get(listKey);
const nextUpdateCount = (existing?.updates ?? 0) + 1;
if (forceRewrite || existing?.mode === "rewrite") {
this.pendingRedisWrites.set(listKey, {
mode: "rewrite",
listKey,
cursorField,
items: [...items],
limit,
cursor,
updates: nextUpdateCount
});
} else {
this.pendingRedisWrites.set(listKey, {
mode: "append",
listKey,
cursorField,
payloads: [...(existing?.mode === "append" ? existing.payloads : []), JSON.stringify(item)],
limit,
cursor,
updates: nextUpdateCount
});
}
}
private async persistListAppend(listKey, cursorField, payloads, limit, cursor) {
for (const payload of payloads) {
await this.redis.lPush(listKey, payload);
}
await this.redis.lTrim(listKey, 0, limit - 1);
await this.redis.hSet(CURSOR_HASH_KEY, cursorField, JSON.stringify(cursor));
}`
},
// services/api/src/index.ts: the once-a-minute "live cache metrics" log, before and after.
{
id: "diff-index",
name: "services/api/src/index.ts",
oldContents: `const liveStateMetricsTimer = setInterval(() => {
const snapshot = liveState.getStatsSnapshot();
const hotFeedHealth = liveState.getHotChannelHealth();
const hotFeedLagMs = {
options: snapshot.freshnessAgeMsByKey[HOT_LIVE_REDIS_KEYS.options] ?? null,
equities: snapshot.freshnessAgeMsByKey[HOT_LIVE_REDIS_KEYS.equities] ?? null,
flow: snapshot.freshnessAgeMsByKey[HOT_LIVE_REDIS_KEYS.flow] ?? null,
nbbo: snapshot.freshnessAgeMsByKey[HOT_LIVE_REDIS_KEYS.nbbo] ?? null
};
logger.info("live cache metrics", {
...snapshot,
hotFeedLagMs,
hotFeedHealth,
snapshotSourceCounts: {
generic_cache_snapshot: snapshot.genericCacheSnapshots,
scoped_clickhouse_snapshot: snapshot.scopedClickHouseSnapshots
}
});
}, 60000);`,
newContents: `const buildLiveSubscriptionMetrics = () => {
const uniqueSubscriptionsByChannel: Partial<Record<LiveSubscription["channel"], number>> = {};
const socketFanoutByChannel: Partial<Record<LiveSubscription["channel"], number>> = {};
for (const subscription of subscriptionDefinitions.values()) {
uniqueSubscriptionsByChannel[subscription.channel] =
(uniqueSubscriptionsByChannel[subscription.channel] ?? 0) + 1;
}
for (const [key, sockets] of subscriptionSockets.entries()) {
const subscription = subscriptionDefinitions.get(key);
if (!subscription || sockets.size === 0) {
continue;
}
socketFanoutByChannel[subscription.channel] =
(socketFanoutByChannel[subscription.channel] ?? 0) + sockets.size;
}
return {
liveSocketCount: liveSocketSubscriptions.size,
uniqueSubscriptionsByChannel,
socketFanoutByChannel
};
};
let previousLiveStats = liveState.getStatsSnapshot();
let previousMemoryUsage = process.memoryUsage();
const liveStateMetricsTimer = setInterval(() => {
const snapshot = liveState.getStatsSnapshot();
const hotFeedHealth = liveState.getHotChannelHealth();
const subscriptionMetrics = buildLiveSubscriptionMetrics();
const memoryUsage = process.memoryUsage();
const flushDelta = {
redisFlushCount: snapshot.redisFlushCount - previousLiveStats.redisFlushCount,
redisFlushItems: snapshot.redisFlushItems - previousLiveStats.redisFlushItems,
redisFlushPayloadBytes: snapshot.redisFlushPayloadBytes - previousLiveStats.redisFlushPayloadBytes
};
const memorySnapshot = {
rss_bytes: memoryUsage.rss,
heap_used_bytes: memoryUsage.heapUsed,
rss_delta_bytes: memoryUsage.rss - previousMemoryUsage.rss
};
logger.info("live cache metrics", {
...snapshot,
flushDelta,
memorySnapshot,
liveSubscriptions: subscriptionMetrics
});
metrics.gauge("api.memory.rss_bytes", memoryUsage.rss);
metrics.gauge("api.live.active_sockets", subscriptionMetrics.liveSocketCount);
}, 60000);`
},
// Combined excerpt: this single entry shows two files (apps/web/app/terminal.tsx and
// .env.example) under one label, which is why `name` is not a real path.
{
id: "diff-config",
name: "config excerpt",
oldContents: `// apps/web/app/terminal.tsx
const LIVE_HOT_WINDOW = parseBoundedInt(process.env.NEXT_PUBLIC_LIVE_HOT_WINDOW, 600, 1, 100000);
const LIVE_HOT_WINDOW_OPTIONS = parseBoundedInt(
process.env.NEXT_PUBLIC_LIVE_HOT_WINDOW_OPTIONS,
1200,
1,
100000
);
# .env.example
LIVE_LIMIT_OPTIONS=1000
LIVE_LIMIT_INFERRED_DARK=300`,
newContents: `// apps/web/app/terminal.tsx
const LIVE_HOT_WINDOW = parseBoundedInt(process.env.NEXT_PUBLIC_LIVE_HOT_WINDOW, 600, 1, 2000);
const LIVE_HOT_WINDOW_OPTIONS = parseBoundedInt(
process.env.NEXT_PUBLIC_LIVE_HOT_WINDOW_OPTIONS,
1200,
1,
2000
);
# .env.example
LIVE_LIMIT_OPTIONS=100
LIVE_LIMIT_INFERRED_DARK=300
LIVE_LIMIT_NEWS=100`
}
];
/**
 * Shows the hand-written plain-text summary that sits beside a diff placeholder.
 *
 * Each `.diff-render` container shares its parent with a collapsed `<details>`
 * element holding a prose fallback. When the rich renderer is unavailable the
 * placeholder would otherwise stay an empty min-height box, so the fallback is
 * expanded and the placeholder hidden.
 *
 * @param {HTMLElement} container - The `.diff-render` placeholder for one snippet.
 */
const revealFallback = (container) => {
  const fallback = container.parentElement?.querySelector("details");
  if (fallback) {
    fallback.open = true;
  }
  container.hidden = true;
};

// Load the renderer once. It is fetched from a CDN at view time, so an offline or
// blocked request is an expected outcome rather than a fatal error.
let FileDiff = null;
try {
  ({ FileDiff } = await import("https://esm.sh/@pierre/diffs"));
} catch (error) {
  console.warn("Failed to load diffs.com renderer", error);
}
if (FileDiff !== null && typeof FileDiff !== "function") {
  console.warn("Failed to load diffs.com renderer", new TypeError("FileDiff export is missing"));
  FileDiff = null;
}

for (const diff of diffs) {
  const container = document.getElementById(diff.id);
  if (!container) continue;

  if (FileDiff === null) {
    revealFallback(container);
    continue;
  }

  // Each snippet is rendered in isolation so one bad render cannot blank the rest.
  try {
    const fileDiff = new FileDiff({ theme: "github-light" });
    fileDiff.render({
      oldFile: { name: diff.name, contents: diff.oldContents },
      newFile: { name: diff.name, contents: diff.newContents },
      containerWrapper: container
    });
  } catch (error) {
    console.warn(`Failed to render diff "${diff.id}"`, error);
    revealFallback(container);
  }
}
</script>
</body>
</html>