document live api stabilization rollout
This commit is contained in:
parent
20397fdef3
commit
5a68a3e38e
2 changed files with 811 additions and 1 deletions
|
|
@ -1,4 +1,4 @@
|
|||
{"_type":"issue","id":"islandflow-thp","title":"stabilize live api memory and reduce internal cache churn","description":"The native VPS deployment is repeatedly OOM-killing islandflow-api.service during live operation. The API live cache is retaining oversized channel histories and rewriting large Redis lists on every flush, which drives multi-GB Bun RSS and heavy loopback traffic between the API, Redis, NATS, and ClickHouse. Implement an emergency VPS mitigation plus repo hardening so unsafe env values, reconnect snapshots, and Redis persistence patterns cannot push the live API back into OOM.","acceptance_criteria":"1. VPS live cache env values are reduced to safe defaults and live redis state is cleared before restart. 2. services/api/src/live.ts enforces server-side live cache caps and clamps snapshot_limit accordingly. 3. Hot generic feed Redis persistence no longer rewrites entire lists on every flush. 4. Metrics/logging expose subscription counts, snapshot sizes, redis flush volume, and API memory trend. 5. Relevant tests pass and the deployment is restarted successfully.","notes":"Implemented local hardening for API live-state limits, incremental generic Redis persistence, live subscription/memory metrics, and safer client/env defaults. Targeted API live tests and the web production build both passed.","status":"in_progress","priority":1,"issue_type":"bug","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-23T01:30:43Z","created_by":"dirtydishes","updated_at":"2026-05-23T01:39:57Z","started_at":"2026-05-23T01:30:52Z","dependency_count":0,"dependent_count":0,"comment_count":0}
|
||||
{"_type":"issue","id":"islandflow-thp","title":"stabilize live api memory and reduce internal cache churn","description":"The native VPS deployment is repeatedly OOM-killing islandflow-api.service during live operation. The API live cache is retaining oversized channel histories and rewriting large Redis lists on every flush, which drives multi-GB Bun RSS and heavy loopback traffic between the API, Redis, NATS, and ClickHouse. Implement an emergency VPS mitigation plus repo hardening so unsafe env values, reconnect snapshots, and Redis persistence patterns cannot push the live API back into OOM.","acceptance_criteria":"1. VPS live cache env values are reduced to safe defaults and live redis state is cleared before restart. 2. services/api/src/live.ts enforces server-side live cache caps and clamps snapshot_limit accordingly. 3. Hot generic feed Redis persistence no longer rewrites entire lists on every flush. 4. Metrics/logging expose subscription counts, snapshot sizes, redis flush volume, and API memory trend. 5. Relevant tests pass and the deployment is restarted successfully.","notes":"Implemented and deployed the live-state hardening to the VPS. Final validation after restart showed the API around 120 MB RSS with capped live cache depths and clean systemd restarts.","status":"in_progress","priority":1,"issue_type":"bug","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-23T01:30:43Z","created_by":"dirtydishes","updated_at":"2026-05-23T01:50:29Z","started_at":"2026-05-23T01:30:52Z","dependency_count":0,"dependent_count":0,"comment_count":0}
|
||||
{"_type":"issue","id":"islandflow-sc6","title":"fix electron codex bridge preload loading","description":"Electron settings showed the browser-only Desktop Required fallback because the renderer did not see the native islandflowDesktop preload bridge or an Electron user-agent marker. Fix the desktop launch path so ChatGPT/Codex subscription controls are available inside Islandflow Desktop again.","notes":"Reopened after live Electron still showed the browser-only fallback. Follow-up fix adds an explicit preload runtime marker and web runtime detection for that marker so Electron is recognized even when the bridge is not ready and the user agent lacks an Electron token.","status":"closed","priority":1,"issue_type":"bug","owner":"dishes@dpdrm.com","created_at":"2026-05-20T23:42:58Z","created_by":"dirtydishes","updated_at":"2026-05-20T23:51:43Z","closed_at":"2026-05-20T23:51:43Z","close_reason":"Follow-up fix added an explicit islandflowDesktopRuntime preload marker and taught the web runtime to recognize that marker plus IslandflowDesktop user-agent tokens, so Electron no longer falls into the browser-only fallback when the AI bridge is delayed or unavailable. Desktop build and focused desktop/web tests pass; full web build still blocked by islandflow-c8f.","dependency_count":0,"dependent_count":0,"comment_count":0}
|
||||
{"_type":"issue","id":"islandflow-hj3","title":"Fix Electron preload for desktop AI bridge","description":"## Why\\nThe desktop settings page reports the native AI bridge as unavailable because Electron fails to load the preload script in local dev.\\n\\n## What\\nUpdate the desktop preload implementation/build so Electron can execute it, restore window.islandflowDesktop, and verify the Copilot settings panel detects the bridge again.\\n\\n## Acceptance Criteria\\n- Electron no longer logs a preload syntax error\\n- window.islandflowDesktop is available in the desktop renderer\\n- The settings page no longer shows bridge unavailable solely because preload failed\\n- Relevant desktop/web tests pass","status":"closed","priority":1,"issue_type":"bug","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-20T23:16:39Z","created_by":"dirtydishes","updated_at":"2026-05-20T23:20:20Z","started_at":"2026-05-20T23:16:48Z","closed_at":"2026-05-20T23:20:20Z","close_reason":"Closed","dependency_count":0,"dependent_count":0,"comment_count":0}
|
||||
{"_type":"issue","id":"islandflow-199","title":"fix desktop copilot fallback inside electron","description":"## Why\\nThe settings page can render the browser-only fallback even when Islandflow is running inside the Electron desktop shell.\\n\\n## What\\nSeparate desktop-shell detection from desktop AI transport state, make the provider recover if the bridge appears late or initial state loading fails, and cover the regression with tests.\\n\\n## Acceptance Criteria\\n- The desktop shell no longer shows the browser-only fallback solely because initial bridge state failed or arrived late\\n- Desktop-only actions can distinguish between missing Electron bridge and transport/auth problems\\n- Automated tests cover the recovery behavior","status":"closed","priority":1,"issue_type":"bug","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-20T22:30:16Z","created_by":"dirtydishes","updated_at":"2026-05-20T22:37:21Z","started_at":"2026-05-20T22:30:23Z","closed_at":"2026-05-20T22:37:21Z","close_reason":"Fixed desktop-shell Copilot fallback handling, added bridge recovery logic, updated desktop-vs-bridge UI messaging, and added regression tests. Follow-up tracked in islandflow-c8f for unrelated web build blocker.","dependency_count":0,"dependent_count":0,"comment_count":0}
|
||||
|
|
|
|||
810
docs/turns/2026-05-22-stabilize-live-api-memory.html
Normal file
810
docs/turns/2026-05-22-stabilize-live-api-memory.html
Normal file
|
|
@ -0,0 +1,810 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>Turn Record: Stabilize Live API Memory</title>
|
||||
<style>
|
||||
:root {
|
||||
color-scheme: light;
|
||||
--bg: #f3f1eb;
|
||||
--surface: #fffdf8;
|
||||
--surface-strong: #f7f2e9;
|
||||
--ink: #1f1b16;
|
||||
--muted: #62584c;
|
||||
--line: #d6c8b4;
|
||||
--accent: #8d5a2b;
|
||||
--accent-soft: rgba(141, 90, 43, 0.12);
|
||||
--good: #245c3b;
|
||||
--warn: #8a4b15;
|
||||
--shadow: 0 24px 60px rgba(61, 44, 21, 0.08);
|
||||
}
|
||||
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
body {
|
||||
margin: 0;
|
||||
background:
|
||||
radial-gradient(circle at top left, rgba(141, 90, 43, 0.12), transparent 32%),
|
||||
linear-gradient(180deg, #f7f3ec 0%, var(--bg) 100%);
|
||||
color: var(--ink);
|
||||
font: 16px/1.6 "Iowan Old Style", "Palatino Linotype", "Book Antiqua", Palatino, serif;
|
||||
}
|
||||
|
||||
main {
|
||||
width: min(1120px, calc(100vw - 40px));
|
||||
margin: 32px auto 56px;
|
||||
}
|
||||
|
||||
.hero {
|
||||
background: var(--surface);
|
||||
border: 1px solid rgba(214, 200, 180, 0.9);
|
||||
border-radius: 28px;
|
||||
padding: 32px;
|
||||
box-shadow: var(--shadow);
|
||||
}
|
||||
|
||||
.eyebrow {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
padding: 8px 12px;
|
||||
border-radius: 999px;
|
||||
background: var(--accent-soft);
|
||||
color: var(--accent);
|
||||
font: 600 12px/1.2 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
letter-spacing: 0.08em;
|
||||
text-transform: uppercase;
|
||||
}
|
||||
|
||||
h1,
|
||||
h2,
|
||||
h3 {
|
||||
margin: 0;
|
||||
font-family: "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
line-height: 1.1;
|
||||
}
|
||||
|
||||
h1 {
|
||||
margin-top: 16px;
|
||||
font-size: clamp(2.4rem, 5vw, 4rem);
|
||||
letter-spacing: -0.04em;
|
||||
}
|
||||
|
||||
.lede {
|
||||
max-width: 72ch;
|
||||
margin-top: 18px;
|
||||
color: var(--muted);
|
||||
font-size: 1.08rem;
|
||||
}
|
||||
|
||||
.hero-meta {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
|
||||
gap: 14px;
|
||||
margin-top: 28px;
|
||||
}
|
||||
|
||||
.meta-card {
|
||||
padding: 16px 18px;
|
||||
border-radius: 18px;
|
||||
background: var(--surface-strong);
|
||||
border: 1px solid rgba(214, 200, 180, 0.9);
|
||||
}
|
||||
|
||||
.meta-label {
|
||||
display: block;
|
||||
color: var(--muted);
|
||||
font: 600 0.76rem/1.2 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
letter-spacing: 0.06em;
|
||||
text-transform: uppercase;
|
||||
}
|
||||
|
||||
.meta-value {
|
||||
display: block;
|
||||
margin-top: 8px;
|
||||
font: 700 1.05rem/1.3 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
}
|
||||
|
||||
.grid {
|
||||
display: grid;
|
||||
gap: 20px;
|
||||
margin-top: 20px;
|
||||
}
|
||||
|
||||
.card {
|
||||
background: var(--surface);
|
||||
border: 1px solid rgba(214, 200, 180, 0.9);
|
||||
border-radius: 24px;
|
||||
padding: 24px;
|
||||
box-shadow: var(--shadow);
|
||||
}
|
||||
|
||||
.card h2 {
|
||||
font-size: 1.35rem;
|
||||
margin-bottom: 14px;
|
||||
}
|
||||
|
||||
p {
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
p + p,
|
||||
ul + p,
|
||||
p + ul,
|
||||
ul + ul {
|
||||
margin-top: 12px;
|
||||
}
|
||||
|
||||
ul {
|
||||
margin: 0;
|
||||
padding-left: 20px;
|
||||
}
|
||||
|
||||
li + li {
|
||||
margin-top: 8px;
|
||||
}
|
||||
|
||||
.callout {
|
||||
padding: 16px 18px;
|
||||
border-radius: 18px;
|
||||
background: rgba(36, 92, 59, 0.08);
|
||||
border: 1px solid rgba(36, 92, 59, 0.16);
|
||||
}
|
||||
|
||||
.callout strong {
|
||||
color: var(--good);
|
||||
font-family: "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
}
|
||||
|
||||
.metrics {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
|
||||
gap: 12px;
|
||||
margin-top: 16px;
|
||||
}
|
||||
|
||||
.metric {
|
||||
padding: 14px 16px;
|
||||
border-radius: 18px;
|
||||
background: var(--surface-strong);
|
||||
border: 1px solid rgba(214, 200, 180, 0.9);
|
||||
}
|
||||
|
||||
.metric strong {
|
||||
display: block;
|
||||
font: 700 1.2rem/1.2 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
}
|
||||
|
||||
.metric span {
|
||||
color: var(--muted);
|
||||
font: 500 0.9rem/1.4 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
}
|
||||
|
||||
code,
|
||||
pre {
|
||||
font-family: "IBM Plex Mono", "SFMono-Regular", Consolas, "Liberation Mono", Menlo, monospace;
|
||||
}
|
||||
|
||||
code {
|
||||
padding: 0.08rem 0.35rem;
|
||||
border-radius: 0.45rem;
|
||||
background: rgba(99, 86, 67, 0.08);
|
||||
}
|
||||
|
||||
pre {
|
||||
overflow-x: auto;
|
||||
margin: 12px 0 0;
|
||||
padding: 16px;
|
||||
border-radius: 16px;
|
||||
background: #1f1b16;
|
||||
color: #f7f2e9;
|
||||
font-size: 0.88rem;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.diff-grid {
|
||||
display: grid;
|
||||
gap: 18px;
|
||||
}
|
||||
|
||||
.diff-shell {
|
||||
border: 1px solid rgba(214, 200, 180, 0.9);
|
||||
border-radius: 20px;
|
||||
padding: 16px;
|
||||
background: linear-gradient(180deg, #fffdf9 0%, #f7f2ea 100%);
|
||||
}
|
||||
|
||||
.diff-shell h3 {
|
||||
font-size: 1rem;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.diff-render {
|
||||
min-height: 120px;
|
||||
}
|
||||
|
||||
details {
|
||||
margin-top: 12px;
|
||||
}
|
||||
|
||||
summary {
|
||||
cursor: pointer;
|
||||
color: var(--accent);
|
||||
font: 600 0.88rem/1.3 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
}
|
||||
|
||||
.two-up {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
|
||||
gap: 20px;
|
||||
}
|
||||
|
||||
.small {
|
||||
color: var(--muted);
|
||||
font-size: 0.92rem;
|
||||
}
|
||||
|
||||
a {
|
||||
color: var(--accent);
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<main>
|
||||
<section class="hero">
|
||||
<span class="eyebrow">Turn Record · May 22, 2026</span>
|
||||
<h1>Stabilize Live API Memory and Internal Traffic</h1>
|
||||
<p class="lede">
|
||||
The Islandflow live API was repeatedly getting OOM-killed on the VPS because the hot live
|
||||
cache could retain oversized channel windows and rewrite whole Redis lists at high
|
||||
frequency. This turn applied an immediate server-side mitigation, hardened the API cache
|
||||
path in code, and rolled the changes onto the native systemd deployment.
|
||||
</p>
|
||||
<div class="hero-meta">
|
||||
<div class="meta-card">
|
||||
<span class="meta-label">Branch</span>
|
||||
<span class="meta-value"><code>stabilize-live-api-memory</code></span>
|
||||
</div>
|
||||
<div class="meta-card">
|
||||
<span class="meta-label">Beads</span>
|
||||
<span class="meta-value"><code>islandflow-thp</code></span>
|
||||
</div>
|
||||
<div class="meta-card">
|
||||
<span class="meta-label">Deployment</span>
|
||||
<span class="meta-value">Native systemd user services on the VPS</span>
|
||||
</div>
|
||||
<div class="meta-card">
|
||||
<span class="meta-label">Primary Outcome</span>
|
||||
<span class="meta-value">API RSS returned to roughly 115-130 MB after rollout</span>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<div class="grid">
|
||||
<section class="card">
|
||||
<h2>Summary</h2>
|
||||
<p>
|
||||
The live API is now bounded in three layers instead of trusting environment values and
|
||||
reconnect behavior. First, the VPS <code>.env</code> was reset to safer live-window
|
||||
values and the oversized Redis hot-cache keys were cleared. Second, the API now clamps
|
||||
generic live cache limits per channel in code. Third, generic live feed persistence now
|
||||
appends deltas into Redis instead of cloning and rewriting entire lists on every flush.
|
||||
</p>
|
||||
<div class="callout" style="margin-top: 16px">
|
||||
<strong>Observed on the VPS after rollout:</strong>
|
||||
the API stayed healthy through restart, minute metrics showed much smaller cache depths,
|
||||
and the kernel did not log any new Bun OOM kill after the hardened restart.
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Changes Made</h2>
|
||||
<ul>
|
||||
<li>
|
||||
Added channel-specific hard caps in
|
||||
<code>services/api/src/live.ts</code> so oversized
|
||||
<code>LIVE_LIMIT_*</code> values are clamped before use.
|
||||
</li>
|
||||
<li>
|
||||
Changed generic live Redis persistence from full-list rewrite behavior to append-plus-trim,
|
||||
with a rewrite fallback only when the in-memory ordering has to be rebuilt.
|
||||
</li>
|
||||
<li>
|
||||
Serialized Redis flushes during shutdown so service restarts do not race with a closing
|
||||
Redis client.
|
||||
</li>
|
||||
<li>
|
||||
Added API minute-log visibility for live subscription counts, Redis flush deltas,
|
||||
payload bytes, snapshot sizes, and process memory usage.
|
||||
</li>
|
||||
<li>
|
||||
Tightened the browser-exposed live window caps in
|
||||
<code>apps/web/app/terminal.tsx</code> and aligned the tracked env examples with the safer
|
||||
production defaults, including <code>LIVE_LIMIT_NEWS</code>.
|
||||
</li>
|
||||
<li>
|
||||
Applied the emergency mitigation directly on the VPS:
|
||||
updated <code>/home/delta/islandflow/.env</code>, created
|
||||
<code>/home/delta/islandflow/.env.backup-2026-05-22-2131</code>, deleted stale
|
||||
<code>live:*</code> Redis keys, rebuilt the web app, and restarted
|
||||
<code>islandflow-api.service</code> and <code>islandflow-web.service</code>.
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Context</h2>
|
||||
<p>
|
||||
The VPS was killing <code>islandflow-api.service</code> several times on May 22, 2026.
|
||||
Kernel logs showed Bun reaching roughly 8-9 GiB RSS inside the API service cgroup before
|
||||
the OOM killer stepped in. The API minute logs also showed channel depths pinned at
|
||||
<code>10000</code> for multiple feeds, plus massive cumulative Redis rewrite churn.
|
||||
</p>
|
||||
<p>
|
||||
Most of the “huge bandwidth” in <code>btop</code> was local loopback traffic: Bun talking
|
||||
to Redis, NATS, and ClickHouse on <code>127.0.0.1</code>. That meant the problem was not a
|
||||
public-edge flood; it was the live cache architecture multiplying internal work on the box.
|
||||
</p>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Important Implementation Details</h2>
|
||||
<div class="two-up">
|
||||
<div>
|
||||
<h3 style="margin-bottom: 10px">API hardening</h3>
|
||||
<ul>
|
||||
<li>
|
||||
Hard caps now bound generic channel windows even if env values drift upward.
|
||||
</li>
|
||||
<li>
|
||||
<code>snapshot_limit</code> is still honored, but only up to the lowest of the request,
|
||||
the configured limit, and the safe channel cap.
|
||||
</li>
|
||||
<li>
|
||||
Generic feeds use incremental Redis appends; scoped candle and overlay caches still
|
||||
use full rewrites because they are much smaller and keyed differently.
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div>
|
||||
<h3 style="margin-bottom: 10px">Operational changes</h3>
|
||||
<ul>
|
||||
<li>
|
||||
The VPS now runs with a much smaller hot live footprint:
|
||||
options <code>100</code>, flow <code>500</code>, alerts <code>300</code>,
|
||||
news <code>100</code>.
|
||||
</li>
|
||||
<li>
|
||||
Old Redis hot-cache keys were deleted so the API did not rehydrate oversized lists on boot.
|
||||
</li>
|
||||
<li>
|
||||
The web app was rebuilt on the VPS checkout after switching that checkout onto
|
||||
<code>stabilize-live-api-memory</code>.
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Relevant Diff Snippets</h2>
|
||||
<p class="small">
|
||||
These snippets are rendered with the Diffs library from
|
||||
<a href="https://diffs.com/docs">diffs.com</a>, with a plain-text fallback kept inline in the file.
|
||||
</p>
|
||||
<div class="diff-grid" style="margin-top: 18px">
|
||||
<article class="diff-shell">
|
||||
<h3><code>services/api/src/live.ts</code>: hard caps and append-based generic Redis flushes</h3>
|
||||
<div class="diff-render" id="diff-live"></div>
|
||||
<details>
|
||||
<summary>Plain-text fallback</summary>
|
||||
<pre>Added LIVE_GENERIC_LIMIT_CAPS, clamped env/configured limits, changed generic writes from
|
||||
queueRedisWrite(items:[...items]) to queueGenericRedisWrite(item, items, forceRewrite), and split
|
||||
Redis persistence into rewrite and append paths with shutdown-safe flush serialization.</pre>
|
||||
</details>
|
||||
</article>
|
||||
|
||||
<article class="diff-shell">
|
||||
<h3><code>services/api/src/index.ts</code>: minute metrics now include memory and live subscription visibility</h3>
|
||||
<div class="diff-render" id="diff-index"></div>
|
||||
<details>
|
||||
<summary>Plain-text fallback</summary>
|
||||
<pre>Added buildLiveSubscriptionMetrics(), previous snapshot tracking, flush delta logging,
|
||||
memory snapshots, and gauges for RSS, heap used, active sockets, and per-channel subscriptions.</pre>
|
||||
</details>
|
||||
</article>
|
||||
|
||||
<article class="diff-shell">
|
||||
<h3><code>.env.example</code> and <code>apps/web/app/terminal.tsx</code>: safer default windows</h3>
|
||||
<div class="diff-render" id="diff-config"></div>
|
||||
<details>
|
||||
<summary>Plain-text fallback</summary>
|
||||
<pre>Reduced LIVE_LIMIT_OPTIONS in tracked examples to 100, added LIVE_LIMIT_NEWS=100,
|
||||
and lowered the client-exposed maximum live hot windows from 100000 to 2000.</pre>
|
||||
</details>
|
||||
</article>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Expected Impact for End-Users</h2>
|
||||
<ul>
|
||||
<li>
|
||||
The hosted app should stop disappearing behind API restarts caused by the kernel OOM killer.
|
||||
</li>
|
||||
<li>
|
||||
Live feeds should still feel current, but the server will retain a tighter hot window instead of
|
||||
hoarding oversized in-memory histories.
|
||||
</li>
|
||||
<li>
|
||||
The operator experience on the VPS should improve because internal loopback churn is materially lower.
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Validation</h2>
|
||||
<ul>
|
||||
<li>
|
||||
Local API test gate passed:
|
||||
<code>bun test services/api/tests/live.test.ts</code>
|
||||
</li>
|
||||
<li>
|
||||
Local web production build passed:
|
||||
<code>bun --cwd=apps/web run build</code>
|
||||
</li>
|
||||
<li>
|
||||
VPS mitigation applied successfully. Redis reported <code>1524</code> live keys removed before restart.
|
||||
</li>
|
||||
<li>
|
||||
After mitigation restart, <code>systemctl --user status islandflow-api.service</code> showed the
|
||||
API at about <code>84 MB</code> RSS instead of multi-GB startup drift.
|
||||
</li>
|
||||
<li>
|
||||
After rolling the hardened branch onto the VPS, the API minute log at
|
||||
<code>2026-05-22 21:44:11 EDT</code> showed:
|
||||
</li>
|
||||
</ul>
|
||||
<div class="metrics">
|
||||
<div class="metric">
|
||||
<strong>119.6 MB</strong>
|
||||
<span>API RSS from the minute memory snapshot</span>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<strong>100</strong>
|
||||
<span><code>live:options</code> depth</span>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<strong>500</strong>
|
||||
<span><code>live:flow</code>, <code>live:alerts</code>, and <code>live:equity-quotes</code> caps held</span>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<strong>34,559</strong>
|
||||
<span>Redis flush items in that minute delta</span>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<strong>9.18 MB</strong>
|
||||
<span>Redis flush payload bytes in that minute delta</span>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<strong>No new OOM</strong>
|
||||
<span>Kernel logs after the hardened restart</span>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Issues, Limitations, and Mitigations</h2>
|
||||
<ul>
|
||||
<li>
|
||||
The new minute metrics report cumulative totals plus per-minute deltas. They are much more useful than the old
|
||||
absolute counters, but they still reset on process restart.
|
||||
</li>
|
||||
<li>
|
||||
<code>snapshotItemsByChannel</code> remains empty when no live websocket clients are connected.
|
||||
That is expected because snapshots are only recorded when a snapshot is actually served.
|
||||
</li>
|
||||
<li>
|
||||
Quiet feeds such as news and inferred-dark can still show very old freshness ages in logs.
|
||||
That reflects inactivity, not a broken hot path.
|
||||
</li>
|
||||
<li>
|
||||
The append-based Redis path deliberately falls back to a rewrite when out-of-order live events
|
||||
require the in-memory ordering to be rebuilt. That keeps correctness ahead of theoretical write minimization.
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Follow-up Work</h2>
|
||||
<ul>
|
||||
<li>
|
||||
Add explicit alerting for repeated API RSS growth and for minute-level flush deltas that jump far above the new baseline.
|
||||
</li>
|
||||
<li>
|
||||
Decide whether quiet-channel freshness logs should suppress extremely stale values for feeds like news to reduce operator noise.
|
||||
</li>
|
||||
<li>
|
||||
Consider moving the live cache metrics into a dashboard view so operators do not need to parse journal lines manually.
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
</div>
|
||||
</main>
|
||||
|
||||
<script type="module">
|
||||
const diffs = [
|
||||
{
|
||||
id: "diff-live",
|
||||
name: "services/api/src/live.ts",
|
||||
oldContents: `const DEFAULT_LIVE_LIMITS: GenericLiveLimits = {
|
||||
options: 100,
|
||||
nbbo: 1000,
|
||||
equities: 1000,
|
||||
"equity-quotes": 500,
|
||||
"equity-joins": 500,
|
||||
flow: 500,
|
||||
"smart-money": 300,
|
||||
"classifier-hits": 300,
|
||||
alerts: 300,
|
||||
"inferred-dark": 300,
|
||||
news: 100
|
||||
};
|
||||
|
||||
const parseGenericLimit = (env, channel, fallback) => {
|
||||
const key = GENERIC_LIMIT_ENV_KEYS[channel];
|
||||
const raw = env[key];
|
||||
if (!raw || raw.trim().length === 0) {
|
||||
return fallback;
|
||||
}
|
||||
|
||||
const parsed = Number(raw);
|
||||
const bounded = Math.max(MIN_GENERIC_LIMIT, Math.min(MAX_GENERIC_LIMIT, Math.floor(parsed)));
|
||||
return bounded;
|
||||
};
|
||||
|
||||
type BufferedRedisWrite = {
|
||||
listKey: string;
|
||||
cursorField: string;
|
||||
items: unknown[];
|
||||
limit: number;
|
||||
cursor: Cursor | null;
|
||||
updates: number;
|
||||
};
|
||||
|
||||
private queueRedisWrite(listKey, cursorField, items, limit, cursor) {
|
||||
const existing = this.pendingRedisWrites.get(listKey);
|
||||
const write: BufferedRedisWrite = {
|
||||
listKey,
|
||||
cursorField,
|
||||
items: [...items],
|
||||
limit,
|
||||
cursor,
|
||||
updates: (existing?.updates ?? 0) + 1
|
||||
};
|
||||
this.pendingRedisWrites.set(listKey, write);
|
||||
}
|
||||
|
||||
private async persistList(listKey, cursorField, items, limit, cursor) {
|
||||
const payloads = items.map((entry) => JSON.stringify(entry));
|
||||
await this.redis.lTrim(listKey, 1, 0);
|
||||
if (payloads.length > 0) {
|
||||
for (let idx = payloads.length - 1; idx >= 0; idx -= 1) {
|
||||
await this.redis.lPush(listKey, payloads[idx]);
|
||||
}
|
||||
await this.redis.lTrim(listKey, 0, limit - 1);
|
||||
}
|
||||
await this.redis.hSet(CURSOR_HASH_KEY, cursorField, JSON.stringify(cursor));
|
||||
}`,
|
||||
newContents: `export const LIVE_GENERIC_LIMIT_CAPS: GenericLiveLimits = {
|
||||
options: 100,
|
||||
nbbo: 1000,
|
||||
equities: 1000,
|
||||
"equity-quotes": 500,
|
||||
"equity-joins": 500,
|
||||
flow: 500,
|
||||
"smart-money": 300,
|
||||
"classifier-hits": 300,
|
||||
alerts: 300,
|
||||
"inferred-dark": 300,
|
||||
news: 100
|
||||
};
|
||||
|
||||
const clampConfiguredLimit = (channel: LiveGenericChannel, value: number): number =>
|
||||
Math.max(MIN_GENERIC_LIMIT, Math.min(LIVE_GENERIC_LIMIT_CAPS[channel], Math.floor(value)));
|
||||
|
||||
const parseGenericLimit = (env, channel, fallback) => {
|
||||
const key = GENERIC_LIMIT_ENV_KEYS[channel];
|
||||
const raw = env[key];
|
||||
if (!raw || raw.trim().length === 0) {
|
||||
return clampConfiguredLimit(channel, fallback);
|
||||
}
|
||||
|
||||
const parsed = Number(raw);
|
||||
const bounded = clampConfiguredLimit(channel, Math.min(MAX_GENERIC_LIMIT, parsed));
|
||||
return bounded;
|
||||
};
|
||||
|
||||
type BufferedRedisRewrite = {
|
||||
mode: "rewrite";
|
||||
listKey: string;
|
||||
cursorField: string;
|
||||
items: unknown[];
|
||||
limit: number;
|
||||
cursor: Cursor | null;
|
||||
updates: number;
|
||||
};
|
||||
|
||||
type BufferedRedisAppend = {
|
||||
mode: "append";
|
||||
listKey: string;
|
||||
cursorField: string;
|
||||
payloads: string[];
|
||||
limit: number;
|
||||
cursor: Cursor | null;
|
||||
updates: number;
|
||||
};
|
||||
|
||||
private queueGenericRedisWrite(listKey, cursorField, item, items, limit, cursor, forceRewrite = false) {
|
||||
const existing = this.pendingRedisWrites.get(listKey);
|
||||
const nextUpdateCount = (existing?.updates ?? 0) + 1;
|
||||
if (forceRewrite || existing?.mode === "rewrite") {
|
||||
this.pendingRedisWrites.set(listKey, {
|
||||
mode: "rewrite",
|
||||
listKey,
|
||||
cursorField,
|
||||
items: [...items],
|
||||
limit,
|
||||
cursor,
|
||||
updates: nextUpdateCount
|
||||
});
|
||||
} else {
|
||||
this.pendingRedisWrites.set(listKey, {
|
||||
mode: "append",
|
||||
listKey,
|
||||
cursorField,
|
||||
payloads: [...(existing?.mode === "append" ? existing.payloads : []), JSON.stringify(item)],
|
||||
limit,
|
||||
cursor,
|
||||
updates: nextUpdateCount
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
private async persistListAppend(listKey, cursorField, payloads, limit, cursor) {
|
||||
for (const payload of payloads) {
|
||||
await this.redis.lPush(listKey, payload);
|
||||
}
|
||||
await this.redis.lTrim(listKey, 0, limit - 1);
|
||||
await this.redis.hSet(CURSOR_HASH_KEY, cursorField, JSON.stringify(cursor));
|
||||
}`
|
||||
},
|
||||
{
|
||||
id: "diff-index",
|
||||
name: "services/api/src/index.ts",
|
||||
oldContents: `const liveStateMetricsTimer = setInterval(() => {
|
||||
const snapshot = liveState.getStatsSnapshot();
|
||||
const hotFeedHealth = liveState.getHotChannelHealth();
|
||||
const hotFeedLagMs = {
|
||||
options: snapshot.freshnessAgeMsByKey[HOT_LIVE_REDIS_KEYS.options] ?? null,
|
||||
equities: snapshot.freshnessAgeMsByKey[HOT_LIVE_REDIS_KEYS.equities] ?? null,
|
||||
flow: snapshot.freshnessAgeMsByKey[HOT_LIVE_REDIS_KEYS.flow] ?? null,
|
||||
nbbo: snapshot.freshnessAgeMsByKey[HOT_LIVE_REDIS_KEYS.nbbo] ?? null
|
||||
};
|
||||
logger.info("live cache metrics", {
|
||||
...snapshot,
|
||||
hotFeedLagMs,
|
||||
hotFeedHealth,
|
||||
snapshotSourceCounts: {
|
||||
generic_cache_snapshot: snapshot.genericCacheSnapshots,
|
||||
scoped_clickhouse_snapshot: snapshot.scopedClickHouseSnapshots
|
||||
}
|
||||
});
|
||||
}, 60000);`,
|
||||
newContents: `const buildLiveSubscriptionMetrics = () => {
|
||||
const uniqueSubscriptionsByChannel: Partial<Record<LiveSubscription["channel"], number>> = {};
|
||||
const socketFanoutByChannel: Partial<Record<LiveSubscription["channel"], number>> = {};
|
||||
|
||||
for (const subscription of subscriptionDefinitions.values()) {
|
||||
uniqueSubscriptionsByChannel[subscription.channel] =
|
||||
(uniqueSubscriptionsByChannel[subscription.channel] ?? 0) + 1;
|
||||
}
|
||||
|
||||
for (const [key, sockets] of subscriptionSockets.entries()) {
|
||||
const subscription = subscriptionDefinitions.get(key);
|
||||
if (!subscription || sockets.size === 0) {
|
||||
continue;
|
||||
}
|
||||
socketFanoutByChannel[subscription.channel] =
|
||||
(socketFanoutByChannel[subscription.channel] ?? 0) + sockets.size;
|
||||
}
|
||||
|
||||
return {
|
||||
liveSocketCount: liveSocketSubscriptions.size,
|
||||
uniqueSubscriptionsByChannel,
|
||||
socketFanoutByChannel
|
||||
};
|
||||
};
|
||||
|
||||
let previousLiveStats = liveState.getStatsSnapshot();
|
||||
let previousMemoryUsage = process.memoryUsage();
|
||||
|
||||
const liveStateMetricsTimer = setInterval(() => {
|
||||
const snapshot = liveState.getStatsSnapshot();
|
||||
const hotFeedHealth = liveState.getHotChannelHealth();
|
||||
const subscriptionMetrics = buildLiveSubscriptionMetrics();
|
||||
const memoryUsage = process.memoryUsage();
|
||||
const flushDelta = {
|
||||
redisFlushCount: snapshot.redisFlushCount - previousLiveStats.redisFlushCount,
|
||||
redisFlushItems: snapshot.redisFlushItems - previousLiveStats.redisFlushItems,
|
||||
redisFlushPayloadBytes: snapshot.redisFlushPayloadBytes - previousLiveStats.redisFlushPayloadBytes
|
||||
};
|
||||
const memorySnapshot = {
|
||||
rss_bytes: memoryUsage.rss,
|
||||
heap_used_bytes: memoryUsage.heapUsed,
|
||||
rss_delta_bytes: memoryUsage.rss - previousMemoryUsage.rss
|
||||
};
|
||||
|
||||
logger.info("live cache metrics", {
|
||||
...snapshot,
|
||||
flushDelta,
|
||||
memorySnapshot,
|
||||
liveSubscriptions: subscriptionMetrics
|
||||
});
|
||||
|
||||
metrics.gauge("api.memory.rss_bytes", memoryUsage.rss);
|
||||
metrics.gauge("api.live.active_sockets", subscriptionMetrics.liveSocketCount);
|
||||
}, 60000);`
|
||||
},
|
||||
{
|
||||
id: "diff-config",
|
||||
name: "config excerpt",
|
||||
oldContents: `// apps/web/app/terminal.tsx
|
||||
const LIVE_HOT_WINDOW = parseBoundedInt(process.env.NEXT_PUBLIC_LIVE_HOT_WINDOW, 600, 1, 100000);
|
||||
const LIVE_HOT_WINDOW_OPTIONS = parseBoundedInt(
|
||||
process.env.NEXT_PUBLIC_LIVE_HOT_WINDOW_OPTIONS,
|
||||
1200,
|
||||
1,
|
||||
100000
|
||||
);
|
||||
|
||||
# .env.example
|
||||
LIVE_LIMIT_OPTIONS=1000
|
||||
LIVE_LIMIT_INFERRED_DARK=300`,
|
||||
newContents: `// apps/web/app/terminal.tsx
|
||||
const LIVE_HOT_WINDOW = parseBoundedInt(process.env.NEXT_PUBLIC_LIVE_HOT_WINDOW, 600, 1, 2000);
|
||||
const LIVE_HOT_WINDOW_OPTIONS = parseBoundedInt(
|
||||
process.env.NEXT_PUBLIC_LIVE_HOT_WINDOW_OPTIONS,
|
||||
1200,
|
||||
1,
|
||||
2000
|
||||
);
|
||||
|
||||
# .env.example
|
||||
LIVE_LIMIT_OPTIONS=100
|
||||
LIVE_LIMIT_INFERRED_DARK=300
|
||||
LIVE_LIMIT_NEWS=100`
|
||||
}
|
||||
];
|
||||
|
||||
try {
|
||||
const { FileDiff } = await import("https://esm.sh/@pierre/diffs");
|
||||
for (const diff of diffs) {
|
||||
const container = document.getElementById(diff.id);
|
||||
if (!container) continue;
|
||||
const fileDiff = new FileDiff({ theme: "github-light" });
|
||||
fileDiff.render({
|
||||
oldFile: { name: diff.name, contents: diff.oldContents },
|
||||
newFile: { name: diff.name, contents: diff.newContents },
|
||||
containerWrapper: container
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn("Failed to load diffs.com renderer", error);
|
||||
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
Loading…
Add table
Add a link
Reference in a new issue