document live api stabilization rollout
This commit is contained in:
parent
20397fdef3
commit
5a68a3e38e
2 changed files with 811 additions and 1 deletions
810
docs/turns/2026-05-22-stabilize-live-api-memory.html
Normal file
810
docs/turns/2026-05-22-stabilize-live-api-memory.html
Normal file
|
|
@ -0,0 +1,810 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>Turn Record: Stabilize Live API Memory</title>
|
||||
<style>
|
||||
:root {
|
||||
color-scheme: light;
|
||||
--bg: #f3f1eb;
|
||||
--surface: #fffdf8;
|
||||
--surface-strong: #f7f2e9;
|
||||
--ink: #1f1b16;
|
||||
--muted: #62584c;
|
||||
--line: #d6c8b4;
|
||||
--accent: #8d5a2b;
|
||||
--accent-soft: rgba(141, 90, 43, 0.12);
|
||||
--good: #245c3b;
|
||||
--warn: #8a4b15;
|
||||
--shadow: 0 24px 60px rgba(61, 44, 21, 0.08);
|
||||
}
|
||||
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
body {
|
||||
margin: 0;
|
||||
background:
|
||||
radial-gradient(circle at top left, rgba(141, 90, 43, 0.12), transparent 32%),
|
||||
linear-gradient(180deg, #f7f3ec 0%, var(--bg) 100%);
|
||||
color: var(--ink);
|
||||
font: 16px/1.6 "Iowan Old Style", "Palatino Linotype", "Book Antiqua", Palatino, serif;
|
||||
}
|
||||
|
||||
main {
|
||||
width: min(1120px, calc(100vw - 40px));
|
||||
margin: 32px auto 56px;
|
||||
}
|
||||
|
||||
.hero {
|
||||
background: var(--surface);
|
||||
border: 1px solid rgba(214, 200, 180, 0.9);
|
||||
border-radius: 28px;
|
||||
padding: 32px;
|
||||
box-shadow: var(--shadow);
|
||||
}
|
||||
|
||||
.eyebrow {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
padding: 8px 12px;
|
||||
border-radius: 999px;
|
||||
background: var(--accent-soft);
|
||||
color: var(--accent);
|
||||
font: 600 12px/1.2 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
letter-spacing: 0.08em;
|
||||
text-transform: uppercase;
|
||||
}
|
||||
|
||||
h1,
|
||||
h2,
|
||||
h3 {
|
||||
margin: 0;
|
||||
font-family: "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
line-height: 1.1;
|
||||
}
|
||||
|
||||
h1 {
|
||||
margin-top: 16px;
|
||||
font-size: clamp(2.4rem, 5vw, 4rem);
|
||||
letter-spacing: -0.04em;
|
||||
}
|
||||
|
||||
.lede {
|
||||
max-width: 72ch;
|
||||
margin-top: 18px;
|
||||
color: var(--muted);
|
||||
font-size: 1.08rem;
|
||||
}
|
||||
|
||||
.hero-meta {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
|
||||
gap: 14px;
|
||||
margin-top: 28px;
|
||||
}
|
||||
|
||||
.meta-card {
|
||||
padding: 16px 18px;
|
||||
border-radius: 18px;
|
||||
background: var(--surface-strong);
|
||||
border: 1px solid rgba(214, 200, 180, 0.9);
|
||||
}
|
||||
|
||||
.meta-label {
|
||||
display: block;
|
||||
color: var(--muted);
|
||||
font: 600 0.76rem/1.2 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
letter-spacing: 0.06em;
|
||||
text-transform: uppercase;
|
||||
}
|
||||
|
||||
.meta-value {
|
||||
display: block;
|
||||
margin-top: 8px;
|
||||
font: 700 1.05rem/1.3 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
}
|
||||
|
||||
.grid {
|
||||
display: grid;
|
||||
gap: 20px;
|
||||
margin-top: 20px;
|
||||
}
|
||||
|
||||
.card {
|
||||
background: var(--surface);
|
||||
border: 1px solid rgba(214, 200, 180, 0.9);
|
||||
border-radius: 24px;
|
||||
padding: 24px;
|
||||
box-shadow: var(--shadow);
|
||||
}
|
||||
|
||||
.card h2 {
|
||||
font-size: 1.35rem;
|
||||
margin-bottom: 14px;
|
||||
}
|
||||
|
||||
p {
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
p + p,
|
||||
ul + p,
|
||||
p + ul,
|
||||
ul + ul {
|
||||
margin-top: 12px;
|
||||
}
|
||||
|
||||
ul {
|
||||
margin: 0;
|
||||
padding-left: 20px;
|
||||
}
|
||||
|
||||
li + li {
|
||||
margin-top: 8px;
|
||||
}
|
||||
|
||||
.callout {
|
||||
padding: 16px 18px;
|
||||
border-radius: 18px;
|
||||
background: rgba(36, 92, 59, 0.08);
|
||||
border: 1px solid rgba(36, 92, 59, 0.16);
|
||||
}
|
||||
|
||||
.callout strong {
|
||||
color: var(--good);
|
||||
font-family: "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
}
|
||||
|
||||
.metrics {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
|
||||
gap: 12px;
|
||||
margin-top: 16px;
|
||||
}
|
||||
|
||||
.metric {
|
||||
padding: 14px 16px;
|
||||
border-radius: 18px;
|
||||
background: var(--surface-strong);
|
||||
border: 1px solid rgba(214, 200, 180, 0.9);
|
||||
}
|
||||
|
||||
.metric strong {
|
||||
display: block;
|
||||
font: 700 1.2rem/1.2 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
}
|
||||
|
||||
.metric span {
|
||||
color: var(--muted);
|
||||
font: 500 0.9rem/1.4 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
}
|
||||
|
||||
code,
|
||||
pre {
|
||||
font-family: "IBM Plex Mono", "SFMono-Regular", Consolas, "Liberation Mono", Menlo, monospace;
|
||||
}
|
||||
|
||||
code {
|
||||
padding: 0.08rem 0.35rem;
|
||||
border-radius: 0.45rem;
|
||||
background: rgba(99, 86, 67, 0.08);
|
||||
}
|
||||
|
||||
pre {
|
||||
overflow-x: auto;
|
||||
margin: 12px 0 0;
|
||||
padding: 16px;
|
||||
border-radius: 16px;
|
||||
background: #1f1b16;
|
||||
color: #f7f2e9;
|
||||
font-size: 0.88rem;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.diff-grid {
|
||||
display: grid;
|
||||
gap: 18px;
|
||||
}
|
||||
|
||||
.diff-shell {
|
||||
border: 1px solid rgba(214, 200, 180, 0.9);
|
||||
border-radius: 20px;
|
||||
padding: 16px;
|
||||
background: linear-gradient(180deg, #fffdf9 0%, #f7f2ea 100%);
|
||||
}
|
||||
|
||||
.diff-shell h3 {
|
||||
font-size: 1rem;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.diff-render {
|
||||
min-height: 120px;
|
||||
}
|
||||
|
||||
details {
|
||||
margin-top: 12px;
|
||||
}
|
||||
|
||||
summary {
|
||||
cursor: pointer;
|
||||
color: var(--accent);
|
||||
font: 600 0.88rem/1.3 "IBM Plex Sans", "Helvetica Neue", Arial, sans-serif;
|
||||
}
|
||||
|
||||
.two-up {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
|
||||
gap: 20px;
|
||||
}
|
||||
|
||||
.small {
|
||||
color: var(--muted);
|
||||
font-size: 0.92rem;
|
||||
}
|
||||
|
||||
a {
|
||||
color: var(--accent);
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<main>
|
||||
<section class="hero">
|
||||
<span class="eyebrow">Turn Record · May 22, 2026</span>
|
||||
<h1>Stabilize Live API Memory and Internal Traffic</h1>
|
||||
<p class="lede">
|
||||
The Islandflow live API was repeatedly getting OOM-killed on the VPS because the hot live
|
||||
cache could retain oversized channel windows and rewrite whole Redis lists at high
|
||||
frequency. This turn applied an immediate server-side mitigation, hardened the API cache
|
||||
path in code, and rolled the changes onto the native systemd deployment.
|
||||
</p>
|
||||
<div class="hero-meta">
|
||||
<div class="meta-card">
|
||||
<span class="meta-label">Branch</span>
|
||||
<span class="meta-value"><code>stabilize-live-api-memory</code></span>
|
||||
</div>
|
||||
<div class="meta-card">
|
||||
<span class="meta-label">Beads</span>
|
||||
<span class="meta-value"><code>islandflow-thp</code></span>
|
||||
</div>
|
||||
<div class="meta-card">
|
||||
<span class="meta-label">Deployment</span>
|
||||
<span class="meta-value">Native systemd user services on the VPS</span>
|
||||
</div>
|
||||
<div class="meta-card">
|
||||
<span class="meta-label">Primary Outcome</span>
|
||||
<span class="meta-value">API RSS returned to roughly 115-130 MB after rollout</span>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<div class="grid">
|
||||
<section class="card">
|
||||
<h2>Summary</h2>
|
||||
<p>
|
||||
The live API is now bounded in three layers instead of trusting environment values and
|
||||
reconnect behavior. First, the VPS <code>.env</code> was reset to safer live-window
|
||||
values and the oversized Redis hot-cache keys were cleared. Second, the API now clamps
|
||||
generic live cache limits per channel in code. Third, generic live feed persistence now
|
||||
appends deltas into Redis instead of cloning and rewriting entire lists on every flush.
|
||||
</p>
|
||||
<div class="callout" style="margin-top: 16px">
|
||||
<strong>Observed on the VPS after rollout:</strong>
|
||||
the API stayed healthy through restart, minute metrics showed much smaller cache depths,
|
||||
and the kernel did not log any new Bun OOM kill after the hardened restart.
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Changes Made</h2>
|
||||
<ul>
|
||||
<li>
|
||||
Added channel-specific hard caps in
|
||||
<code>services/api/src/live.ts</code> so oversized
|
||||
<code>LIVE_LIMIT_*</code> values are clamped before use.
|
||||
</li>
|
||||
<li>
|
||||
Changed generic live Redis persistence from full-list rewrite behavior to append-plus-trim,
|
||||
with rewrite fallback only when the in-memory ordering has to be rebuilt.
|
||||
</li>
|
||||
<li>
|
||||
Serialized Redis flushes during shutdown so service restarts do not race with a closing
|
||||
Redis client.
|
||||
</li>
|
||||
<li>
|
||||
Added API minute-log visibility for live subscription counts, Redis flush deltas,
|
||||
payload bytes, snapshot sizes, and process memory usage.
|
||||
</li>
|
||||
<li>
|
||||
Tightened the browser-exposed live window caps in
|
||||
<code>apps/web/app/terminal.tsx</code> and aligned the tracked env examples with the safer
|
||||
production defaults, including <code>LIVE_LIMIT_NEWS</code>.
|
||||
</li>
|
||||
<li>
|
||||
Applied the emergency mitigation directly on the VPS:
|
||||
updated <code>/home/delta/islandflow/.env</code>, created
|
||||
<code>/home/delta/islandflow/.env.backup-2026-05-22-2131</code>, deleted stale
|
||||
<code>live:*</code> Redis keys, rebuilt the web app, and restarted
|
||||
<code>islandflow-api.service</code> and <code>islandflow-web.service</code>.
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Context</h2>
|
||||
<p>
|
||||
The VPS was killing <code>islandflow-api.service</code> several times on May 22, 2026.
|
||||
Kernel logs showed Bun reaching roughly 8-9 GiB RSS inside the API service cgroup before
|
||||
the OOM killer stepped in. The API minute logs also showed channel depths pinned at
|
||||
<code>10000</code> for multiple feeds, plus massive cumulative Redis rewrite churn.
|
||||
</p>
|
||||
<p>
|
||||
Most of the “huge bandwidth” in <code>btop</code> was local loopback traffic: Bun talking
|
||||
to Redis, NATS, and ClickHouse on <code>127.0.0.1</code>. That meant the problem was not a
|
||||
public-edge flood, it was the live cache architecture multiplying internal work on the box.
|
||||
</p>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Important Implementation Details</h2>
|
||||
<div class="two-up">
|
||||
<div>
|
||||
<h3 style="margin-bottom: 10px">API hardening</h3>
|
||||
<ul>
|
||||
<li>
|
||||
Hard caps now bound generic channel windows even if env values drift upward.
|
||||
</li>
|
||||
<li>
|
||||
<code>snapshot_limit</code> is still honored, but only up to the lower of the request,
|
||||
the configured limit, and the safe channel cap.
|
||||
</li>
|
||||
<li>
|
||||
Generic feeds use incremental Redis appends; scoped candle and overlay caches still
|
||||
use full rewrites because they are much smaller and keyed differently.
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div>
|
||||
<h3 style="margin-bottom: 10px">Operational changes</h3>
|
||||
<ul>
|
||||
<li>
|
||||
The VPS now runs with a much smaller hot live footprint:
|
||||
options <code>100</code>, flow <code>500</code>, alerts <code>300</code>,
|
||||
news <code>100</code>.
|
||||
</li>
|
||||
<li>
|
||||
Old Redis hot-cache keys were deleted so the API did not rehydrate oversized lists on boot.
|
||||
</li>
|
||||
<li>
|
||||
The web app was rebuilt on the VPS checkout after switching that checkout onto
|
||||
<code>stabilize-live-api-memory</code>.
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Relevant Diff Snippets</h2>
|
||||
<p class="small">
|
||||
These snippets are rendered with the Diffs library from
|
||||
<a href="https://diffs.com/docs">diffs.com</a>, with a plain-text fallback kept inline in the file.
|
||||
</p>
|
||||
<div class="diff-grid" style="margin-top: 18px">
|
||||
<article class="diff-shell">
|
||||
<h3><code>services/api/src/live.ts</code>: hard caps and append-based generic Redis flushes</h3>
|
||||
<div class="diff-render" id="diff-live"></div>
|
||||
<details>
|
||||
<summary>Plain-text fallback</summary>
|
||||
<pre>Added LIVE_GENERIC_LIMIT_CAPS, clamped env/configured limits, changed generic writes from
|
||||
queueRedisWrite(items:[...items]) to queueGenericRedisWrite(item, items, forceRewrite), and split
|
||||
Redis persistence into rewrite and append paths with shutdown-safe flush serialization.</pre>
|
||||
</details>
|
||||
</article>
|
||||
|
||||
<article class="diff-shell">
|
||||
<h3><code>services/api/src/index.ts</code>: minute metrics now include memory and live subscription visibility</h3>
|
||||
<div class="diff-render" id="diff-index"></div>
|
||||
<details>
|
||||
<summary>Plain-text fallback</summary>
|
||||
<pre>Added buildLiveSubscriptionMetrics(), previous snapshot tracking, flush delta logging,
|
||||
memory snapshots, and gauges for RSS, heap used, active sockets, and per-channel subscriptions.</pre>
|
||||
</details>
|
||||
</article>
|
||||
|
||||
<article class="diff-shell">
|
||||
<h3><code>.env.example</code> and <code>apps/web/app/terminal.tsx</code>: safer default windows</h3>
|
||||
<div class="diff-render" id="diff-config"></div>
|
||||
<details>
|
||||
<summary>Plain-text fallback</summary>
|
||||
<pre>Reduced LIVE_LIMIT_OPTIONS in tracked examples to 100, added LIVE_LIMIT_NEWS=100,
|
||||
and lowered the client-exposed maximum live hot windows from 100000 to 2000.</pre>
|
||||
</details>
|
||||
</article>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Expected Impact for End-Users</h2>
|
||||
<ul>
|
||||
<li>
|
||||
The hosted app should stop disappearing behind API restarts caused by the kernel OOM killer.
|
||||
</li>
|
||||
<li>
|
||||
Live feeds should still feel current, but the server will retain a tighter hot window instead of
|
||||
hoarding oversized in-memory histories.
|
||||
</li>
|
||||
<li>
|
||||
The operator experience on the VPS should improve because internal loopback churn is materially lower.
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Validation</h2>
|
||||
<ul>
|
||||
<li>
|
||||
Local API test gate passed:
|
||||
<code>bun test services/api/tests/live.test.ts</code>
|
||||
</li>
|
||||
<li>
|
||||
Local web production build passed:
|
||||
<code>bun --cwd=apps/web run build</code>
|
||||
</li>
|
||||
<li>
|
||||
VPS mitigation applied successfully. Redis reported <code>1524</code> live keys removed before restart.
|
||||
</li>
|
||||
<li>
|
||||
After mitigation restart, <code>systemctl --user status islandflow-api.service</code> showed the
|
||||
API at about <code>84 MB</code> RSS instead of multi-GB startup drift.
|
||||
</li>
|
||||
<li>
|
||||
After rolling the hardened branch onto the VPS, the API minute log at
|
||||
<code>2026-05-22 21:44:11 EDT</code> showed:
|
||||
</li>
|
||||
</ul>
|
||||
<div class="metrics">
|
||||
<div class="metric">
|
||||
<strong>119.6 MB</strong>
|
||||
<span>API RSS from the minute memory snapshot</span>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<strong>100</strong>
|
||||
<span><code>live:options</code> depth</span>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<strong>500</strong>
|
||||
<span><code>live:flow</code>, <code>live:alerts</code>, and <code>live:equity-quotes</code> caps held</span>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<strong>34,559</strong>
|
||||
<span>Redis flush items in that minute delta</span>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<strong>9.18 MB</strong>
|
||||
<span>Redis flush payload bytes in that minute delta</span>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<strong>No new OOM</strong>
|
||||
<span>Kernel logs after the hardened restart</span>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Issues, Limitations, and Mitigations</h2>
|
||||
<ul>
|
||||
<li>
|
||||
The new minute metrics are cumulative plus delta-based. They are much more useful than the old
|
||||
absolute counters, but they still reset on process restart.
|
||||
</li>
|
||||
<li>
|
||||
<code>snapshotItemsByChannel</code> remains empty when no live websocket clients are connected.
|
||||
That is expected because snapshots are only recorded when a snapshot is actually served.
|
||||
</li>
|
||||
<li>
|
||||
Quiet feeds such as news and inferred-dark can still show very old freshness ages in logs.
|
||||
That reflects inactivity, not a broken hot path.
|
||||
</li>
|
||||
<li>
|
||||
The append-based Redis path deliberately falls back to a rewrite when out-of-order live events
|
||||
require the in-memory ordering to be rebuilt. That keeps correctness ahead of theoretical write minimization.
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Follow-up Work</h2>
|
||||
<ul>
|
||||
<li>
|
||||
Add explicit alerting for repeated API RSS growth and for minute-level flush deltas that jump far above the new baseline.
|
||||
</li>
|
||||
<li>
|
||||
Decide whether quiet-channel freshness logs should suppress extremely stale values for feeds like news to reduce operator noise.
|
||||
</li>
|
||||
<li>
|
||||
Consider moving the live cache metrics into a dashboard view so operators do not need to parse journal lines manually.
|
||||
</li>
|
||||
</ul>
|
||||
</section>
|
||||
</div>
|
||||
</main>
|
||||
|
||||
<script type="module">
|
||||
const diffs = [
|
||||
{
|
||||
id: "diff-live",
|
||||
name: "services/api/src/live.ts",
|
||||
oldContents: `const DEFAULT_LIVE_LIMITS: GenericLiveLimits = {
|
||||
options: 100,
|
||||
nbbo: 1000,
|
||||
equities: 1000,
|
||||
"equity-quotes": 500,
|
||||
"equity-joins": 500,
|
||||
flow: 500,
|
||||
"smart-money": 300,
|
||||
"classifier-hits": 300,
|
||||
alerts: 300,
|
||||
"inferred-dark": 300,
|
||||
news: 100
|
||||
};
|
||||
|
||||
const parseGenericLimit = (env, channel, fallback) => {
|
||||
const key = GENERIC_LIMIT_ENV_KEYS[channel];
|
||||
const raw = env[key];
|
||||
if (!raw || raw.trim().length === 0) {
|
||||
return fallback;
|
||||
}
|
||||
|
||||
const parsed = Number(raw);
|
||||
const bounded = Math.max(MIN_GENERIC_LIMIT, Math.min(MAX_GENERIC_LIMIT, Math.floor(parsed)));
|
||||
return bounded;
|
||||
};
|
||||
|
||||
type BufferedRedisWrite = {
|
||||
listKey: string;
|
||||
cursorField: string;
|
||||
items: unknown[];
|
||||
limit: number;
|
||||
cursor: Cursor | null;
|
||||
updates: number;
|
||||
};
|
||||
|
||||
private queueRedisWrite(listKey, cursorField, items, limit, cursor) {
|
||||
const existing = this.pendingRedisWrites.get(listKey);
|
||||
const write: BufferedRedisWrite = {
|
||||
listKey,
|
||||
cursorField,
|
||||
items: [...items],
|
||||
limit,
|
||||
cursor,
|
||||
updates: (existing?.updates ?? 0) + 1
|
||||
};
|
||||
this.pendingRedisWrites.set(listKey, write);
|
||||
}
|
||||
|
||||
private async persistList(listKey, cursorField, items, limit, cursor) {
|
||||
const payloads = items.map((entry) => JSON.stringify(entry));
|
||||
await this.redis.lTrim(listKey, 1, 0);
|
||||
if (payloads.length > 0) {
|
||||
for (let idx = payloads.length - 1; idx >= 0; idx -= 1) {
|
||||
await this.redis.lPush(listKey, payloads[idx]);
|
||||
}
|
||||
await this.redis.lTrim(listKey, 0, limit - 1);
|
||||
}
|
||||
await this.redis.hSet(CURSOR_HASH_KEY, cursorField, JSON.stringify(cursor));
|
||||
}`,
|
||||
newContents: `export const LIVE_GENERIC_LIMIT_CAPS: GenericLiveLimits = {
|
||||
options: 100,
|
||||
nbbo: 1000,
|
||||
equities: 1000,
|
||||
"equity-quotes": 500,
|
||||
"equity-joins": 500,
|
||||
flow: 500,
|
||||
"smart-money": 300,
|
||||
"classifier-hits": 300,
|
||||
alerts: 300,
|
||||
"inferred-dark": 300,
|
||||
news: 100
|
||||
};
|
||||
|
||||
const clampConfiguredLimit = (channel: LiveGenericChannel, value: number): number =>
|
||||
Math.max(MIN_GENERIC_LIMIT, Math.min(LIVE_GENERIC_LIMIT_CAPS[channel], Math.floor(value)));
|
||||
|
||||
const parseGenericLimit = (env, channel, fallback) => {
|
||||
const key = GENERIC_LIMIT_ENV_KEYS[channel];
|
||||
const raw = env[key];
|
||||
if (!raw || raw.trim().length === 0) {
|
||||
return clampConfiguredLimit(channel, fallback);
|
||||
}
|
||||
|
||||
const parsed = Number(raw);
|
||||
const bounded = clampConfiguredLimit(channel, Math.min(MAX_GENERIC_LIMIT, parsed));
|
||||
return bounded;
|
||||
};
|
||||
|
||||
type BufferedRedisRewrite = {
|
||||
mode: "rewrite";
|
||||
listKey: string;
|
||||
cursorField: string;
|
||||
items: unknown[];
|
||||
limit: number;
|
||||
cursor: Cursor | null;
|
||||
updates: number;
|
||||
};
|
||||
|
||||
type BufferedRedisAppend = {
|
||||
mode: "append";
|
||||
listKey: string;
|
||||
cursorField: string;
|
||||
payloads: string[];
|
||||
limit: number;
|
||||
cursor: Cursor | null;
|
||||
updates: number;
|
||||
};
|
||||
|
||||
private queueGenericRedisWrite(listKey, cursorField, item, items, limit, cursor, forceRewrite = false) {
|
||||
const existing = this.pendingRedisWrites.get(listKey);
|
||||
const nextUpdateCount = (existing?.updates ?? 0) + 1;
|
||||
if (forceRewrite || existing?.mode === "rewrite") {
|
||||
this.pendingRedisWrites.set(listKey, {
|
||||
mode: "rewrite",
|
||||
listKey,
|
||||
cursorField,
|
||||
items: [...items],
|
||||
limit,
|
||||
cursor,
|
||||
updates: nextUpdateCount
|
||||
});
|
||||
} else {
|
||||
this.pendingRedisWrites.set(listKey, {
|
||||
mode: "append",
|
||||
listKey,
|
||||
cursorField,
|
||||
payloads: [...(existing?.mode === "append" ? existing.payloads : []), JSON.stringify(item)],
|
||||
limit,
|
||||
cursor,
|
||||
updates: nextUpdateCount
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
private async persistListAppend(listKey, cursorField, payloads, limit, cursor) {
|
||||
for (const payload of payloads) {
|
||||
await this.redis.lPush(listKey, payload);
|
||||
}
|
||||
await this.redis.lTrim(listKey, 0, limit - 1);
|
||||
await this.redis.hSet(CURSOR_HASH_KEY, cursorField, JSON.stringify(cursor));
|
||||
}`
|
||||
},
|
||||
{
|
||||
id: "diff-index",
|
||||
name: "services/api/src/index.ts",
|
||||
oldContents: `const liveStateMetricsTimer = setInterval(() => {
|
||||
const snapshot = liveState.getStatsSnapshot();
|
||||
const hotFeedHealth = liveState.getHotChannelHealth();
|
||||
const hotFeedLagMs = {
|
||||
options: snapshot.freshnessAgeMsByKey[HOT_LIVE_REDIS_KEYS.options] ?? null,
|
||||
equities: snapshot.freshnessAgeMsByKey[HOT_LIVE_REDIS_KEYS.equities] ?? null,
|
||||
flow: snapshot.freshnessAgeMsByKey[HOT_LIVE_REDIS_KEYS.flow] ?? null,
|
||||
nbbo: snapshot.freshnessAgeMsByKey[HOT_LIVE_REDIS_KEYS.nbbo] ?? null
|
||||
};
|
||||
logger.info("live cache metrics", {
|
||||
...snapshot,
|
||||
hotFeedLagMs,
|
||||
hotFeedHealth,
|
||||
snapshotSourceCounts: {
|
||||
generic_cache_snapshot: snapshot.genericCacheSnapshots,
|
||||
scoped_clickhouse_snapshot: snapshot.scopedClickHouseSnapshots
|
||||
}
|
||||
});
|
||||
}, 60000);`,
|
||||
newContents: `const buildLiveSubscriptionMetrics = () => {
|
||||
const uniqueSubscriptionsByChannel: Partial<Record<LiveSubscription["channel"], number>> = {};
|
||||
const socketFanoutByChannel: Partial<Record<LiveSubscription["channel"], number>> = {};
|
||||
|
||||
for (const subscription of subscriptionDefinitions.values()) {
|
||||
uniqueSubscriptionsByChannel[subscription.channel] =
|
||||
(uniqueSubscriptionsByChannel[subscription.channel] ?? 0) + 1;
|
||||
}
|
||||
|
||||
for (const [key, sockets] of subscriptionSockets.entries()) {
|
||||
const subscription = subscriptionDefinitions.get(key);
|
||||
if (!subscription || sockets.size === 0) {
|
||||
continue;
|
||||
}
|
||||
socketFanoutByChannel[subscription.channel] =
|
||||
(socketFanoutByChannel[subscription.channel] ?? 0) + sockets.size;
|
||||
}
|
||||
|
||||
return {
|
||||
liveSocketCount: liveSocketSubscriptions.size,
|
||||
uniqueSubscriptionsByChannel,
|
||||
socketFanoutByChannel
|
||||
};
|
||||
};
|
||||
|
||||
let previousLiveStats = liveState.getStatsSnapshot();
|
||||
let previousMemoryUsage = process.memoryUsage();
|
||||
|
||||
const liveStateMetricsTimer = setInterval(() => {
|
||||
const snapshot = liveState.getStatsSnapshot();
|
||||
const hotFeedHealth = liveState.getHotChannelHealth();
|
||||
const subscriptionMetrics = buildLiveSubscriptionMetrics();
|
||||
const memoryUsage = process.memoryUsage();
|
||||
const flushDelta = {
|
||||
redisFlushCount: snapshot.redisFlushCount - previousLiveStats.redisFlushCount,
|
||||
redisFlushItems: snapshot.redisFlushItems - previousLiveStats.redisFlushItems,
|
||||
redisFlushPayloadBytes: snapshot.redisFlushPayloadBytes - previousLiveStats.redisFlushPayloadBytes
|
||||
};
|
||||
const memorySnapshot = {
|
||||
rss_bytes: memoryUsage.rss,
|
||||
heap_used_bytes: memoryUsage.heapUsed,
|
||||
rss_delta_bytes: memoryUsage.rss - previousMemoryUsage.rss
|
||||
};
|
||||
|
||||
logger.info("live cache metrics", {
|
||||
...snapshot,
|
||||
flushDelta,
|
||||
memorySnapshot,
|
||||
liveSubscriptions: subscriptionMetrics
|
||||
});
|
||||
|
||||
metrics.gauge("api.memory.rss_bytes", memoryUsage.rss);
|
||||
metrics.gauge("api.live.active_sockets", subscriptionMetrics.liveSocketCount);
|
||||
}, 60000);`
|
||||
},
|
||||
{
|
||||
id: "diff-config",
|
||||
name: "config excerpt",
|
||||
oldContents: `// apps/web/app/terminal.tsx
|
||||
const LIVE_HOT_WINDOW = parseBoundedInt(process.env.NEXT_PUBLIC_LIVE_HOT_WINDOW, 600, 1, 100000);
|
||||
const LIVE_HOT_WINDOW_OPTIONS = parseBoundedInt(
|
||||
process.env.NEXT_PUBLIC_LIVE_HOT_WINDOW_OPTIONS,
|
||||
1200,
|
||||
1,
|
||||
100000
|
||||
);
|
||||
|
||||
# .env.example
|
||||
LIVE_LIMIT_OPTIONS=1000
|
||||
LIVE_LIMIT_INFERRED_DARK=300`,
|
||||
newContents: `// apps/web/app/terminal.tsx
|
||||
const LIVE_HOT_WINDOW = parseBoundedInt(process.env.NEXT_PUBLIC_LIVE_HOT_WINDOW, 600, 1, 2000);
|
||||
const LIVE_HOT_WINDOW_OPTIONS = parseBoundedInt(
|
||||
process.env.NEXT_PUBLIC_LIVE_HOT_WINDOW_OPTIONS,
|
||||
1200,
|
||||
1,
|
||||
2000
|
||||
);
|
||||
|
||||
# .env.example
|
||||
LIVE_LIMIT_OPTIONS=100
|
||||
LIVE_LIMIT_INFERRED_DARK=300
|
||||
LIVE_LIMIT_NEWS=100`
|
||||
}
|
||||
];
|
||||
|
||||
try {
|
||||
const { FileDiff } = await import("https://esm.sh/@pierre/diffs");
|
||||
for (const diff of diffs) {
|
||||
const container = document.getElementById(diff.id);
|
||||
if (!container) continue;
|
||||
const fileDiff = new FileDiff({ theme: "github-light" });
|
||||
fileDiff.render({
|
||||
oldFile: { name: diff.name, contents: diff.oldContents },
|
||||
newFile: { name: diff.name, contents: diff.newContents },
|
||||
containerWrapper: container
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn("Failed to load diffs.com renderer", error);
|
||||
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
Loading…
Add table
Add a link
Reference in a new issue