From d589858c03c6de8aa105fc9e7432a0720ba27b46 Mon Sep 17 00:00:00 2001 From: dirtydishes Date: Mon, 18 May 2026 03:34:24 -0400 Subject: [PATCH 1/2] Implement native fast iterative deploy workflow --- .beads/issues.jsonl | 15 +- README.md | 4 +- deployment/docker/README.md | 8 +- deployment/native/README.md | 216 ++++++++++++----- deployment/native/check-native-health.sh | 43 ++++ deployment/native/install-user-units.sh | 49 ++++ deployment/native/rollback.sh | 57 +++++ .../systemd/user/islandflow-api.service | 17 ++ .../systemd/user/islandflow-candles.service | 17 ++ .../systemd/user/islandflow-compute.service | 17 ++ .../user/islandflow-ingest-equities.service | 17 ++ .../user/islandflow-ingest-options.service | 17 ++ .../systemd/user/islandflow-web.service | 17 ++ ...-18-native-fast-iterative-deploy-plan.html | 93 ++++++++ ...26-05-18-native-fast-iterative-deploy.html | 153 ++++++++++++ ...05-18-native-fast-iterative-deploy-plan.md | 21 ++ scripts/deploy.ts | 222 ++++++++++++++---- 17 files changed, 873 insertions(+), 110 deletions(-) create mode 100755 deployment/native/check-native-health.sh create mode 100755 deployment/native/install-user-units.sh create mode 100755 deployment/native/rollback.sh create mode 100644 deployment/native/systemd/user/islandflow-api.service create mode 100644 deployment/native/systemd/user/islandflow-candles.service create mode 100644 deployment/native/systemd/user/islandflow-compute.service create mode 100644 deployment/native/systemd/user/islandflow-ingest-equities.service create mode 100644 deployment/native/systemd/user/islandflow-ingest-options.service create mode 100644 deployment/native/systemd/user/islandflow-web.service create mode 100644 docs/plans/2026-05-18-native-fast-iterative-deploy-plan.html create mode 100644 docs/turns/2026-05-18-native-fast-iterative-deploy.html create mode 100644 plans/2026-05-18-native-fast-iterative-deploy-plan.md diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index e025c4d..16eabf1 
100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -1,4 +1,4 @@ -{"_type":"issue","id":"islandflow-jbi","title":"Hydrate alert evidence details from ClickHouse","description":"Alert detail drawers need to fetch persisted alert context from ClickHouse by trace id, including linked flow packets, option prints, preserved execution context, and explicit missing refs for UI diagnostics.","status":"closed","priority":1,"issue_type":"feature","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-17T14:55:43Z","created_by":"dirtydishes","updated_at":"2026-05-17T15:01:58Z","started_at":"2026-05-17T14:55:53Z","closed_at":"2026-05-17T15:01:58Z","close_reason":"Implemented ClickHouse-backed alert context hydration across storage, API, terminal drawer, tests, and turn documentation.","dependency_count":0,"dependent_count":0,"comment_count":0} +{"_type":"issue","id":"islandflow-9rc","title":"Implement native fast iterative deploy plan","description":"Implement the checked-in plan at plans/2026-05-18-native-fast-iterative-deploy-plan.md. Cover deploy-phase timing instrumentation, native deployment operational assets, deploy guardrails, validation/cutover documentation, and any required live VPS remediation that is safely actionable from this session. 
Track follow-up items separately if anything cannot be completed in-repo or on the live host.","status":"in_progress","priority":1,"issue_type":"feature","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-18T07:15:19Z","created_by":"dirtydishes","updated_at":"2026-05-18T07:15:25Z","started_at":"2026-05-18T07:15:25Z","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-8kj","title":"Configure persistent beads Dolt remote on deltaisland server","description":"Install the beads and Dolt CLIs on the server, configure a persistent Dolt sync remote backed by the server-hosted Forgejo repository, verify refs/dolt/data publication, and document Nginx Proxy Manager / firewall considerations.","status":"closed","priority":1,"issue_type":"task","assignee":"delta","created_at":"2026-05-17T10:31:31Z","created_by":"delta","updated_at":"2026-05-17T10:37:47Z","started_at":"2026-05-17T10:32:16Z","closed_at":"2026-05-17T10:37:47Z","close_reason":"Installed bd and dolt on the server, configured the Forgejo-backed Dolt remote, published refs/dolt/data, and documented the setup.","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-200","title":"Implement durable options tape history","description":"Implement the plan from docs/plans/2026-05-16-1711-durable-options-tape-history.html: durable ClickHouse-backed options history, signal/all prints view selection, preserved execution context, stale semantics limited to live health, reset runbook, tests, and turn documentation.","status":"closed","priority":1,"issue_type":"feature","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-16T21:21:30Z","created_by":"dirtydishes","updated_at":"2026-05-16T21:26:51Z","started_at":"2026-05-16T21:21:33Z","closed_at":"2026-05-16T21:26:51Z","close_reason":"Implemented durable options tape history, signal/raw view selection, reset runbook, tests, and turn 
documentation.","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-k4f","title":"Gate deploy script on docker workspace snapshot sync","description":"Prevent frozen-lockfile build failures during deploy by adding a local preflight in scripts/deploy.ts that runs bun run check:docker-workspace and aborts with a clear sync+commit remediation message when stale.","status":"closed","priority":1,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-15T23:01:44Z","created_by":"dirtydishes","updated_at":"2026-05-15T23:04:11Z","started_at":"2026-05-15T23:01:48Z","closed_at":"2026-05-15T23:04:11Z","close_reason":"Closed","dependency_count":0,"dependent_count":0,"comment_count":0} @@ -13,14 +13,11 @@ {"_type":"issue","id":"islandflow-ayo","title":"Drop stale backlog events from live fanout","description":"Follow-up to live freshness rollout: /ws/live was still fanning out stale backlog events for freshness-gated channels, which kept tape panes in Live feed behind despite active synthetic ingest. Gate fanout and cache ingest by freshness for options/nbbo/equities/flow.","status":"closed","priority":1,"issue_type":"bug","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-04-28T21:26:39Z","created_by":"dirtydishes","updated_at":"2026-04-28T21:26:44Z","started_at":"2026-04-28T21:26:44Z","closed_at":"2026-04-28T21:26:44Z","close_reason":"Completed","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-0v6","title":"Fix tape freshness, NBBO coverage, pause controls, and filter popup","description":"Implement the tape fixes requested for synthetic options notional sizing, strict live freshness, live-mode pause/resume behavior, stronger NBBO snapshot coverage, and moving flow filters behind a popup. 
Includes server-side live cache changes, web terminal state/UI changes, and tests for synthetic pricing, live snapshot freshness/NBBO retention, and live pause/filter interactions.","status":"closed","priority":1,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-04-28T21:02:52Z","created_by":"dirtydishes","updated_at":"2026-04-28T21:13:38Z","started_at":"2026-04-28T21:02:57Z","closed_at":"2026-04-28T21:13:38Z","close_reason":"Completed","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-e4r","title":"Implement smart-money flow filtering and synthetic firehose modes","description":"Implement the approved multi-surface plan for named synthetic market profiles, options raw-vs-signal filtering, live/API filter contracts, Tape page client-side flow filters, firehose-readiness improvements, tests, and README updates.","status":"closed","priority":1,"issue_type":"feature","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-04-28T20:10:49Z","created_by":"dirtydishes","updated_at":"2026-04-28T20:29:29Z","started_at":"2026-04-28T20:10:53Z","closed_at":"2026-04-28T20:29:29Z","close_reason":"Implemented synthetic market profiles, options signal-path filtering, signal-aware API/replay contracts, Tape page filters, tests, and README updates. Follow-up tracked in islandflow-biq.","dependency_count":0,"dependent_count":0,"comment_count":0} -{"_type":"issue","id":"islandflow-1ei","title":"Make deploy helper remote-aware for Forgejo","description":"Why: scripts/deploy.ts hardcodes git remote name origin for fetch/pull/push and branch verification, but this repository now uses forgejo/github remotes and may not have an origin remote. 
What: update deploy.ts to resolve the deploy git remote robustly (Forgejo-aware), use it across local prechecks, branch publish, and remote rollout git operations, and keep behavior explicit in output.","status":"closed","priority":2,"issue_type":"bug","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-18T03:20:12Z","created_by":"dirtydishes","updated_at":"2026-05-18T03:22:39Z","started_at":"2026-05-18T03:20:16Z","closed_at":"2026-05-18T03:22:39Z","close_reason":"Closed","dependency_count":0,"dependent_count":0,"comment_count":0} -{"_type":"issue","id":"islandflow-xod","title":"Add --fast mode to deploy helper","description":"Why: full main deploys rebuild all images and run full verification, which is slow for routine rollouts. What: add a --fast flag to scripts/deploy.ts with explicit behavior that short-circuits slow steps while preserving basic safety checks; update help text/docs for discoverability.","status":"closed","priority":2,"issue_type":"feature","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-18T02:50:47Z","created_by":"dirtydishes","updated_at":"2026-05-18T02:53:41Z","started_at":"2026-05-18T02:50:50Z","closed_at":"2026-05-18T02:53:41Z","close_reason":"Closed","dependency_count":0,"dependent_count":0,"comment_count":0} -{"_type":"issue","id":"islandflow-cif","title":"hydrate alert evidence context from clickhouse","description":"Implement alert detail hydration from ClickHouse with a new context endpoint and frontend drawer evidence resolution. 
Includes storage lookup by alert trace_id/evidence refs, unresolved refs diagnostics, API route GET /flow/alerts/:trace_id/context, terminal evidence hydration + loading states/copy updates, and tests across storage/api/web.","status":"closed","priority":2,"issue_type":"feature","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-18T00:15:55Z","created_by":"dirtydishes","updated_at":"2026-05-18T00:17:38Z","started_at":"2026-05-18T00:16:00Z","closed_at":"2026-05-18T00:17:38Z","close_reason":"Closed","dependency_count":0,"dependent_count":0,"comment_count":0} -{"_type":"issue","id":"islandflow-4e9","title":"Polish terminal view","description":"Improve the Islandflow web terminal view with a focused UI polish pass aligned to the product design system.","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-17T15:18:18Z","created_by":"dirtydishes","updated_at":"2026-05-17T15:25:02Z","started_at":"2026-05-17T15:18:21Z","closed_at":"2026-05-17T15:25:02Z","close_reason":"Polished terminal shell styling, responsive Tape actions, and documented the turn.","dependency_count":0,"dependent_count":0,"comment_count":0} -{"_type":"issue","id":"islandflow-lyt","title":"Summarize 2026-05-16 git activity for standup","description":"Create a grounded standup summary for yesterday's git activity, anchored to commits, changed files, and any linked PR context if present. 
Produce the required HTML document in docs/general and complete the beads + git handoff workflow.","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-17T14:02:57Z","created_by":"dirtydishes","updated_at":"2026-05-17T14:05:37Z","started_at":"2026-05-17T14:03:09Z","closed_at":"2026-05-17T14:05:37Z","close_reason":"Created docs/general standup summary for 2026-05-16 git activity, grounded to commits and changed files, and prepared the repo handoff workflow.","dependency_count":0,"dependent_count":0,"comment_count":0} -{"_type":"issue","id":"islandflow-sz8","title":"Fix public /replay/options proxy regression","description":"## Summary\nThe new deploy-time public route checker added in commit 1424a27 (\"fix durable options history routing\") currently fails against https://flow.deltaisland.io because GET /replay/options returns HTML instead of JSON.\n\n## Evidence\n- `bun run scripts/check-public-api-routes.ts https://flow.deltaisland.io` fails on `/replay/options?view=signal\u0026after_ts=0\u0026after_seq=0\u0026limit=1` with `returned non-JSON content (text/html; charset=UTF-8)`\n- `services/api/src/index.ts` implements `GET /replay/options`, so the HTML response indicates the request is landing on the web app instead of the API service\n- `deployment/docker/README.md` documents that same-origin proxy mode must include `/replay/*` in the API route matcher\n\n## Minimal Fix\nUpdate the live reverse proxy / edge route matcher for flow.deltaisland.io so `/replay/*` is forwarded to the API host, then rerun `bun run check:public-api-routes`.\n\n## Notes\nThis looks like a production proxy configuration regression rather than an in-repo application bug.","status":"open","priority":2,"issue_type":"bug","owner":"dishes@dpdrm.com","created_at":"2026-05-17T13:06:11Z","created_by":"dirtydishes","updated_at":"2026-05-17T13:06:11Z","dependency_count":0,"dependent_count":0,"comment_count":0} 
+{"_type":"issue","id":"islandflow-vvw","title":"Stage native public-edge cutover after worker soak","description":"Why this issue exists and what needs to be done:\\n- The native deploy path is now provisioned for worker-first iteration, with checked-in user units, rollback helpers, and edge guardrails\\n- Remaining work is to enable and soak native worker units, validate duplicate-processing behavior, then deliberately cut over the public web/api edge if warranted\\n- Final acceptance should include deciding whether Docker or native becomes the default runtime after operational evidence","status":"open","priority":2,"issue_type":"task","owner":"dishes@dpdrm.com","created_at":"2026-05-18T07:32:35Z","created_by":"dirtydishes","updated_at":"2026-05-18T07:32:35Z","dependency_count":0,"dependent_count":0,"comment_count":0} +{"_type":"issue","id":"islandflow-bsg","title":"Fix public /replay/options proxy regression","description":"Restore correct public routing for GET /replay/options on flow.deltaisland.io. The app currently serves HTML for that API path, which indicates edge/proxy routing drift. Update the live proxy topology or deployment assets as needed, then validate with bun run scripts/check-public-api-routes.ts.","status":"closed","priority":2,"issue_type":"bug","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-18T07:15:19Z","created_by":"dirtydishes","updated_at":"2026-05-18T07:32:51Z","started_at":"2026-05-18T07:15:24Z","closed_at":"2026-05-18T07:32:51Z","close_reason":"Audited the live VPS and reverse proxy on 2026-05-18: public /replay/options now returns JSON, bun run scripts/check-public-api-routes.ts passes, and the active Nginx Proxy Manager config includes /replay in the API route matcher. 
No in-repo app code change was required.","dependency_count":0,"dependent_count":0,"comment_count":0} +{"_type":"issue","id":"islandflow-9j5","title":"Prepare PR for deploy allowlist cleanup","description":"Why this issue exists and what needs to be done:\\n- Package current deploy allowlist cleanup into a reviewable PR with multiple commits\\n- Add required turn documentation in docs/turns\\n- Run validation and push all artifacts","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-17T15:44:12Z","created_by":"dirtydishes","updated_at":"2026-05-17T15:53:55Z","started_at":"2026-05-17T15:44:22Z","closed_at":"2026-05-17T15:53:55Z","close_reason":"Packaged deploy allowlist cleanup into multi-commit PR branch with required turn documentation and push workflow.","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-0sa","title":"Fix live tape auto-hold, history seam, and remove manual pause control","description":"The live tape should automatically hold when the user scrolls away from the top, resume when they return to the top or use Jump to top, and keep older prints available seamlessly beyond the hot window. Manual Pause/Resume control is now redundant and should be removed from live tape panes. This work should also fix the current regression where paused/held tapes still mutate, and align the options tape with a strict 100-row hot head backed by ClickHouse history.","notes":"Implemented live scroll-hold with no live pause button, demand-loaded ClickHouse history, a 100-row options hot head, and cache-first scoped snapshots. 
Validated with bun test apps/web/app/terminal.test.ts services/api/tests/live.test.ts and bun --cwd=apps/web run build.","status":"closed","priority":2,"issue_type":"bug","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-16T18:12:51Z","created_by":"dirtydishes","updated_at":"2026-05-16T18:23:43Z","started_at":"2026-05-16T18:12:54Z","closed_at":"2026-05-16T18:23:43Z","close_reason":"Closed","dependency_count":0,"dependent_count":0,"comment_count":0} -{"_type":"issue","id":"islandflow-2db","title":"Manually remove stale islandflow local-infra containers from VPS","description":"The live VPS still has an older compose project named islandflow created from the repo-root docker-compose.yml. Inspection shows it is separate from the supported islandflow-vps deployment stack and exposes NATS, ClickHouse, and Redis on host ports. Container removal commands currently hang when run as the delta user through Docker, so cleanup likely needs a focused maintenance window and possibly host-level intervention or a Docker daemon restart.","notes":"The duplicate islandflow compose project on the VPS was confirmed live during inspection. Nginx Proxy Manager routes public traffic only to islandflow-vps web/api by Docker name, so the stale islandflow project appears to be stray local-infra state rather than part of the supported production path. 
Attempts to remove the stale containers with docker compose down and docker rm -f as the delta user hung and timed out, so manual cleanup likely needs a maintenance window and possibly Docker daemon intervention.","status":"open","priority":2,"issue_type":"task","owner":"dishes@dpdrm.com","created_at":"2026-05-16T01:27:27Z","created_by":"dirtydishes","updated_at":"2026-05-16T01:28:59Z","dependency_count":0,"dependent_count":0,"comment_count":0} +{"_type":"issue","id":"islandflow-2db","title":"Manually remove stale islandflow local-infra containers from VPS","description":"The live VPS still has an older compose project named islandflow created from the repo-root docker-compose.yml. Inspection shows it is separate from the supported islandflow-vps deployment stack and exposes NATS, ClickHouse, and Redis on host ports. Container removal commands currently hang when run as the delta user through Docker, so cleanup likely needs a focused maintenance window and possibly host-level intervention or a Docker daemon restart.","notes":"The duplicate islandflow compose project on the VPS was confirmed live during inspection. Nginx Proxy Manager routes public traffic only to islandflow-vps web/api by Docker name, so the stale islandflow project appears to be stray local-infra state rather than part of the supported production path. 
Attempts to remove the stale containers with docker compose down and docker rm -f as the delta user hung and timed out, so manual cleanup likely needs a maintenance window and possibly Docker daemon intervention.","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-16T01:27:27Z","created_by":"dirtydishes","updated_at":"2026-05-18T07:32:48Z","started_at":"2026-05-18T07:15:25Z","closed_at":"2026-05-18T07:32:48Z","close_reason":"Audited the live VPS on 2026-05-18: docker compose ls and container labels no longer show a duplicate islandflow compose project, so the stale local-infra stack cleanup appears to already be resolved on the host.","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-c87","title":"Clean up duplicate Islandflow Docker infra on VPS","description":"The live VPS is currently running both the production-style islandflow-vps Docker stack and an older root-level islandflow infra stack that publishes NATS, ClickHouse, and Redis on host ports. Investigate whether the older stack is unused, remove it safely if so, and update docs/deploy guidance so the server topology is clearer.","notes":"Inspected the live VPS and confirmed the duplicate compose project: islandflow-vps is the supported deployment stack, while a separate islandflow project from the repo-root docker-compose.yml still runs exposed NATS/ClickHouse/Redis containers. Verified Nginx Proxy Manager routes only to islandflow-vps web/api by Docker name. Attempted cleanup via docker compose down and docker rm -f on the stale islandflow containers, but those commands hung for the delta user and timed out. 
Added repo guardrails and docs so deploy warns when the duplicate project exists, and opened islandflow-2db for manual host-level cleanup during a maintenance window.","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-16T01:16:05Z","created_by":"dirtydishes","updated_at":"2026-05-16T01:28:07Z","started_at":"2026-05-16T01:16:09Z","closed_at":"2026-05-16T01:28:07Z","close_reason":"Completed the repo-side investigation and guardrails. Actual server-side container removal is blocked by hanging Docker operations and is tracked separately in islandflow-2db for a maintenance window.","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-4gj","title":"Clarify Docker-first deploy workflow and mark native runtime experimental","description":"After inspecting the live VPS, native deployment is not ready for routine use: Nginx Proxy Manager routes to Docker container names, Bun is not installed on the host, sudo systemctl is not passwordless, and no Islandflow units exist. Update deploy messaging and docs so Docker remains the clearly recommended deployment path and native runtime is labeled experimental/future-facing with server prerequisites called out.","notes":"Updated deploy messaging and docs after live VPS inspection. scripts/deploy.ts now marks Docker as the default and recommended runtime, labels native as experimental, switches native systemctl default to sudo -n systemctl, and prints explicit native precheck failures for missing Bun/systemctl access/units. Updated README.md, deployment/docker/README.md, and deployment/native/README.md to reflect the current Docker + Nginx Proxy Manager topology. 
Validation: ./deploy --help, ./deploy main --runtime native --no-build (fails fast with Bun-missing message), bun run check:docker-workspace.","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-16T01:10:11Z","created_by":"dirtydishes","updated_at":"2026-05-16T01:12:39Z","started_at":"2026-05-16T01:10:14Z","closed_at":"2026-05-16T01:12:39Z","close_reason":"Closed","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-7p2","title":"Fix deploy wrapper argument forwarding for runtime flags","description":"The repo-root deploy wrapper currently invokes bun run without a -- separator, so flags like --runtime native are treated as Bun CLI flags instead of script arguments. Update the wrapper so ./deploy main --runtime native forwards arguments correctly to scripts/deploy.ts.","notes":"Cherry-picked the dual-runtime deploy workflow onto main and fixed the repo-root deploy wrapper to call Bun with a -- separator so flags like --runtime native are forwarded to scripts/deploy.ts correctly. Validation: ./deploy --help, ./deploy main --runtime native --force-recreate guard, bun run check:docker-workspace.","status":"closed","priority":2,"issue_type":"bug","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-16T00:51:05Z","created_by":"dirtydishes","updated_at":"2026-05-16T00:52:34Z","started_at":"2026-05-16T00:51:10Z","closed_at":"2026-05-16T00:52:34Z","close_reason":"Closed","dependency_count":0,"dependent_count":0,"comment_count":0} @@ -47,5 +44,5 @@ {"_type":"issue","id":"islandflow-igk","title":"Add plan mode","description":"Implement a user-facing plan mode in the application so users can switch into planning before taking action. 
Scope to be clarified from existing app patterns.","status":"closed","priority":2,"issue_type":"feature","owner":"dishes@dpdrm.com","created_at":"2026-05-04T04:22:37Z","created_by":"dirtydishes","updated_at":"2026-05-04T04:26:18Z","started_at":"2026-05-04T04:22:40Z","closed_at":"2026-05-04T04:26:18Z","close_reason":"Implemented as a global pi extension toggled with Shift+P","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-biq","title":"Finish raw live options delivery and filter/backpressure observability","description":"The smart-money signal path and Tape filters are in place, but the next firehose pass should finish server-side selective raw live delivery for options subscriptions and add explicit filtered-out/backpressure observability for API/web counters. This was discovered while landing islandflow-e4r.\n","status":"in_progress","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-04-28T20:28:58Z","created_by":"dirtydishes","updated_at":"2026-04-29T03:54:12Z","started_at":"2026-04-29T03:54:12Z","dependencies":[{"issue_id":"islandflow-biq","depends_on_id":"islandflow-e4r","type":"discovered-from","created_at":"2026-04-28T16:28:58Z","created_by":"auto-import","metadata":"{}"}],"dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-zsy","title":"Expose Forgejo SSH on a direct DNS hostname","description":"git.deltaisland.io currently resolves through Cloudflare's proxy, so SSH on port 2222 does not complete even though the Forgejo container is listening on the host. 
If SSH-based git/beads workflows are desired, add a DNS-only hostname (or adjust the existing record) that points directly at the server for Forgejo SSH.","status":"open","priority":3,"issue_type":"task","created_at":"2026-05-17T10:34:06Z","created_by":"delta","updated_at":"2026-05-17T10:34:06Z","dependency_count":0,"dependent_count":0,"comment_count":0} -{"_type":"issue","id":"islandflow-38p","title":"Add native deployment unit templates and rollback helpers","description":"The deploy helper now supports --runtime native, but the repo still relies on operator-managed systemd units and manual rollback. Add checked-in native deployment templates or provisioning guidance for the expected units, and consider lightweight rollback/smoke-test helpers once the host-native path is exercised on the real VPS.","status":"open","priority":3,"issue_type":"task","owner":"dishes@dpdrm.com","created_at":"2026-05-15T23:46:42Z","created_by":"dirtydishes","updated_at":"2026-05-15T23:46:42Z","dependency_count":0,"dependent_count":0,"comment_count":0} +{"_type":"issue","id":"islandflow-38p","title":"Add native deployment unit templates and rollback helpers","description":"The deploy helper now supports --runtime native, but the repo still relies on operator-managed systemd units and manual rollback. 
Add checked-in native deployment templates or provisioning guidance for the expected units, and consider lightweight rollback/smoke-test helpers once the host-native path is exercised on the real VPS.","status":"closed","priority":3,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-15T23:46:42Z","created_by":"dirtydishes","updated_at":"2026-05-18T07:34:02Z","started_at":"2026-05-18T07:15:25Z","closed_at":"2026-05-18T07:34:02Z","close_reason":"Added checked-in native user unit templates, install/smoke-test/rollback helpers, updated native deploy docs with worker-first guidance, and installed the unit files onto the VPS in disabled form.","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-575","title":"Document smart-money event calendar env","description":"Document smart-money event-calendar environment configuration in env examples and README.\n","status":"closed","priority":3,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-05T06:57:14Z","created_by":"dirtydishes","updated_at":"2026-05-05T06:57:57Z","started_at":"2026-05-05T06:57:17Z","closed_at":"2026-05-05T06:57:57Z","close_reason":"Documented event-calendar env variables","dependency_count":0,"dependent_count":0,"comment_count":0} diff --git a/README.md b/README.md index 50063d9..98d0936 100644 --- a/README.md +++ b/README.md @@ -129,8 +129,10 @@ This keeps Docker in the local workflow where it helps most (NATS, ClickHouse, R - `./deploy main` keeps the current VPS Docker rollout path as the default and recommended path. - Do not run the repo-root `docker-compose.yml` on the VPS. That file is for local infra only and can create duplicate exposed NATS, ClickHouse, and Redis containers on the server. - `./deploy main --runtime native` targets an experimental host-native Bun + systemd deployment. 
+- Native deploys are now intended primarily for worker-only fast iteration until the public edge is cut over deliberately. - `./deploy current-branch` and `./deploy current-branch --runtime native` keep branch deploys available during the transition, but Docker remains the supported path for the current VPS. -- Partial deploys are supported with `--web-only`, `--api-only`, `--services-only`, and `--no-build`. +- Partial deploys are supported with `--web-only`, `--api-only`, `--services-only`, `--workers-only`, and `--no-build`. +- When run from `/home/delta/islandflow` on the VPS itself, `./deploy` can execute locally instead of SSHing back into the same server. - Docker runtime details live in `deployment/docker/README.md`. - Native runtime expectations and prerequisites live in `deployment/native/README.md`. diff --git a/deployment/docker/README.md b/deployment/docker/README.md index 2b167da..ed80c53 100644 --- a/deployment/docker/README.md +++ b/deployment/docker/README.md @@ -217,13 +217,15 @@ The current live VPS uses Nginx Proxy Manager on the shared Docker network and r The deploy helper also warns if it detects a second compose project named `islandflow` on the server, because that usually means the repo-root local-infra stack was started on the VPS by mistake. -The checked-in deploy helper is meant to run from your local repo checkout, not from the VPS shell. It always targets: +The checked-in deploy helper normally runs from your local repo checkout and targets: - SSH host: `delta@152.53.80.229` -- SSH key: `~/.ssh/delta_ed25519` +- SSH key: `~/.ssh/delta_ed25519` by default - Live repo checkout: `/home/delta/islandflow` - Live compose directory: `/home/delta/islandflow/deployment/docker` +If you run `./deploy` from `/home/delta/islandflow` on the VPS itself, it now executes the remote steps locally instead of SSHing back into the same machine. 
You can still force SSH with `DEPLOY_FORCE_SSH=1`, or override the key path with `DEPLOY_SSH_KEY_PATH=/path/to/key`. + It preserves the current Docker Compose project and avoids destructive cleanup on the server. ### Deploy `origin/main` @@ -271,6 +273,7 @@ Examples: ./deploy main --runtime docker --web-only ./deploy main --runtime docker --api-only ./deploy current-branch --runtime docker --services-only +./deploy main --runtime docker --workers-only ./deploy main --runtime docker --fast ./deploy main --runtime docker --web-only --no-build ``` @@ -280,6 +283,7 @@ Scoped Docker deploys now build only the selected image set and then restart onl - `--web-only`: `docker compose build web`, then `docker compose up -d web` - `--api-only`: `docker compose build api`, then `docker compose up -d api` - `--services-only`: builds and restarts `api`, `compute`, `candles`, `ingest-options`, and `ingest-equities` +- `--workers-only`: builds and restarts `compute`, `candles`, `ingest-options`, and `ingest-equities` without touching `web` or `api` - `--fast`: when no explicit scope flag is given, treats the deploy as `--services-only` and skips the public API route suite for quicker completion. It still runs remote service health checks. Use `--no-build` only when the image is already correct and you need Compose to recreate or restart containers, such as after changing server-side environment values that do not affect a Next.js build-time variable. Do not use `--no-build` for dependency changes, application source changes, or `NEXT_PUBLIC_*` changes. 
diff --git a/deployment/native/README.md b/deployment/native/README.md index a9903cc..4e2dd52 100644 --- a/deployment/native/README.md +++ b/deployment/native/README.md @@ -1,29 +1,114 @@ # Native Deployment -This directory documents the experimental host-native Islandflow rollout path used by: +This directory documents the host-native Islandflow rollout path used by: ```bash ./deploy main --runtime native ./deploy current-branch --runtime native ``` -This runtime is intended for faster server iteration during the transition away from Docker-only app rollouts. It is not the recommended path for the current production VPS, which still uses Nginx Proxy Manager to reach the Docker `web` and `api` containers by container name on the shared Docker network. Local development should still prefer: +## Current operating model -- Docker for infra (`bun run dev:infra`) -- native Bun services (`bun run dev:services`) -- native Next.js web (`bun run dev:web`) +Native runtime is now intended for **fast iterative backend deploys first**, while Docker remains the supported public production edge until a deliberate cutover is completed. 
+ +Today, the recommended split is: + +- **Docker runtime** for the live public `web` + `api` path +- **Native runtime** for worker-only iteration (`compute`, `candles`, `ingest-options`, `ingest-equities`) +- local development stays: + - Docker infra: `bun run dev:infra` + - native backend services: `bun run dev:services` + - native web: `bun run dev:web` ## What native deploy means here The checked-in `deploy` helper assumes: -- the live repo checkout is still `/home/delta/islandflow` +- the live repo checkout is `/home/delta/islandflow` - Bun is installed on the VPS -- app processes are managed by `systemd` -- infrastructure services such as NATS, ClickHouse, and Redis are already reachable from the host +- app processes are managed by `systemd --user` +- infrastructure services such as NATS, ClickHouse, and Redis are reachable from the host - the web app runs from `apps/web` and is served with `next start -p 3000` -The deploy script updates the repo checkout, optionally runs `bun install --frozen-lockfile`, optionally rebuilds the web app, restarts the target systemd units, and then verifies the services locally on the VPS plus through the public app URL. +The deploy script updates the repo checkout, optionally runs `bun install --frozen-lockfile`, optionally rebuilds the web app, restarts the target user units, verifies local health, and then runs public verification when the selected scope includes the public edge. 
+ +## Live audit status on 2026-05-18 + +The plan assumptions were audited on the VPS: + +- `bun` is installed and available at `/home/delta/.bun/bin/bun` +- `systemctl --user` is available and the `delta` user has lingering enabled +- `/home/delta/islandflow/.env` exists +- public `https://flow.deltaisland.io/replay/options` routing is healthy again +- the previously reported duplicate `islandflow` compose project is not currently present in `docker compose ls` +- native Islandflow user units were not installed at the start of the audit; this change now provides and installs the checked-in user unit files, but they remain disabled until an operator enables a scope intentionally + +That means native worker deploy support is now provisioned on the host, but native runtime should still be enabled scope-by-scope rather than started wholesale. + +## Checked-in native ops assets + +### User unit templates + +Checked-in unit files live under: + +- `deployment/native/systemd/user/islandflow-web.service` +- `deployment/native/systemd/user/islandflow-api.service` +- `deployment/native/systemd/user/islandflow-compute.service` +- `deployment/native/systemd/user/islandflow-candles.service` +- `deployment/native/systemd/user/islandflow-ingest-options.service` +- `deployment/native/systemd/user/islandflow-ingest-equities.service` + +These are written for the current VPS layout: + +- repo root: `/home/delta/islandflow` +- Bun binary: `/home/delta/.bun/bin/bun` +- env file: `/home/delta/islandflow/.env` + +### Install the units + +```bash +./deployment/native/install-user-units.sh +./deployment/native/install-user-units.sh workers +systemctl --user start islandflow-compute.service +``` + +Install script behavior: + +- copies the checked-in unit files into `~/.config/systemd/user` +- reloads the user systemd daemon +- enables only the scope you explicitly request +- defaults to installing without enabling anything yet + +### Smoke test helper + +```bash 
+./deployment/native/check-native-health.sh workers +./deployment/native/check-native-health.sh services +./deployment/native/check-native-health.sh full +``` + +This validates: + +- `systemctl --user is-active` for the selected units +- local API health at `http://127.0.0.1:4000/health` when API scope is included +- local web health at `http://127.0.0.1:3000/` when web scope is included + +### Rollback helper + +```bash +./deployment/native/rollback.sh <ref> workers +./deployment/native/rollback.sh <ref> services +``` + +Rollback helper behavior: + +- requires a clean repo state +- fetches refs +- switches the checkout to a detached target ref +- reruns `bun install --frozen-lockfile` +- rebuilds the web app only when web scope is included +- restarts the selected user units +- runs the native smoke checks ## Expected unit names @@ -54,87 +139,104 @@ Available overrides: ## systemctl invocation -By default the deploy helper uses: - -```bash -sudo -n systemctl -``` - -If the server uses user units or another wrapper, override it locally before invoking `./deploy`: +For the checked-in user units, use: ```bash export DEPLOY_NATIVE_SYSTEMCTL_PREFIX="systemctl --user" -./deploy main --runtime native ``` +The deploy helper defaults to `sudo -n systemctl`, but that is only appropriate if you intentionally install matching system units.
+ ## Partial native rollouts Examples: ```bash -./deploy main --runtime native --web-only -./deploy main --runtime native --api-only -./deploy current-branch --runtime native --services-only +./deploy main --runtime native --workers-only ./deploy main --runtime native --fast -./deploy main --runtime native --web-only --no-build +./deploy main --runtime native --services-only +./deploy main --runtime native --web-only +./deploy current-branch --runtime native --workers-only --no-build ``` Scope behavior: -- default: restart web + API + backend services +- default: restart web + API + worker services - `--web-only`: rebuild/restart only the web unit - `--api-only`: restart only the API unit -- `--services-only`: restart API + backend units without touching the web unit -- `--fast`: when no explicit scope flag is provided, uses the same `--services-only` scope and trims verbose verification output for quicker completion +- `--services-only`: restart API + worker units without touching the web unit +- `--workers-only`: restart only `compute`, `candles`, `ingest-options`, and `ingest-equities` +- `--fast`: when no explicit scope flag is provided, native deploys now default to `--workers-only` - `--no-build`: skip `bun install --frozen-lockfile` and skip the web build step -## Current status +## Edge-cutover guardrail -On the current live VPS, native deploys should be treated as opt-in infrastructure work, not the default rollout path. Before a native deploy can succeed there, all of the following must be true at the same time: - -- Bun is installed on the host. -- The selected `systemctl` command works non-interactively. -- Islandflow systemd units exist for the requested scope. -- Host-native services can reach the intended NATS, ClickHouse, and Redis endpoints. -- If `web` or `api` move native, the reverse proxy topology is updated deliberately. 
- -Until that is prepared intentionally, prefer: +Native deploys that touch the public web or API edge are intentionally blocked unless you acknowledge cutover readiness: ```bash -./deploy main --runtime docker -./deploy current-branch --runtime docker +export DEPLOY_NATIVE_EDGE_READY=1 ``` -## Server preparation checklist +Without that variable, these commands are refused: -Before the first native rollout, ensure the VPS has: +- `./deploy main --runtime native` +- `./deploy main --runtime native --web-only` +- `./deploy main --runtime native --api-only` +- `./deploy main --runtime native --services-only` -1. Bun installed and on `PATH` -2. a working `/home/delta/islandflow/.env` (or unit-managed equivalent env source) -3. systemd units for each target service -4. the web unit configured to serve the built app on port `3000` -5. the API unit configured to serve health checks on port `4000` -6. infrastructure endpoints configured so the native services can reach NATS, ClickHouse, and Redis +This keeps the native path focused on safe worker iteration until proxy routing and public unit ownership are switched deliberately. -## Verification +## Running deploy from the VPS itself -Native deploys verify: +If you run `./deploy` from `/home/delta/islandflow` on the live server, the deploy helper now executes the remote steps locally instead of SSHing back into the same machine. 
-- target units are active via `systemctl` -- recent unit status and journal output can be collected -- local `http://127.0.0.1:4000/health` when API scope is included -- local `http://127.0.0.1:3000/` when web scope is included -- the public app URL from the local machine after the rollout finishes +That means: -## Rollback +- no SSH key is required for on-server deploy execution +- timing and verification behavior stay the same +- you can still force SSH with `DEPLOY_FORCE_SSH=1` +- you can override the SSH key path with `DEPLOY_SSH_KEY_PATH=/path/to/key` -Rollback remains manual for now: +## Validation matrix -1. switch the server checkout back to the last known-good branch or commit -2. rerun the appropriate native deploy command -3. if needed, restart only the affected units with `systemctl` +| Area | Native workers-only | Native edge cutover | +| --- | --- | --- | +| Bun installed | required | required | +| `systemctl --user` works | required | required | +| Islandflow user units installed | worker units only | all units | +| Host access to NATS/ClickHouse/Redis | required | required | +| Proxy routes updated for `/prints`, `/history`, `/replay`, `/nbbo`, `/ws`, `/flow`, `/candles` | not required | required | +| Public app check | not required | required | +| Public API route suite | not required | required | -Docker remains the fallback and currently recommended runtime during the transition: +## Staged cutover plan + +1. **Stage 1: native workers only** + - install user units + - validate `./deployment/native/check-native-health.sh workers` + - use `./deploy main --runtime native --fast` +2. **Stage 2: native API behind local-only verification** + - start `islandflow-api.service` + - confirm `curl http://127.0.0.1:4000/health` + - do not switch public routing yet +3. 
**Stage 3: deliberate public edge cutover** + - update proxy routing to native `web`/`api` + - export `DEPLOY_NATIVE_EDGE_READY=1` + - run full native deploy + - validate `bun run scripts/check-public-api-routes.ts https://flow.deltaisland.io` +4. **Stage 4: decide final default runtime** + - keep Docker as fallback until native edge has proven stable + +## Recommended current commands + +Fast backend iteration before edge cutover: + +```bash +export DEPLOY_NATIVE_SYSTEMCTL_PREFIX="systemctl --user" +./deploy main --runtime native --fast +``` + +Supported production path today: ```bash ./deploy main --runtime docker diff --git a/deployment/native/check-native-health.sh b/deployment/native/check-native-health.sh new file mode 100755 index 0000000..1d070e5 --- /dev/null +++ b/deployment/native/check-native-health.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +set -euo pipefail + +scope="${1:-full}" +units=() + +case "$scope" in + full) + units=(islandflow-web.service islandflow-api.service islandflow-compute.service islandflow-candles.service islandflow-ingest-options.service islandflow-ingest-equities.service) + ;; + web) + units=(islandflow-web.service) + ;; + api) + units=(islandflow-api.service) + ;; + services) + units=(islandflow-api.service islandflow-compute.service islandflow-candles.service islandflow-ingest-options.service islandflow-ingest-equities.service) + ;; + workers) + units=(islandflow-compute.service islandflow-candles.service islandflow-ingest-options.service islandflow-ingest-equities.service) + ;; + *) + echo "Unknown scope: $scope" >&2 + echo "Expected one of: full, web, api, services, workers" >&2 + exit 1 + ;; +esac + +for unit in "${units[@]}"; do + systemctl --user is-active --quiet "$unit" + echo "ok $unit" +done + +if [[ " ${units[*]} " == *" islandflow-api.service "* ]]; then + curl -fksS http://127.0.0.1:4000/health >/dev/null + echo "ok api-health" +fi + +if [[ " ${units[*]} " == *" islandflow-web.service "* ]]; then + curl -I -fksS 
http://127.0.0.1:3000/ >/dev/null + echo "ok web-health" +fi diff --git a/deployment/native/install-user-units.sh b/deployment/native/install-user-units.sh new file mode 100755 index 0000000..350cab1 --- /dev/null +++ b/deployment/native/install-user-units.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +set -euo pipefail + +scope="${1:-none}" +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +unit_source_dir="$repo_root/deployment/native/systemd/user" +unit_target_dir="${XDG_CONFIG_HOME:-$HOME/.config}/systemd/user" +units=() + +case "$scope" in + none) + ;; + full) + units=(islandflow-web.service islandflow-api.service islandflow-compute.service islandflow-candles.service islandflow-ingest-options.service islandflow-ingest-equities.service) + ;; + web) + units=(islandflow-web.service) + ;; + api) + units=(islandflow-api.service) + ;; + services) + units=(islandflow-api.service islandflow-compute.service islandflow-candles.service islandflow-ingest-options.service islandflow-ingest-equities.service) + ;; + workers) + units=(islandflow-compute.service islandflow-candles.service islandflow-ingest-options.service islandflow-ingest-equities.service) + ;; + *) + echo "Unknown scope: $scope" >&2 + echo "Expected one of: none, full, web, api, services, workers" >&2 + exit 1 + ;; +esac + +mkdir -p "$unit_target_dir" +cp "$unit_source_dir"/*.service "$unit_target_dir"/ + +systemctl --user daemon-reload + +if [[ ${#units[@]} -gt 0 ]]; then + systemctl --user enable "${units[@]}" +fi + +echo "Installed Islandflow user units into $unit_target_dir" +if [[ ${#units[@]} -gt 0 ]]; then + echo "Enabled scope: $scope" +else + echo "No units enabled yet. Pass a scope such as workers when you are ready." 
+fi \ No newline at end of file diff --git a/deployment/native/rollback.sh b/deployment/native/rollback.sh new file mode 100755 index 0000000..fb472d9 --- /dev/null +++ b/deployment/native/rollback.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ $# -lt 1 || $# -gt 2 ]]; then + echo "Usage: deployment/native/rollback.sh <ref> [full|web|api|services|workers]" >&2 + exit 1 +fi + +ref="$1" +scope="${2:-services}" +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" + +cd "$repo_root" + +if [[ -n "$(git status --porcelain=v1)" ]]; then + echo "Refusing rollback with a dirty working tree." >&2 + exit 1 +fi + +current_ref="$(git rev-parse --short HEAD)" +echo "Rolling back from $current_ref to $ref (scope: $scope)" + +git fetch --all --prune +git switch --detach "$ref" +bun install --frozen-lockfile + +if [[ "$scope" == "full" || "$scope" == "web" ]]; then + bun --cwd=apps/web run build +fi + +case "$scope" in + full) + units=(islandflow-web.service islandflow-api.service islandflow-compute.service islandflow-candles.service islandflow-ingest-options.service islandflow-ingest-equities.service) + ;; + web) + units=(islandflow-web.service) + ;; + api) + units=(islandflow-api.service) + ;; + services) + units=(islandflow-api.service islandflow-compute.service islandflow-candles.service islandflow-ingest-options.service islandflow-ingest-equities.service) + ;; + workers) + units=(islandflow-compute.service islandflow-candles.service islandflow-ingest-options.service islandflow-ingest-equities.service) + ;; + *) + echo "Unknown scope: $scope" >&2 + exit 1 + ;; +esac + +systemctl --user restart "${units[@]}" +"$repo_root/deployment/native/check-native-health.sh" "$scope" + +echo "Rollback complete. Repo is now detached at $(git rev-parse --short HEAD)."
+echo "Return to tracked main later with: git switch main && git pull --ff-only <remote> main" diff --git a/deployment/native/systemd/user/islandflow-api.service b/deployment/native/systemd/user/islandflow-api.service new file mode 100644 index 0000000..5a74500 --- /dev/null +++ b/deployment/native/systemd/user/islandflow-api.service @@ -0,0 +1,17 @@ +[Unit] +Description=Islandflow API +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +WorkingDirectory=/home/delta/islandflow +EnvironmentFile=/home/delta/islandflow/.env +ExecStart=/home/delta/.bun/bin/bun services/api/src/index.ts +Restart=always +RestartSec=2 +KillSignal=SIGINT +TimeoutStopSec=20 + +[Install] +WantedBy=default.target diff --git a/deployment/native/systemd/user/islandflow-candles.service b/deployment/native/systemd/user/islandflow-candles.service new file mode 100644 index 0000000..585b37c --- /dev/null +++ b/deployment/native/systemd/user/islandflow-candles.service @@ -0,0 +1,17 @@ +[Unit] +Description=Islandflow candles +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +WorkingDirectory=/home/delta/islandflow +EnvironmentFile=/home/delta/islandflow/.env +ExecStart=/home/delta/.bun/bin/bun services/candles/src/index.ts +Restart=always +RestartSec=2 +KillSignal=SIGINT +TimeoutStopSec=20 + +[Install] +WantedBy=default.target diff --git a/deployment/native/systemd/user/islandflow-compute.service b/deployment/native/systemd/user/islandflow-compute.service new file mode 100644 index 0000000..603f252 --- /dev/null +++ b/deployment/native/systemd/user/islandflow-compute.service @@ -0,0 +1,17 @@ +[Unit] +Description=Islandflow compute +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +WorkingDirectory=/home/delta/islandflow +EnvironmentFile=/home/delta/islandflow/.env +ExecStart=/home/delta/.bun/bin/bun services/compute/src/index.ts +Restart=always +RestartSec=2 +KillSignal=SIGINT +TimeoutStopSec=20 + +[Install]
+WantedBy=default.target diff --git a/deployment/native/systemd/user/islandflow-ingest-equities.service b/deployment/native/systemd/user/islandflow-ingest-equities.service new file mode 100644 index 0000000..837a04f --- /dev/null +++ b/deployment/native/systemd/user/islandflow-ingest-equities.service @@ -0,0 +1,17 @@ +[Unit] +Description=Islandflow ingest-equities +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +WorkingDirectory=/home/delta/islandflow +EnvironmentFile=/home/delta/islandflow/.env +ExecStart=/home/delta/.bun/bin/bun services/ingest-equities/src/index.ts +Restart=always +RestartSec=2 +KillSignal=SIGINT +TimeoutStopSec=20 + +[Install] +WantedBy=default.target diff --git a/deployment/native/systemd/user/islandflow-ingest-options.service b/deployment/native/systemd/user/islandflow-ingest-options.service new file mode 100644 index 0000000..eac0a6c --- /dev/null +++ b/deployment/native/systemd/user/islandflow-ingest-options.service @@ -0,0 +1,17 @@ +[Unit] +Description=Islandflow ingest-options +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +WorkingDirectory=/home/delta/islandflow +EnvironmentFile=/home/delta/islandflow/.env +ExecStart=/home/delta/.bun/bin/bun services/ingest-options/src/index.ts +Restart=always +RestartSec=2 +KillSignal=SIGINT +TimeoutStopSec=20 + +[Install] +WantedBy=default.target diff --git a/deployment/native/systemd/user/islandflow-web.service b/deployment/native/systemd/user/islandflow-web.service new file mode 100644 index 0000000..6e79177 --- /dev/null +++ b/deployment/native/systemd/user/islandflow-web.service @@ -0,0 +1,17 @@ +[Unit] +Description=Islandflow web +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +WorkingDirectory=/home/delta/islandflow +EnvironmentFile=/home/delta/islandflow/.env +ExecStart=/home/delta/.bun/bin/bun --cwd apps/web run start +Restart=always +RestartSec=2 +KillSignal=SIGINT +TimeoutStopSec=20 + 
+[Install] +WantedBy=default.target diff --git a/docs/plans/2026-05-18-native-fast-iterative-deploy-plan.html b/docs/plans/2026-05-18-native-fast-iterative-deploy-plan.html new file mode 100644 index 0000000..98fff10 --- /dev/null +++ b/docs/plans/2026-05-18-native-fast-iterative-deploy-plan.html @@ -0,0 +1,93 @@ + + + + + + Plan: Native Fast Iterative Deployment + + + +

Plan: Native, Fast, Iterative Deployment (Docker Optional)

+

Date: 2026-05-18

+ +
+

Plan Summary

+

Define and execute a fast iteration deployment path centered on host-native services, while preserving Docker as a fallback/runtime option.

+
+ +
+

Goals

+
    +
  • Reduce deploy turnaround time immediately.
  • +
  • Identify concrete bottlenecks with timing evidence.
  • +
  • Stabilize proxy/runtime topology for reliable production rollouts.
  • +
  • Support both native and Docker strategies with explicit guardrails.
  • +
+
+ +
+

Proposed Changes

+
    +
  • Use scoped fast deploys short-term.
  • +
  • Audit and remediate server-state blockers (duplicate compose/project drift).
  • +
  • Prepare native runtime prerequisites and checked-in operational assets.
  • +
  • Add deployment strategy prechecks, validation matrix, and staged cutover.
  • +
+
+ +
+

Relevant Context

+
    +
  • Open issue islandflow-2db: stale duplicate compose stack cleanup.
  • +
  • Open issue islandflow-sz8: public /replay/options proxy regression.
  • +
  • Open issue islandflow-38p: native unit templates and rollback helpers.
  • +
+
+ +
+

Implementation Steps

+
    +
  1. Stop the bleeding immediately (current deploy loop).
  2. +
  3. Get hard timing data per deploy phase.
  4. +
  5. Live server state audit (when plan mode is off).
  6. +
  7. Resolve duplicate compose stack first (islandflow-2db).
  8. +
  9. Fix NPM proxy route regression (islandflow-sz8).
  10. +
  11. Define target iterative deployment model.
  12. +
  13. Prepare native runtime prerequisites on VPS.
  14. +
  15. Checked-in native ops assets (islandflow-38p).
  16. +
  17. Switch proxy topology for native mode carefully.
  18. +
  19. Deploy strategy guardrails.
  20. +
  21. Validation matrix.
  22. +
  23. Staged cutover plan.
  24. +
  25. Decision: final default runtime.
  26. +
  27. Decision: optimization priority.
  28. +
  29. Decision: immediate live audit kickoff.
  30. +
+
+ +
+

Risks, Limitations, and Mitigations

+
    +
  • Risk: native runtime not yet production-hardened. Mitigation: keep Docker fallback and explicit gating.
  • +
  • Risk: proxy misrouting breaks API routes. Mitigation: route checks and post-change smoke validation.
  • +
  • Risk: operational drift on VPS. Mitigation: preflight audits and documented rollback steps.
  • +
+
+ +
+

Open Questions

+
    +
  • Should native become the default runtime now, or after hardening milestones?
  • +
  • Should backend iteration speed be prioritized ahead of web deploy speed?
  • +
  • Do we start immediate live server audit as soon as plan mode is disabled?
  • +
+
+ + diff --git a/docs/turns/2026-05-18-native-fast-iterative-deploy.html b/docs/turns/2026-05-18-native-fast-iterative-deploy.html new file mode 100644 index 0000000..45cba6c --- /dev/null +++ b/docs/turns/2026-05-18-native-fast-iterative-deploy.html @@ -0,0 +1,153 @@ + + + + + + 2026-05-18: Native fast iterative deploy + + + +
+
Turn document · 2026-05-18 03:29 EDT · Issues: islandflow-9rc, islandflow-38p, islandflow-bsg, islandflow-2db
+

Native fast iterative deploy

+

Implemented the native-first iterative deploy plan by adding deploy timing output, a safe worker-only native fast path, checked-in systemd user units and rollback helpers, server-local deploy execution, and updated live-operational documentation based on a fresh VPS audit.

+ +
+

Summary

+

The deploy flow now supports a safer native worker iteration model without requiring public edge cutover first. It can run directly from the VPS checkout without SSH, emits phase timings, includes checked-in native unit files plus install/rollback/smoke-test helpers, and documents the staged cutover path. During live audit, the previously reported /replay/options proxy issue and duplicate islandflow compose stack were both confirmed resolved on the host.

+
+ +
+

Changes Made

+
    +
  • Extended scripts/deploy.ts with deploy timing summaries for precheck, rollout, and verification phases.
  • +
  • Added --workers-only deploy scope for Docker and native runtimes.
  • +
  • Changed native --fast behavior so default full-scope fast deploys become worker-only instead of touching web/API.
  • +
  • Added native edge guardrails via DEPLOY_NATIVE_EDGE_READY=1 before web/API native deploys are allowed.
  • +
  • Added local-server execution mode so ./deploy can run from /home/delta/islandflow without SSHing back into the same host.
  • +
  • Added DEPLOY_SSH_KEY_PATH and DEPLOY_FORCE_SSH overrides for operators with non-default SSH setups.
  • +
  • Checked in native ops assets under deployment/native/:
  • +
  • install-user-units.sh, check-native-health.sh, rollback.sh
  • +
  • six user unit files in deployment/native/systemd/user/
  • +
  • Updated README.md, deployment/docker/README.md, and deployment/native/README.md to document the worker-first model, local execution mode, validation matrix, and staged cutover guidance.
  • +
  • Synced deployment/docker/workspace-root/package.json so Docker workspace validation passes again.
  • +
  • Installed the checked-in user unit files onto the live VPS in disabled form under ~/.config/systemd/user.
  • +
+
+ +
+

Context

+

The plan targeted faster deployment iteration while avoiding a premature move of the public edge away from the current Docker + Nginx Proxy Manager topology. The practical target was to make native runtime useful immediately for backend-worker iteration, while leaving web/API cutover deliberate and reversible.

+
+ +
+

Important Implementation Details

+
    +
  • Native fast mode now defaults to --workers-only; Docker fast mode still defaults to --services-only.
  • +
  • Native deploys that include public web/API scope now fail fast unless DEPLOY_NATIVE_EDGE_READY=1 is set.
  • +
  • Running from the live VPS checkout automatically switches deploy execution from SSH mode to local mode.
  • +
  • The checked-in native unit files are user units aimed at the current VPS layout: /home/delta/islandflow and /home/delta/.bun/bin/bun.
  • +
  • install-user-units.sh now installs units safely without enabling anything by default; enabling is explicit and scope-based.
  • +
  • rollback.sh intentionally uses a detached git ref to make one-off native rollback practical without rewriting branch history.
  • +
+
export DEPLOY_NATIVE_SYSTEMCTL_PREFIX="systemctl --user"
+./deploy main --runtime native --fast
+# resolves to worker-only native deploy before public edge cutover
+
+ +
+

Expected Impact for End-Users

+

End-users should see indirect benefits first: faster backend iteration, safer operational changes, and clearer rollback paths. Public traffic behavior should remain unchanged until a deliberate native edge cutover is performed.

+
+ +
+

Validation

+
    +
  • Passed: bun run scripts/check-public-api-routes.ts https://flow.deltaisland.io
  • +
  • Passed: direct public /replay/options curl returned JSON
  • +
  • Passed: live Nginx Proxy Manager config contains /replay in the API route matcher
  • +
  • Passed: docker compose ls shows no duplicate islandflow project
  • +
  • Passed: bash -n deployment/native/install-user-units.sh deployment/native/check-native-health.sh deployment/native/rollback.sh
  • +
  • Passed: systemd-analyze verify deployment/native/systemd/user/*.service
  • +
  • Passed: bun run check:docker-workspace after syncing workspace snapshot
  • +
  • Passed: native edge guard refusal for bun run scripts/deploy.ts main --runtime native --web-only --no-build
  • +
  • Passed: ./deployment/native/install-user-units.sh followed by systemctl --user list-unit-files 'islandflow*'
  • +
+
+ +
+

Issues, Limitations, and Mitigations

+
    +
  • Native units were installed but not enabled or started. This is intentional to avoid conflicting with the current Docker production edge.
  • +
  • Public web/API native deploys are still gated. Mitigation: explicit DEPLOY_NATIVE_EDGE_READY=1 acknowledgment and staged cutover documentation.
  • +
  • Native worker runtime has not yet been exercised live against the existing Docker worker stack. Mitigation: follow-up issue to soak worker-only native units before any default-runtime decision.
  • +
  • The known untracked Signal CLI tarball remains in the repo checkout. This is already tolerated by the deploy helper allowlist and was not changed here.
  • +
+
+ +
+

Follow-up Work

+
    +
  • Open follow-up: islandflow-vvw — stage native public-edge cutover after worker soak.
  • +
  • Decide whether native should ever replace Docker as the default runtime only after worker soak data and deliberate edge cutover validation.
  • +
+
+
+ + diff --git a/plans/2026-05-18-native-fast-iterative-deploy-plan.md b/plans/2026-05-18-native-fast-iterative-deploy-plan.md new file mode 100644 index 0000000..0e09102 --- /dev/null +++ b/plans/2026-05-18-native-fast-iterative-deploy-plan.md @@ -0,0 +1,21 @@ +# Native, Fast, Iterative Deployment Plan (Docker Optional) + +Date: 2026-05-18 + +## Plan Steps (15) + +1. ☐ Stop the bleeding immediately (current deploy loop). +2. ☐ Get hard timing data per deploy phase. +3. ☐ Live server state audit (when plan mode is off). +4. ☐ Resolve duplicate compose stack first (islandflow-2db). +5. ☐ Fix NPM proxy route regression (islandflow-sz8). +6. ☐ Define target iterative deployment model. +7. ☐ Prepare native runtime prerequisites on VPS. +8. ☐ Checked-in native ops assets (islandflow-38p). +9. ☐ Switch proxy topology for native mode carefully. +10. ☐ Deploy strategy guardrails. +11. ☐ Validation matrix. +12. ☐ Staged cutover plan. +13. ☐ Decision: final default runtime. +14. ☐ Decision: optimization priority. +15. ☐ Decision: immediate live audit kickoff. diff --git a/scripts/deploy.ts b/scripts/deploy.ts index 68d260a..043122e 100644 --- a/scripts/deploy.ts +++ b/scripts/deploy.ts @@ -7,7 +7,7 @@ import { fileURLToPath } from "node:url"; type DeployMode = "main" | "current-branch"; type DeployRuntime = "docker" | "native"; -type DeployScope = "full" | "web" | "api" | "services"; +type DeployScope = "full" | "web" | "api" | "services" | "workers"; type DeployOptions = { mode: DeployMode; @@ -18,10 +18,18 @@ type DeployOptions = { noBuild: boolean; }; +type PhaseTiming = { + name: string; + durationMs: number; +}; + const REMOTE_HOST = "delta@152.53.80.229"; const REMOTE_REPO = "/home/delta/islandflow"; const REMOTE_DOCKER_DEPLOYMENT = "/home/delta/islandflow/deployment/docker"; -const SSH_KEY = path.join(process.env.HOME ?? "", ".ssh", "delta_ed25519"); +const SSH_KEY = + process.env.DEPLOY_SSH_KEY_PATH?.trim() || + path.join(process.env.HOME ?? 
"", ".ssh", "delta_ed25519"); +const DEPLOY_FORCE_SSH = process.env.DEPLOY_FORCE_SSH?.trim() === "1"; const SSH_OPTIONS = [ "-i", SSH_KEY, @@ -38,6 +46,7 @@ const PUBLIC_APP_URL = const PUBLIC_API_HEALTH_URL = process.env.DEPLOY_PUBLIC_API_HEALTH_URL?.trim() || null; const DEPLOY_GIT_REMOTE_OVERRIDE = process.env.DEPLOY_GIT_REMOTE?.trim() || null; +const DEPLOY_NATIVE_EDGE_READY = process.env.DEPLOY_NATIVE_EDGE_READY?.trim() === "1"; const NATIVE_SYSTEMCTL_PREFIX = process.env.DEPLOY_NATIVE_SYSTEMCTL_PREFIX?.trim() || "sudo -n systemctl"; const NATIVE_UNITS = { @@ -65,15 +74,22 @@ const DOCKER_BACKEND_SERVICES = [ "ingest-options", "ingest-equities" ] as const; +const DOCKER_WORKER_SERVICES = [ + "compute", + "candles", + "ingest-options", + "ingest-equities" +] as const; const scriptPath = fileURLToPath(import.meta.url); const repoRoot = path.resolve(path.dirname(scriptPath), ".."); +const isLocalServerExecution = !DEPLOY_FORCE_SSH && repoRoot === REMOTE_REPO; function usage(exitCode = 1): never { console.error(`Usage: - ./deploy main [--runtime docker|native] [--web-only|--api-only|--services-only] [--fast] [--no-build] [--force-recreate] - ./deploy current-branch [--runtime docker|native] [--web-only|--api-only|--services-only] [--fast] [--no-build] [--force-recreate] - ./deploy current branch [--runtime docker|native] [--web-only|--api-only|--services-only] [--fast] [--no-build] [--force-recreate] + ./deploy main [--runtime docker|native] [--web-only|--api-only|--services-only|--workers-only] [--fast] [--no-build] [--force-recreate] + ./deploy current-branch [--runtime docker|native] [--web-only|--api-only|--services-only|--workers-only] [--fast] [--no-build] [--force-recreate] + ./deploy current branch [--runtime docker|native] [--web-only|--api-only|--services-only|--workers-only] [--fast] [--no-build] [--force-recreate] Modes: main Deploy /main to the live server checkout. @@ -88,18 +104,22 @@ Scopes: --web-only Deploy only the Next.js web surface. 
--api-only Deploy only the API service. --services-only Deploy API + backend services without the web service. + --workers-only Deploy compute/candles/ingest workers without touching web or API. Options: --runtime Explicit runtime selector (docker or native). - --fast Prefer a quicker rollout profile (defaults full scope to --services-only and skips public API route suite). + --fast Prefer a quicker rollout profile (defaults full scope to --services-only for docker and --workers-only for native, and skips the public API route suite when API scope is included). --no-build Skip docker image builds or native bun install/web build steps. --force-recreate Docker-only escalation path for docker compose when a normal refresh is not enough. --help Show this help text. Environment: DEPLOY_GIT_REMOTE Override git remote used for deploy fetch/pull/push (auto-detected by default). + DEPLOY_SSH_KEY_PATH Override the SSH key used for remote execution. + DEPLOY_FORCE_SSH Set to 1 to force SSH even when running from the live server checkout. DEPLOY_PUBLIC_APP_URL Override the public app URL (default: https://flow.deltaisland.io). DEPLOY_PUBLIC_API_HEALTH_URL Optional separate public API health URL for two-origin deployments. + DEPLOY_NATIVE_EDGE_READY Set to 1 to allow native rollouts that include the public web or API edge. DEPLOY_NATIVE_SYSTEMCTL_PREFIX Override systemctl invocation for native rollouts (default: sudo -n systemctl). DEPLOY_NATIVE_WEB_UNIT Override native web systemd unit name. DEPLOY_NATIVE_API_UNIT Override native api systemd unit name. 
@@ -114,6 +134,32 @@ function section(title: string): void { console.log(`\n== ${title} ==`); } +function formatDuration(durationMs: number): string { + if (durationMs < 1000) { + return `${durationMs}ms`; + } + + return `${(durationMs / 1000).toFixed(2)}s`; +} + +function timedPhase(timings: PhaseTiming[], name: string, fn: () => T): T { + const startedAt = Date.now(); + try { + return fn(); + } finally { + timings.push({ name, durationMs: Date.now() - startedAt }); + } +} + +function printTimingSummary(timings: PhaseTiming[]): void { + section("Deploy Timings"); + const totalMs = timings.reduce((sum, timing) => sum + timing.durationMs, 0); + for (const timing of timings) { + console.log(`[deploy] ${timing.name}: ${formatDuration(timing.durationMs)}`); + } + console.log(`[deploy] total: ${formatDuration(totalMs)}`); +} + function formatCommand(command: string, args: string[]): string { return [command, ...args] .map((part) => (/\s/.test(part) ? JSON.stringify(part) : part)) @@ -180,6 +226,23 @@ function runRemoteScript( args: string[] = [] ): void { section(title); + + if (isLocalServerExecution) { + const localArgs = ["-s", "--", ...args]; + console.log(`$ ${formatCommand("bash", localArgs)} # local server execution`); + const result = spawnSync("bash", localArgs, { + cwd: repoRoot, + input: script, + encoding: "utf8", + stdio: ["pipe", "inherit", "inherit"] + }); + + if (result.status !== 0) { + process.exit(result.status ?? 1); + } + return; + } + const sshArgs = [...SSH_OPTIONS, REMOTE_HOST, "bash", "-s", "--", ...args]; console.log(`$ ${formatCommand("ssh", sshArgs)}`); const result = spawnSync("ssh", sshArgs, { @@ -221,11 +284,14 @@ function parseScope(rawArgs: string[]): DeployScope { const scopes = [ rawArgs.includes("--web-only") ? "web" : null, rawArgs.includes("--api-only") ? "api" : null, - rawArgs.includes("--services-only") ? "services" : null + rawArgs.includes("--services-only") ? "services" : null, + rawArgs.includes("--workers-only") ? 
"workers" : null ].filter((value): value is Exclude => value !== null); if (scopes.length > 1) { - console.error("Choose only one deploy scope flag: --web-only, --api-only, or --services-only."); + console.error( + "Choose only one deploy scope flag: --web-only, --api-only, --services-only, or --workers-only." + ); process.exit(1); } @@ -250,6 +316,7 @@ function parseArgs(rawArgs: string[]): DeployOptions { arg !== "--web-only" && arg !== "--api-only" && arg !== "--services-only" && + arg !== "--workers-only" && arg !== "--runtime" && rawArgs[index - 1] !== "--runtime" && !arg.startsWith("--runtime=") @@ -282,8 +349,13 @@ function parseArgs(rawArgs: string[]): DeployOptions { } function assertSshKeyExists(): void { + if (isLocalServerExecution) { + return; + } + if (!existsSync(SSH_KEY)) { console.error(`Missing SSH key: ${SSH_KEY}`); + console.error("Set DEPLOY_SSH_KEY_PATH or run from the live server checkout without DEPLOY_FORCE_SSH."); process.exit(1); } } @@ -398,14 +470,16 @@ function describeScope(scope: DeployScope): string { return "api only"; case "services": return "api + backend services"; + case "workers": + return "worker services only"; default: return "full stack"; } } -function effectiveScope(scope: DeployScope, fast: boolean): DeployScope { +function effectiveScope(scope: DeployScope, runtime: DeployRuntime, fast: boolean): DeployScope { if (fast && scope === "full") { - return "services"; + return runtime === "native" ? 
"workers" : "services"; } return scope; } @@ -418,6 +492,10 @@ function scopeIncludesApi(scope: DeployScope): boolean { return scope === "full" || scope === "api" || scope === "services"; } +function scopeTouchesPublicEdge(scope: DeployScope): boolean { + return scopeIncludesWeb(scope) || scopeIncludesApi(scope); +} + function dockerServicesForScope(scope: DeployScope): string[] { switch (scope) { case "web": @@ -426,6 +504,8 @@ function dockerServicesForScope(scope: DeployScope): string[] { return ["api"]; case "services": return [...DOCKER_BACKEND_SERVICES]; + case "workers": + return [...DOCKER_WORKER_SERVICES]; default: return []; } @@ -448,6 +528,8 @@ function dockerLogServicesForScope(scope: DeployScope): string[] { return ["api"]; case "services": return [...DOCKER_BACKEND_SERVICES]; + case "workers": + return [...DOCKER_WORKER_SERVICES]; default: return [...DOCKER_CORE_SERVICES]; } @@ -467,6 +549,13 @@ function nativeUnitsForScope(scope: DeployScope): string[] { NATIVE_UNITS.ingestOptions, NATIVE_UNITS.ingestEquities ]; + case "workers": + return [ + NATIVE_UNITS.compute, + NATIVE_UNITS.candles, + NATIVE_UNITS.ingestOptions, + NATIVE_UNITS.ingestEquities + ]; default: return [ NATIVE_UNITS.web, @@ -494,19 +583,46 @@ function localDockerWorkspaceSnapshotPrecheck(): void { } } -function localRuntimePrecheck(runtime: DeployRuntime, noBuild: boolean): void { +function assertNativeEdgeReady(scope: DeployScope): void { + if (!scopeTouchesPublicEdge(scope) || DEPLOY_NATIVE_EDGE_READY) { + return; + } + + console.error( + "Refusing native deploy that touches public web/API scope before edge cutover is acknowledged." + ); + console.error( + "Set DEPLOY_NATIVE_EDGE_READY=1 only after proxy routing and native units for the public edge are intentionally prepared." + ); + console.error( + "For fast iterative backend deploys before cutover, use --runtime native --workers-only or --runtime native --fast." 
+ ); + process.exit(1); +} + +function localRuntimePrecheck(runtime: DeployRuntime, scope: DeployScope, noBuild: boolean): void { if (runtime === "docker" && !noBuild) { localDockerWorkspaceSnapshotPrecheck(); + return; + } + + if (runtime === "native") { + assertNativeEdgeReady(scope); } } -function localMainPrecheck(remote: string, runtime: DeployRuntime, noBuild: boolean): void { +function localMainPrecheck( + remote: string, + runtime: DeployRuntime, + scope: DeployScope, + noBuild: boolean +): void { section("Local Precheck"); runChecked("git", ["fetch", remote]); runChecked("git", ["status", "--short", "--branch"]); runChecked("git", ["rev-parse", "--verify", "HEAD"]); runChecked("git", ["rev-parse", `${remote}/main`]); - localRuntimePrecheck(runtime, noBuild); + localRuntimePrecheck(runtime, scope, noBuild); } function currentBranchName(): string { @@ -522,6 +638,7 @@ function localBranchPrecheck( remote: string, branch: string, runtime: DeployRuntime, + scope: DeployScope, noBuild: boolean ): void { section("Local Precheck"); @@ -537,7 +654,7 @@ function localBranchPrecheck( process.exit(1); } - localRuntimePrecheck(runtime, noBuild); + localRuntimePrecheck(runtime, scope, noBuild); } function publishCurrentBranch(remote: string, branch: string): void { @@ -861,7 +978,8 @@ function publicVerification(scope: DeployScope, fast: boolean): void { function main(): void { const options = parseArgs(process.argv.slice(2)); - const scope = effectiveScope(options.scope, options.fast); + const scope = effectiveScope(options.scope, options.runtime, options.fast); + const timings: PhaseTiming[] = []; const currentBranch = options.mode === "current-branch" ? currentBranchName() : null; const deployRemote = resolveDeployRemote(options.mode, currentBranch); assertSshKeyExists(); @@ -872,22 +990,33 @@ function main(): void { `via ${describeRuntime(options.runtime)} (${describeScope(scope)}${options.fast ? 
", fast mode" : ""}).` ); console.log(`[deploy] Using git remote: ${deployRemote}`); + console.log( + `[deploy] Execution mode: ${isLocalServerExecution ? "local server checkout" : `ssh to ${REMOTE_HOST}`}` + ); if (options.fast && options.scope === "full") { - console.log("[deploy] Fast mode changed default full scope to --services-only."); + console.log( + `[deploy] Fast mode changed default full scope to ${options.runtime === "native" ? "--workers-only" : "--services-only"}.` + ); } if (options.mode === "main") { - localMainPrecheck(deployRemote, options.runtime, options.noBuild); - remoteGitPrecheck(); - remoteRuntimePrecheck(options.runtime, scope); - remoteRollout( - options.mode, - deployRemote, - options.runtime, - null, - scope, - options.forceRecreate, - options.noBuild + timedPhase(timings, "local precheck", () => + localMainPrecheck(deployRemote, options.runtime, scope, options.noBuild) + ); + timedPhase(timings, "remote git precheck", () => remoteGitPrecheck()); + timedPhase(timings, "remote runtime precheck", () => + remoteRuntimePrecheck(options.runtime, scope) + ); + timedPhase(timings, "remote rollout", () => + remoteRollout( + options.mode, + deployRemote, + options.runtime, + null, + scope, + options.forceRecreate, + options.noBuild + ) ); } else { const branch = currentBranch; @@ -895,23 +1024,34 @@ function main(): void { console.error("Unable to resolve current branch for current-branch deploy mode."); process.exit(1); } - localBranchPrecheck(deployRemote, branch, options.runtime, options.noBuild); - publishCurrentBranch(deployRemote, branch); - remoteGitPrecheck(); - remoteRuntimePrecheck(options.runtime, scope); - remoteRollout( - options.mode, - deployRemote, - options.runtime, - branch, - scope, - options.forceRecreate, - options.noBuild + timedPhase(timings, "local precheck", () => + localBranchPrecheck(deployRemote, branch, options.runtime, scope, options.noBuild) + ); + timedPhase(timings, "local publish", () => 
publishCurrentBranch(deployRemote, branch)); + timedPhase(timings, "remote git precheck", () => remoteGitPrecheck()); + timedPhase(timings, "remote runtime precheck", () => + remoteRuntimePrecheck(options.runtime, scope) + ); + timedPhase(timings, "remote rollout", () => + remoteRollout( + options.mode, + deployRemote, + options.runtime, + branch, + scope, + options.forceRecreate, + options.noBuild + ) ); } - remoteVerification(options.runtime, scope, options.fast); - publicVerification(scope, options.fast); + timedPhase(timings, "remote verification", () => + remoteVerification(options.runtime, scope, options.fast) + ); + timedPhase(timings, "public verification", () => + publicVerification(scope, options.fast) + ); + printTimingSummary(timings); } main(); From bdb9d9a95a10e2df736d412a0083004e69752eee Mon Sep 17 00:00:00 2001 From: dirtydishes Date: Mon, 18 May 2026 19:55:27 -0400 Subject: [PATCH 2/2] Implement native public edge cutover --- .beads/issues.jsonl | 5 +- apps/web/package.json | 2 +- deployment/docker/.dockerignore | 23 + deployment/docker/.env.example | 2 + deployment/docker/Dockerfile.web | 2 +- deployment/docker/README.md | 11 +- deployment/docker/docker-compose.yml | 18 +- deployment/native/README.md | 61 +- deployment/native/bootstrap-infra.sh | 24 + deployment/native/check-native-health.sh | 7 + deployment/native/check-native-infra.sh | 24 + .../native/config/clickhouse-listen.xml | 6 + deployment/native/config/redis.conf | 10 + deployment/native/cutover.sh | 34 ++ deployment/native/full-rollback.sh | 27 + deployment/native/install-infra-units.sh | 72 +++ deployment/native/start-infra.sh | 17 + deployment/native/stop-infra.sh | 9 + deployment/native/switch-npm-edge.sh | 285 ++++++++++ .../system/islandflow-clickhouse.service | 17 + .../systemd/system/islandflow-nats.service | 18 + .../systemd/system/islandflow-redis.service | 18 + .../systemd/user/islandflow-api.service | 2 + .../user/islandflow-ingest-options.service | 1 + 
.../systemd/user/islandflow-web.service | 4 +- ...2026-05-18-native-public-edge-cutover.html | 521 ++++++++++++++++++ packages/bus/src/jetstream.ts | 14 +- scripts/deploy.ts | 8 +- services/api/src/index.ts | 4 +- 29 files changed, 1215 insertions(+), 31 deletions(-) create mode 100644 deployment/docker/.dockerignore create mode 100755 deployment/native/bootstrap-infra.sh create mode 100755 deployment/native/check-native-infra.sh create mode 100644 deployment/native/config/clickhouse-listen.xml create mode 100644 deployment/native/config/redis.conf create mode 100755 deployment/native/cutover.sh create mode 100755 deployment/native/full-rollback.sh create mode 100755 deployment/native/install-infra-units.sh create mode 100755 deployment/native/start-infra.sh create mode 100755 deployment/native/stop-infra.sh create mode 100755 deployment/native/switch-npm-edge.sh create mode 100644 deployment/native/systemd/system/islandflow-clickhouse.service create mode 100644 deployment/native/systemd/system/islandflow-nats.service create mode 100644 deployment/native/systemd/system/islandflow-redis.service create mode 100644 docs/turns/2026-05-18-native-public-edge-cutover.html diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index 16eabf1..00b065c 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -1,4 +1,4 @@ -{"_type":"issue","id":"islandflow-9rc","title":"Implement native fast iterative deploy plan","description":"Implement the checked-in plan at plans/2026-05-18-native-fast-iterative-deploy-plan.md. Cover deploy-phase timing instrumentation, native deployment operational assets, deploy guardrails, validation/cutover documentation, and any required live VPS remediation that is safely actionable from this session. 
Track follow-up items separately if anything cannot be completed in-repo or on the live host.","status":"in_progress","priority":1,"issue_type":"feature","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-18T07:15:19Z","created_by":"dirtydishes","updated_at":"2026-05-18T07:15:25Z","started_at":"2026-05-18T07:15:25Z","dependency_count":0,"dependent_count":0,"comment_count":0} +{"_type":"issue","id":"islandflow-9rc","title":"Implement native fast iterative deploy plan","description":"Implement the checked-in plan at plans/2026-05-18-native-fast-iterative-deploy-plan.md. Cover deploy-phase timing instrumentation, native deployment operational assets, deploy guardrails, validation/cutover documentation, and any required live VPS remediation that is safely actionable from this session. Track follow-up items separately if anything cannot be completed in-repo or on the live host.","status":"closed","priority":1,"issue_type":"feature","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-18T07:15:19Z","created_by":"dirtydishes","updated_at":"2026-05-18T07:34:03Z","started_at":"2026-05-18T07:15:25Z","closed_at":"2026-05-18T07:34:03Z","close_reason":"Implemented the native fast iterative deploy plan with deploy timing summaries, worker-only native fast mode, edge-cutover guardrails, local-on-server execution support, checked-in native ops assets, live audit findings, and turn documentation. 
Remaining cutover work is tracked in islandflow-vvw.","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-8kj","title":"Configure persistent beads Dolt remote on deltaisland server","description":"Install the beads and Dolt CLIs on the server, configure a persistent Dolt sync remote backed by the server-hosted Forgejo repository, verify refs/dolt/data publication, and document Nginx Proxy Manager / firewall considerations.","status":"closed","priority":1,"issue_type":"task","assignee":"delta","created_at":"2026-05-17T10:31:31Z","created_by":"delta","updated_at":"2026-05-17T10:37:47Z","started_at":"2026-05-17T10:32:16Z","closed_at":"2026-05-17T10:37:47Z","close_reason":"Installed bd and dolt on the server, configured the Forgejo-backed Dolt remote, published refs/dolt/data, and documented the setup.","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-200","title":"Implement durable options tape history","description":"Implement the plan from docs/plans/2026-05-16-1711-durable-options-tape-history.html: durable ClickHouse-backed options history, signal/all prints view selection, preserved execution context, stale semantics limited to live health, reset runbook, tests, and turn documentation.","status":"closed","priority":1,"issue_type":"feature","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-16T21:21:30Z","created_by":"dirtydishes","updated_at":"2026-05-16T21:26:51Z","started_at":"2026-05-16T21:21:33Z","closed_at":"2026-05-16T21:26:51Z","close_reason":"Implemented durable options tape history, signal/raw view selection, reset runbook, tests, and turn documentation.","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-k4f","title":"Gate deploy script on docker workspace snapshot sync","description":"Prevent frozen-lockfile build failures during deploy by adding a local preflight in scripts/deploy.ts that runs bun run 
check:docker-workspace and aborts with a clear sync+commit remediation message when stale.","status":"closed","priority":1,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-15T23:01:44Z","created_by":"dirtydishes","updated_at":"2026-05-15T23:04:11Z","started_at":"2026-05-15T23:01:48Z","closed_at":"2026-05-15T23:04:11Z","close_reason":"Closed","dependency_count":0,"dependent_count":0,"comment_count":0} @@ -13,7 +13,8 @@ {"_type":"issue","id":"islandflow-ayo","title":"Drop stale backlog events from live fanout","description":"Follow-up to live freshness rollout: /ws/live was still fanning out stale backlog events for freshness-gated channels, which kept tape panes in Live feed behind despite active synthetic ingest. Gate fanout and cache ingest by freshness for options/nbbo/equities/flow.","status":"closed","priority":1,"issue_type":"bug","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-04-28T21:26:39Z","created_by":"dirtydishes","updated_at":"2026-04-28T21:26:44Z","started_at":"2026-04-28T21:26:44Z","closed_at":"2026-04-28T21:26:44Z","close_reason":"Completed","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-0v6","title":"Fix tape freshness, NBBO coverage, pause controls, and filter popup","description":"Implement the tape fixes requested for synthetic options notional sizing, strict live freshness, live-mode pause/resume behavior, stronger NBBO snapshot coverage, and moving flow filters behind a popup. 
Includes server-side live cache changes, web terminal state/UI changes, and tests for synthetic pricing, live snapshot freshness/NBBO retention, and live pause/filter interactions.","status":"closed","priority":1,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-04-28T21:02:52Z","created_by":"dirtydishes","updated_at":"2026-04-28T21:13:38Z","started_at":"2026-04-28T21:02:57Z","closed_at":"2026-04-28T21:13:38Z","close_reason":"Completed","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-e4r","title":"Implement smart-money flow filtering and synthetic firehose modes","description":"Implement the approved multi-surface plan for named synthetic market profiles, options raw-vs-signal filtering, live/API filter contracts, Tape page client-side flow filters, firehose-readiness improvements, tests, and README updates.","status":"closed","priority":1,"issue_type":"feature","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-04-28T20:10:49Z","created_by":"dirtydishes","updated_at":"2026-04-28T20:29:29Z","started_at":"2026-04-28T20:10:53Z","closed_at":"2026-04-28T20:29:29Z","close_reason":"Implemented synthetic market profiles, options signal-path filtering, signal-aware API/replay contracts, Tape page filters, tests, and README updates. 
Follow-up tracked in islandflow-biq.","dependency_count":0,"dependent_count":0,"comment_count":0} -{"_type":"issue","id":"islandflow-vvw","title":"Stage native public-edge cutover after worker soak","description":"Why this issue exists and what needs to be done:\\n- The native deploy path is now provisioned for worker-first iteration, with checked-in user units, rollback helpers, and edge guardrails\\n- Remaining work is to enable and soak native worker units, validate duplicate-processing behavior, then deliberately cut over the public web/api edge if warranted\\n- Final acceptance should include deciding whether Docker or native becomes the default runtime after operational evidence","status":"open","priority":2,"issue_type":"task","owner":"dishes@dpdrm.com","created_at":"2026-05-18T07:32:35Z","created_by":"dirtydishes","updated_at":"2026-05-18T07:32:35Z","dependency_count":0,"dependent_count":0,"comment_count":0} +{"_type":"issue","id":"islandflow-fl5","title":"Decide final public posture for api.flow.deltaisland.io after native cutover","description":"Why this issue exists and what needs to be done:\\n- Native cutover now works end-to-end through Nginx Proxy Manager and the public API hostname now resolves directly to the VPS\\n- The API hostname was left DNS-only in Cloudflare during incident resolution, while the web hostname still uses the Cloudflare proxy\\n- We need to decide whether api.flow.deltaisland.io should remain direct-to-origin or be re-proxied through Cloudflare, then validate TLS, websocket, and operational behavior for the chosen 
posture","status":"open","priority":2,"issue_type":"task","owner":"dishes@dpdrm.com","created_at":"2026-05-18T23:51:21Z","created_by":"dirtydishes","updated_at":"2026-05-18T23:51:21Z","dependencies":[{"issue_id":"islandflow-fl5","depends_on_id":"islandflow-vvw","type":"discovered-from","created_at":"2026-05-18T19:52:32Z","created_by":"dirtydishes","metadata":"{}"}],"dependency_count":0,"dependent_count":0,"comment_count":0} +{"_type":"issue","id":"islandflow-vvw","title":"Stage native public-edge cutover after worker soak","description":"Why this issue exists and what needs to be done:\\n- The native deploy path is now provisioned for worker-first iteration, with checked-in user units, rollback helpers, and edge guardrails\\n- Remaining work is to enable and soak native worker units, validate duplicate-processing behavior, then deliberately cut over the public web/api edge if warranted\\n- Final acceptance should include deciding whether Docker or native becomes the default runtime after operational evidence","notes":"2026-05-18: native infra, native app services, NPM public-edge retargeting, Docker rollback helpers, and Cloudflare/DNS API hostname recovery were implemented and verified. Public checks now pass for flow.deltaisland.io and api.flow.deltaisland.io. Remaining follow-up: decide whether api.flow.deltaisland.io should remain DNS-only or be re-proxied through Cloudflare under islandflow-fl5.","status":"in_progress","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-18T07:32:35Z","created_by":"dirtydishes","updated_at":"2026-05-18T23:52:32Z","started_at":"2026-05-18T23:51:20Z","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-bsg","title":"Fix public /replay/options proxy regression","description":"Restore correct public routing for GET /replay/options on flow.deltaisland.io. 
The app currently serves HTML for that API path, which indicates edge/proxy routing drift. Update the live proxy topology or deployment assets as needed, then validate with bun run scripts/check-public-api-routes.ts.","status":"closed","priority":2,"issue_type":"bug","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-18T07:15:19Z","created_by":"dirtydishes","updated_at":"2026-05-18T07:32:51Z","started_at":"2026-05-18T07:15:24Z","closed_at":"2026-05-18T07:32:51Z","close_reason":"Audited the live VPS and reverse proxy on 2026-05-18: public /replay/options now returns JSON, bun run scripts/check-public-api-routes.ts passes, and the active Nginx Proxy Manager config includes /replay in the API route matcher. No in-repo app code change was required.","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-9j5","title":"Prepare PR for deploy allowlist cleanup","description":"Why this issue exists and what needs to be done:\\n- Package current deploy allowlist cleanup into a reviewable PR with multiple commits\\n- Add required turn documentation in docs/turns\\n- Run validation and push all artifacts","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-17T15:44:12Z","created_by":"dirtydishes","updated_at":"2026-05-17T15:53:55Z","started_at":"2026-05-17T15:44:22Z","closed_at":"2026-05-17T15:53:55Z","close_reason":"Packaged deploy allowlist cleanup into multi-commit PR branch with required turn documentation and push workflow.","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-0sa","title":"Fix live tape auto-hold, history seam, and remove manual pause control","description":"The live tape should automatically hold when the user scrolls away from the top, resume when they return to the top or use Jump to top, and keep older prints available seamlessly beyond the hot window. 
Manual Pause/Resume control is now redundant and should be removed from live tape panes. This work should also fix the current regression where paused/held tapes still mutate, and align the options tape with a strict 100-row hot head backed by ClickHouse history.","notes":"Implemented live scroll-hold with no live pause button, demand-loaded ClickHouse history, a 100-row options hot head, and cache-first scoped snapshots. Validated with bun test apps/web/app/terminal.test.ts services/api/tests/live.test.ts and bun --cwd=apps/web run build.","status":"closed","priority":2,"issue_type":"bug","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-16T18:12:51Z","created_by":"dirtydishes","updated_at":"2026-05-16T18:23:43Z","started_at":"2026-05-16T18:12:54Z","closed_at":"2026-05-16T18:23:43Z","close_reason":"Closed","dependency_count":0,"dependent_count":0,"comment_count":0} diff --git a/apps/web/package.json b/apps/web/package.json index 8ab6906..91611ea 100644 --- a/apps/web/package.json +++ b/apps/web/package.json @@ -5,7 +5,7 @@ "scripts": { "dev": "bun run scripts/dev.ts", "build": "next build", - "start": "next start -p 3000" + "start": "next start" }, "dependencies": { "@islandflow/types": "workspace:*", diff --git a/deployment/docker/.dockerignore b/deployment/docker/.dockerignore new file mode 100644 index 0000000..8fd5de7 --- /dev/null +++ b/deployment/docker/.dockerignore @@ -0,0 +1,23 @@ +.git +.github +.DS_Store +.bun +.tmp +node_modules +dist +coverage +logs +apps/web/.next +.env +.env.* +session-ses_*.md +token-usage-output.txt +signal-cli-*.tar.gz +*.tar +*.tar.gz +*.tgz +*.zip +__pycache__ +.pytest_cache +!.env.example +!**/.env.example diff --git a/deployment/docker/.env.example b/deployment/docker/.env.example index eee9cef..1a3eb84 100644 --- a/deployment/docker/.env.example +++ b/deployment/docker/.env.example @@ -4,8 +4,10 @@ NATS_URL=nats://nats:4222 CLICKHOUSE_URL=http://clickhouse:8123 CLICKHOUSE_DATABASE=default 
REDIS_URL=redis://redis:6379 +ISLANDFLOW_DATA_ROOT=/var/lib/islandflow API_PORT=4000 +API_HOST=0.0.0.0 API_BIND_IP=127.0.0.1 API_HOST_PORT=4000 WEB_BIND_IP=127.0.0.1 diff --git a/deployment/docker/Dockerfile.web b/deployment/docker/Dockerfile.web index 33723ae..efd186b 100644 --- a/deployment/docker/Dockerfile.web +++ b/deployment/docker/Dockerfile.web @@ -59,4 +59,4 @@ COPY --from=build /app/packages ./packages EXPOSE 3000 -CMD ["bun", "run", "--cwd", "apps/web", "start"] +CMD ["bun", "run", "--cwd", "apps/web", "start", "--", "-H", "0.0.0.0", "-p", "3000"] diff --git a/deployment/docker/README.md b/deployment/docker/README.md index ed80c53..9b36220 100644 --- a/deployment/docker/README.md +++ b/deployment/docker/README.md @@ -2,12 +2,12 @@ This directory contains the Docker runtime for Islandflow VPS deployments. -Docker remains the default and recommended server rollout path, but the repo-root `deploy` helper can now target either: +Docker remains the default rollout path before native cutover and the rollback path after cutover. The repo-root `deploy` helper can target either: - `--runtime docker` for this Docker Compose stack -- `--runtime native` for an experimental host-native Bun + systemd rollout described in `deployment/native/README.md` +- `--runtime native` for the host-native Bun + systemd rollout described in `deployment/native/README.md` -The repo no longer ships or supports a separate `deployment/npm` stack. If you want a reverse proxy, point it at the host ports published by this stack. +The public VPS edge remains Nginx Proxy Manager. Docker fallback can be reached either through the shared Docker network service names or the host ports published by this stack. It is separate from the repo-root `docker-compose.yml`, which remains the lightweight local infra stack for development. @@ -17,7 +17,7 @@ Do not run the repo-root `docker-compose.yml` on the VPS. On the live server tha - Builds and runs the full Islandflow stack with Docker Compose. 
- Publishes `web` and `api` to host ports, bound to loopback by default. -- Runs ClickHouse, Redis, and NATS JetStream with persistent Docker volumes. +- Runs ClickHouse, Redis, and NATS JetStream with persistent host data under `ISLANDFLOW_DATA_ROOT`. - Runs the core runtime services: `ingest-options`, `ingest-equities`, `compute`, `candles`, `api`, and `web`. - Keeps `replay` opt-in through a Compose profile, because the current replay service starts immediately when the container is enabled. @@ -56,6 +56,7 @@ cp .env.example .env Important defaults: - `NATS_URL`, `CLICKHOUSE_URL`, and `REDIS_URL` should stay on the internal container hostnames unless you intentionally split infra out. +- `ISLANDFLOW_DATA_ROOT=/var/lib/islandflow` matches the native infra data root used by the VPS cutover helpers. - `OPTIONS_INGEST_ADAPTER=synthetic` and `EQUITIES_INGEST_ADAPTER=synthetic` are the safest first-boot settings. - `WEB_BIND_IP=127.0.0.1` and `API_BIND_IP=127.0.0.1` keep the published ports local to the host by default. - `WEB_HOST_PORT=3000` and `API_HOST_PORT=4000` control the host-side published ports. @@ -213,7 +214,7 @@ BuildKit cache mounts require a modern Docker Engine with Dockerfile frontend su ## Safe rollouts on `152.53.80.229` -The current live VPS uses Nginx Proxy Manager on the shared Docker network and routes public traffic to the Docker `web` and `api` containers by container name. Because of that, this Docker path remains the operationally correct default for the live server today. +The current live VPS uses Nginx Proxy Manager as the outer edge. Before native cutover, NPM routes Islandflow traffic to Docker service names. During cutover, `deployment/native/switch-npm-edge.sh native` retargets only the Islandflow proxy hosts to the NPM bridge gateway IP so NPM can reach native host ports. If needed, override the detected target with `ISLANDFLOW_NATIVE_HOST=`. 
The deploy helper also warns if it detects a second compose project named `islandflow` on the server, because that usually means the repo-root local-infra stack was started on the VPS by mistake. diff --git a/deployment/docker/docker-compose.yml b/deployment/docker/docker-compose.yml index 96598ba..1fbf251 100644 --- a/deployment/docker/docker-compose.yml +++ b/deployment/docker/docker-compose.yml @@ -42,6 +42,8 @@ services: init: true expose: - "3000" + ports: + - "${WEB_BIND_IP:-127.0.0.1}:${WEB_HOST_PORT:-3000}:3000" networks: - default - shared @@ -64,8 +66,13 @@ services: api: <<: *service-common command: ["services/api/src/index.ts"] + environment: + LOG_LEVEL: ${LOG_LEVEL:-warn} + API_HOST: 0.0.0.0 expose: - "4000" + ports: + - "${API_BIND_IP:-127.0.0.1}:${API_HOST_PORT:-4000}:4000" networks: - default - shared @@ -128,7 +135,7 @@ services: soft: 262144 hard: 262144 volumes: - - clickhouse-data:/var/lib/clickhouse + - ${ISLANDFLOW_DATA_ROOT:-/var/lib/islandflow}/clickhouse:/var/lib/clickhouse - ./clickhouse/listen.xml:/etc/clickhouse-server/config.d/listen.xml:ro healthcheck: test: @@ -146,7 +153,7 @@ services: restart: unless-stopped command: ["redis-server", "--appendonly", "yes"] volumes: - - redis-data:/data + - ${ISLANDFLOW_DATA_ROOT:-/var/lib/islandflow}/redis:/data healthcheck: test: [ @@ -164,14 +171,9 @@ services: restart: unless-stopped command: ["-js", "-sd", "/data"] volumes: - - nats-data:/data + - ${ISLANDFLOW_DATA_ROOT:-/var/lib/islandflow}/nats:/data networks: shared: external: true name: ${NPM_SHARED_NETWORK:-npm-shared} - -volumes: - clickhouse-data: - redis-data: - nats-data: diff --git a/deployment/native/README.md b/deployment/native/README.md index 4e2dd52..c421c51 100644 --- a/deployment/native/README.md +++ b/deployment/native/README.md @@ -9,12 +9,14 @@ This directory documents the host-native Islandflow rollout path used by: ## Current operating model -Native runtime is now intended for **fast iterative backend deploys first**, 
while Docker remains the supported public production edge until a deliberate cutover is completed. +Native runtime is now intended for a phased VPS cutover. Docker remains the supported rollback runtime, but Docker and native app services must not own the same Islandflow scope at the same time because the workers and API use durable JetStream consumers. Today, the recommended split is: -- **Docker runtime** for the live public `web` + `api` path -- **Native runtime** for worker-only iteration (`compute`, `candles`, `ingest-options`, `ingest-equities`) +- **Nginx Proxy Manager** remains the public `:80/:443` edge +- **Native system services** own NATS, Redis, and ClickHouse after infra cutover +- **Native user services** own `web`, `api`, and workers after app cutover +- **Docker Compose** remains available as the rollback runtime - local development stays: - Docker infra: `bun run dev:infra` - native backend services: `bun run dev:services` @@ -47,6 +49,38 @@ That means native worker deploy support is now provisioned on the host, but nati ## Checked-in native ops assets +### Infra system units + +Checked-in system service units and config live under: + +- `deployment/native/systemd/system/islandflow-nats.service` +- `deployment/native/systemd/system/islandflow-redis.service` +- `deployment/native/systemd/system/islandflow-clickhouse.service` +- `deployment/native/config/redis.conf` +- `deployment/native/config/clickhouse-listen.xml` + +Install and start them on the VPS with: + +```bash +./deployment/native/bootstrap-infra.sh +``` + +Or install and start manually: + +```bash +sudo ./deployment/native/install-infra-units.sh +sudo ./deployment/native/start-infra.sh +./deployment/native/check-native-infra.sh +``` + +The native infra services bind to loopback and use stable host data paths: + +- NATS JetStream: `/var/lib/islandflow/nats` +- Redis: `/var/lib/islandflow/redis` +- ClickHouse: `/var/lib/islandflow/clickhouse` + +The Docker fallback compose file uses the 
same `ISLANDFLOW_DATA_ROOT` default of `/var/lib/islandflow`, so rollback can preserve durable state when only one runtime is active. + ### User unit templates Checked-in unit files live under: @@ -89,10 +123,29 @@ Install script behavior: This validates: +- native infra health for `full`, `api`, `services`, and `workers` - `systemctl --user is-active` for the selected units - local API health at `http://127.0.0.1:4000/health` when API scope is included - local web health at `http://127.0.0.1:3000/` when web scope is included +### App cutover and edge switch helpers + +```bash +./deployment/native/cutover.sh full +./deployment/native/switch-npm-edge.sh native +./deployment/native/full-rollback.sh +``` + +The edge switch helper updates the Nginx Proxy Manager database entries for `flow.deltaisland.io` and `api.flow.deltaisland.io`, preserving the same-origin Islandflow API location matcher: + +```nginx +^/(ws|replay|prints|joins|nbbo|dark|flow|candles|history)/ +``` + +For native cutover, the helper targets the NPM bridge gateway IP by default, not `host.docker.internal`. NPM generates `proxy_pass` with a runtime-resolved `$server` variable, so Docker's `/etc/hosts` alias is not sufficient for these proxy hosts. On the current VPS that native target resolves to `172.18.0.1`, which reaches the host-native `3000` and `4000` listeners from the NPM container. + +Switching back to Docker restores upstreams to the Compose service names `web:3000` and `api:4000`. + ### Rollback helper ```bash @@ -184,7 +237,7 @@ Without that variable, these commands are refused: - `./deploy main --runtime native --api-only` - `./deploy main --runtime native --services-only` -This keeps the native path focused on safe worker iteration until proxy routing and public unit ownership are switched deliberately. +This keeps native app ownership explicit until infra, app health, and proxy routing are switched deliberately. 
## Running deploy from the VPS itself diff --git a/deployment/native/bootstrap-infra.sh b/deployment/native/bootstrap-infra.sh new file mode 100755 index 0000000..dfc3422 --- /dev/null +++ b/deployment/native/bootstrap-infra.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" + +if [[ "${EUID}" -eq 0 ]]; then + "$repo_root/deployment/native/install-infra-units.sh" +else + sudo "$repo_root/deployment/native/install-infra-units.sh" +fi + +echo "Stopping Docker Islandflow services before native infra opens durable data." +( + cd "$repo_root/deployment/docker" + docker compose stop web api compute candles ingest-options ingest-equities nats redis clickhouse +) + +if [[ "${EUID}" -eq 0 ]]; then + "$repo_root/deployment/native/start-infra.sh" +else + sudo "$repo_root/deployment/native/start-infra.sh" +fi + +"$repo_root/deployment/native/check-native-infra.sh" diff --git a/deployment/native/check-native-health.sh b/deployment/native/check-native-health.sh index 1d070e5..13582bc 100755 --- a/deployment/native/check-native-health.sh +++ b/deployment/native/check-native-health.sh @@ -2,6 +2,7 @@ set -euo pipefail scope="${1:-full}" +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" units=() case "$scope" in @@ -27,6 +28,12 @@ case "$scope" in ;; esac +case "$scope" in + full|api|services|workers) + "$repo_root/deployment/native/check-native-infra.sh" + ;; +esac + for unit in "${units[@]}"; do systemctl --user is-active --quiet "$unit" echo "ok $unit" diff --git a/deployment/native/check-native-infra.sh b/deployment/native/check-native-infra.sh new file mode 100755 index 0000000..bfdc998 --- /dev/null +++ b/deployment/native/check-native-infra.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +systemctl is-active --quiet islandflow-nats.service +echo "ok islandflow-nats.service" + +systemctl is-active --quiet islandflow-redis.service +echo "ok islandflow-redis.service" + +systemctl is-active --quiet islandflow-clickhouse.service +echo "ok islandflow-clickhouse.service" + +if command -v redis-cli >/dev/null 2>&1; then + redis-cli -h 127.0.0.1 -p 6379 ping | grep -q PONG +else + timeout 2 bash -c ' + 127.0.0.1 + /var/lib/islandflow/clickhouse/ + /var/lib/islandflow/clickhouse/tmp/ + /var/lib/islandflow/clickhouse/user_files/ + diff --git a/deployment/native/config/redis.conf b/deployment/native/config/redis.conf new file mode 100644 index 0000000..8a39ba6 --- /dev/null +++ b/deployment/native/config/redis.conf @@ -0,0 +1,10 @@ +bind 127.0.0.1 +protected-mode yes +port 6379 +dir /var/lib/islandflow/redis +appendonly yes +save 900 1 +save 300 10 +save 60 10000 +loglevel notice +databases 16 diff --git a/deployment/native/cutover.sh b/deployment/native/cutover.sh new file mode 100755 index 0000000..fcff377 --- /dev/null +++ b/deployment/native/cutover.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +set -euo pipefail + +scope="${1:-full}" +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)"
+
+# Reject unknown scopes before touching any running service.
+case "$scope" in
+  full|services|workers|api|web)
+    ;;
+  *)
+    echo "Usage: deployment/native/cutover.sh [full|services|workers|api|web]" >&2
+    exit 1
+    ;;
+esac
+
+echo "Stopping Docker-owned Islandflow app services before native ownership starts."
+(
+  cd "$repo_root/deployment/docker"
+  docker compose stop web api compute candles ingest-options ingest-equities
+)
+
+# Verify native infra before restarting any native unit. The previous condition
+# covered full/services/api/web but skipped "workers", so worker-only cutovers
+# restarted their units without this up-front check. Every accepted scope is
+# now checked.
+"$repo_root/deployment/native/check-native-infra.sh"
+
+# Select the user units owned by the requested scope. An array keeps each unit
+# name a separate argument without relying on word splitting of a command
+# substitution.
+units=()
+case "$scope" in
+  full)
+    units=(islandflow-web.service islandflow-api.service islandflow-compute.service islandflow-candles.service islandflow-ingest-options.service islandflow-ingest-equities.service)
+    ;;
+  services)
+    units=(islandflow-api.service islandflow-compute.service islandflow-candles.service islandflow-ingest-options.service islandflow-ingest-equities.service)
+    ;;
+  workers)
+    units=(islandflow-compute.service islandflow-candles.service islandflow-ingest-options.service islandflow-ingest-equities.service)
+    ;;
+  api)
+    units=(islandflow-api.service)
+    ;;
+  web)
+    units=(islandflow-web.service)
+    ;;
+esac
+
+systemctl --user restart "${units[@]}"
+
+# Final gate: unit state plus local health checks for the selected scope.
+"$repo_root/deployment/native/check-native-health.sh" "$scope"
diff --git a/deployment/native/full-rollback.sh b/deployment/native/full-rollback.sh
new file mode 100755
index 0000000..77a78af
--- /dev/null
+++ b/deployment/native/full-rollback.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+
+echo "Stopping native app services."
+systemctl --user stop islandflow-web.service islandflow-api.service islandflow-compute.service islandflow-candles.service islandflow-ingest-options.service islandflow-ingest-equities.service || true
+
+echo "Stopping native infra before Docker reopens durable data."
+if [[ "${EUID}" -eq 0 ]]; then + systemctl stop islandflow-nats.service islandflow-redis.service islandflow-clickhouse.service || true +else + sudo systemctl stop islandflow-nats.service islandflow-redis.service islandflow-clickhouse.service || true +fi + +echo "Switching NPM Islandflow upstreams back to Docker service names." +"$repo_root/deployment/native/switch-npm-edge.sh" docker + +echo "Restarting Docker Islandflow runtime." +( + cd "$repo_root/deployment/docker" + docker compose up -d web api compute candles ingest-options ingest-equities +) + +curl -I -fksS "${DEPLOY_PUBLIC_APP_URL:-https://flow.deltaisland.io}" >/dev/null +curl -fksS "${DEPLOY_PUBLIC_API_HEALTH_URL:-https://api.flow.deltaisland.io/health}" >/dev/null +echo "Rollback validation passed." diff --git a/deployment/native/install-infra-units.sh b/deployment/native/install-infra-units.sh new file mode 100755 index 0000000..2a9ab85 --- /dev/null +++ b/deployment/native/install-infra-units.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash +set -euo pipefail + +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +system_unit_source_dir="$repo_root/deployment/native/systemd/system" +config_source_dir="$repo_root/deployment/native/config" + +if [[ "${EUID}" -ne 0 ]]; then + echo "Run as root: sudo $0" >&2 + exit 1 +fi + +resolve_binary() { + local name="$1" + local path="" + + path="$(command -v "$name" 2>/dev/null || true)" + if [[ -n "$path" ]]; then + printf '%s\n' "$path" + return 0 + fi + + for candidate in "/usr/bin/$name" "/usr/sbin/$name" "/usr/local/bin/$name" "/usr/local/sbin/$name"; do + if [[ -x "$candidate" ]]; then + printf '%s\n' "$candidate" + return 0 + fi + done + + return 1 +} + +missing=() +for command in nats-server redis-server clickhouse-server; do + if ! 
resolve_binary "$command" >/dev/null; then + missing+=("$command") + fi +done + +if [[ ${#missing[@]} -gt 0 ]]; then + echo "Missing native infra binaries: ${missing[*]}" >&2 + echo "Install NATS Server, Redis Server, and ClickHouse Server before bootstrapping native infra." >&2 + echo "On Debian, Redis is usually available as redis-server; ClickHouse and NATS may require their vendor repositories or packaged binaries." >&2 + exit 1 +fi + +ensure_system_user() { + local name="$1" + local home="$2" + + getent group "$name" >/dev/null || groupadd --system "$name" + getent passwd "$name" >/dev/null || useradd --system --gid "$name" --home-dir "$home" --shell /usr/sbin/nologin "$name" +} + +ensure_system_user nats /var/lib/islandflow/nats +ensure_system_user redis /var/lib/islandflow/redis +ensure_system_user clickhouse /var/lib/islandflow/clickhouse + +install -d -m 0755 /etc/islandflow +install -m 0644 "$config_source_dir/redis.conf" /etc/islandflow/redis.conf +install -d -m 0755 /etc/clickhouse-server/config.d +install -m 0644 "$config_source_dir/clickhouse-listen.xml" /etc/clickhouse-server/config.d/islandflow-listen.xml + +install -d -o nats -g nats -m 0750 /var/lib/islandflow/nats +install -d -o redis -g redis -m 0750 /var/lib/islandflow/redis +install -d -o clickhouse -g clickhouse -m 0750 /var/lib/islandflow/clickhouse + +install -m 0644 "$system_unit_source_dir"/islandflow-*.service /etc/systemd/system/ +systemctl daemon-reload + +echo "Installed native infra system units and config." 
+echo "Start infra with: sudo deployment/native/start-infra.sh" diff --git a/deployment/native/start-infra.sh b/deployment/native/start-infra.sh new file mode 100755 index 0000000..8f78791 --- /dev/null +++ b/deployment/native/start-infra.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ "${EUID}" -ne 0 ]]; then + echo "Run as root: sudo $0" >&2 + exit 1 +fi + +for unit in redis-server.service nats-server.service clickhouse-server.service; do + if systemctl list-unit-files "$unit" >/dev/null 2>&1; then + systemctl disable --now "$unit" >/dev/null 2>&1 || true + fi +done + +systemctl reset-failed islandflow-nats.service islandflow-redis.service islandflow-clickhouse.service || true +systemctl enable --now islandflow-nats.service islandflow-redis.service islandflow-clickhouse.service +"$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/check-native-infra.sh" diff --git a/deployment/native/stop-infra.sh b/deployment/native/stop-infra.sh new file mode 100755 index 0000000..91a488d --- /dev/null +++ b/deployment/native/stop-infra.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [[ "${EUID}" -ne 0 ]]; then + echo "Run as root: sudo $0" >&2 + exit 1 +fi + +systemctl stop islandflow-nats.service islandflow-redis.service islandflow-clickhouse.service diff --git a/deployment/native/switch-npm-edge.sh b/deployment/native/switch-npm-edge.sh new file mode 100755 index 0000000..c9fcd93 --- /dev/null +++ b/deployment/native/switch-npm-edge.sh @@ -0,0 +1,285 @@ +#!/usr/bin/env bash +set -euo pipefail + +target="${1:-native}" +npm_root="${NPM_ROOT:-/home/delta/nginx-proxy-manager}" +db_path="${NPM_DB_PATH:-$npm_root/data/database.sqlite}" +app_domain="${ISLANDFLOW_APP_DOMAIN:-flow.deltaisland.io}" +api_domain="${ISLANDFLOW_API_DOMAIN:-api.flow.deltaisland.io}" +native_host="${ISLANDFLOW_NATIVE_HOST:-}" +docker_web_host="${ISLANDFLOW_DOCKER_WEB_HOST:-web}" +docker_api_host="${ISLANDFLOW_DOCKER_API_HOST:-api}" +web_port="${ISLANDFLOW_WEB_PORT:-3000}" 
+api_port="${ISLANDFLOW_API_PORT:-4000}" +restart_npm="${NPM_RESTART:-1}" +npm_container="${NPM_CONTAINER_NAME:-nginx-proxy-manager}" +sudo_cmd=() + +case "$target" in + native|docker) + ;; + *) + echo "Usage: deployment/native/switch-npm-edge.sh [native|docker]" >&2 + exit 1 + ;; +esac + +resolve_native_host() { + if [[ -n "$native_host" ]]; then + printf '%s\n' "$native_host" + return + fi + + if command -v docker >/dev/null 2>&1 && docker ps --format '{{.Names}}' | grep -qx "$npm_container"; then + native_host="$(docker inspect "$npm_container" --format '{{range .NetworkSettings.Networks}}{{println .Gateway}}{{end}}' | sed '/^$/d' | head -n1)" + if [[ -n "$native_host" ]]; then + printf '%s\n' "$native_host" + return + fi + fi + + echo "Unable to determine the native upstream host for NPM." >&2 + echo "Set ISLANDFLOW_NATIVE_HOST explicitly or start the $npm_container container first." >&2 + exit 1 +} + +if [[ "$target" == "native" ]]; then + native_host="$(resolve_native_host)" +fi + +if [[ ! -w "$db_path" || ! -w "$(dirname "$db_path")" ]]; then + if [[ "${EUID}" -eq 0 ]]; then + sudo_cmd=() + elif command -v sudo >/dev/null 2>&1; then + sudo_cmd=(sudo) + else + echo "NPM database path is not writable and sudo is unavailable: $db_path" >&2 + exit 1 + fi +fi + +if [[ ! 
-f "$db_path" ]]; then + echo "NPM database not found: $db_path" >&2 + exit 1 +fi + +backup="$db_path.before-islandflow-$target-$(date +%Y%m%d%H%M%S)" +"${sudo_cmd[@]}" cp "$db_path" "$backup" +echo "Backed up NPM database to $backup" + +"${sudo_cmd[@]}" python3 - "$db_path" "$target" "$app_domain" "$api_domain" "$native_host" "$docker_web_host" "$docker_api_host" "$web_port" "$api_port" <<'PY' +import json +import sqlite3 +import sys + +db_path, target, app_domain, api_domain, native_host, docker_web_host, docker_api_host, web_port, api_port = sys.argv[1:] +web_host = native_host if target == "native" else docker_web_host +api_host = native_host if target == "native" else docker_api_host + +advanced_config = f"""location ~ ^/(ws|replay|prints|joins|nbbo|dark|flow|candles|history)/ {{ + set $forward_scheme http; + set $server "{api_host}"; + set $port {api_port}; + + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $http_connection; + proxy_http_version 1.1; + + include conf.d/include/proxy.conf; +}}""" + +def has_domain(raw, domain): + try: + return domain in json.loads(raw) + except Exception: + return domain in raw + +con = sqlite3.connect(db_path) +cur = con.cursor() +rows = list(cur.execute("select id, domain_names from proxy_host where is_deleted = 0")) +app_ids = [row_id for row_id, domains in rows if has_domain(domains, app_domain)] +api_ids = [row_id for row_id, domains in rows if has_domain(domains, api_domain)] + +if len(app_ids) != 1 or len(api_ids) != 1: + raise SystemExit(f"Expected one app and one API proxy host, found app={app_ids} api={api_ids}") + +cur.execute( + "update proxy_host set forward_scheme = 'http', forward_host = ?, forward_port = ?, allow_websocket_upgrade = 1, advanced_config = ?, modified_on = datetime('now') where id = ?", + (web_host, int(web_port), advanced_config, app_ids[0]), +) +cur.execute( + "update proxy_host set forward_scheme = 'http', forward_host = ?, forward_port = ?, allow_websocket_upgrade = 1, 
modified_on = datetime('now') where id = ?",
+    (api_host, int(api_port), api_ids[0]),
+)
+con.commit()
+print(f"Updated {app_domain} -> {web_host}:{web_port}")
+print(f"Updated {api_domain} -> {api_host}:{api_port}")
+PY
+
+# Rewrite the generated per-host nginx configs under data/nginx/proxy_host so
+# their "set $server" / "set $port" lines match the database rows updated above.
+if command -v python3 >/dev/null 2>&1; then
+  "${sudo_cmd[@]}" python3 - "$npm_root" "$db_path" "$target" "$app_domain" "$api_domain" "$native_host" "$docker_web_host" "$docker_api_host" "$web_port" "$api_port" <<'PY'
+import json
+import re
+import sqlite3
+import sys
+from pathlib import Path
+
+(
+    npm_root,
+    db_path,
+    target,
+    app_domain,
+    api_domain,
+    native_host,
+    docker_web_host,
+    docker_api_host,
+    web_port,
+    api_port,
+) = sys.argv[1:]
+
+web_host = native_host if target == "native" else docker_web_host
+api_host = native_host if target == "native" else docker_api_host
+
+def has_domain(raw, domain):
+    # domain_names is parsed as JSON; fall back to a substring test if parsing fails.
+    try:
+        return domain in json.loads(raw)
+    except Exception:
+        return domain in raw
+
+def replace_nth(text, pattern, replacement, index):
+    # Replace only the index-th (1-based) match of pattern; abort if it is missing.
+    matches = list(pattern.finditer(text))
+    if len(matches) < index:
+        raise SystemExit(f"Unable to rewrite generated proxy config; expected match {index} for {pattern.pattern!r}")
+    match = matches[index - 1]
+    return text[:match.start()] + replacement(match) + text[match.end():]
+
+# The named group "prefix" keeps the original indentation and directive text;
+# the replacement callbacks read it back via m.group("prefix").
+server_pattern = re.compile(r'^(?P<prefix>\s*set \$server\s+)".*?";\s*$', re.M)
+port_pattern = re.compile(r'^(?P<prefix>\s*set \$port\s+)\d+;\s*$', re.M)
+
+def replace_server(text, host, index):
+    return replace_nth(text, server_pattern, lambda m: f'{m.group("prefix")}"{host}";', index)
+
+def replace_port(text, port, index):
+    return replace_nth(text, port_pattern, lambda m: f'{m.group("prefix")}{port};', index)
+
+con = sqlite3.connect(db_path)
+rows = list(con.execute("select id, domain_names from proxy_host where is_deleted = 0"))
+app_ids = [row_id for row_id, domains in rows if has_domain(domains, app_domain)]
+api_ids = [row_id for row_id, domains in rows if has_domain(domains, api_domain)]
+if len(app_ids) != 1 or len(api_ids) != 1:
+    raise SystemExit(f"Expected one app and one API proxy host, found app={app_ids} api={api_ids}")
+
+api_conf = Path(npm_root) / "data/nginx/proxy_host" / f"{api_ids[0]}.conf"
+app_conf = Path(npm_root) / "data/nginx/proxy_host" / f"{app_ids[0]}.conf"
+
+if api_conf.exists():
+    text = api_conf.read_text()
+    text = replace_server(text, api_host, 1)
+    text = replace_port(text, int(api_port), 1)
+    api_conf.write_text(text)
+    print(f"Synchronized {api_conf.name} -> {api_host}:{api_port}")
+
+if app_conf.exists():
+    text = app_conf.read_text()
+    # First server/port pair is the web upstream; the second pair belongs to
+    # the API location matcher written into advanced_config.
+    text = replace_server(text, web_host, 1)
+    text = replace_port(text, int(web_port), 1)
+    text = replace_server(text, api_host, 2)
+    text = replace_port(text, int(api_port), 2)
+    app_conf.write_text(text)
+    print(f"Synchronized {app_conf.name} -> {web_host}:{web_port} and API matcher -> {api_host}:{api_port}")
+PY
+fi
+
+# Restart the NPM container unless NPM_RESTART=0. Use the configured container
+# name so an NPM_CONTAINER_NAME override applies here as it does to the other
+# docker calls in this script.
+if [[ "$restart_npm" == "0" ]]; then
+  echo "NPM container restart skipped because NPM_RESTART=0."
+elif command -v docker >/dev/null 2>&1 && docker ps --format '{{.Names}}' | grep -qx "$npm_container"; then
+  docker restart "$npm_container" >/dev/null
+  echo "Restarted $npm_container"
+else
+  echo "NPM container restart skipped; restart it manually if it is not managed by Docker on this host."
+fi
+
+# Apply the same generated-config rewrite again once the container is running.
+# NOTE(review): presumably this guards against NPM regenerating the files on
+# restart; confirm against NPM behaviour.
+if command -v docker >/dev/null 2>&1 && docker ps --format '{{.Names}}' | grep -qx "$npm_container"; then
+  "${sudo_cmd[@]}" python3 - "$npm_root" "$db_path" "$target" "$app_domain" "$api_domain" "$native_host" "$docker_web_host" "$docker_api_host" "$web_port" "$api_port" <<'PY'
+import json
+import re
+import sqlite3
+import sys
+from pathlib import Path
+
+(
+    npm_root,
+    db_path,
+    target,
+    app_domain,
+    api_domain,
+    native_host,
+    docker_web_host,
+    docker_api_host,
+    web_port,
+    api_port,
+) = sys.argv[1:]
+
+web_host = native_host if target == "native" else docker_web_host
+api_host = native_host if target == "native" else docker_api_host
+
+def has_domain(raw, domain):
+    # domain_names is parsed as JSON; fall back to a substring test if parsing fails.
+    try:
+        return domain in json.loads(raw)
+    except Exception:
+        return domain in raw
+
+def replace_nth(text, pattern, replacement, index):
+    # Replace only the index-th (1-based) match of pattern; abort if it is missing.
+    matches = list(pattern.finditer(text))
+    if len(matches) < index:
+        raise SystemExit(f"Unable to rewrite generated proxy config; expected match {index} for {pattern.pattern!r}")
+    match = matches[index - 1]
+    return text[:match.start()] + replacement(match) + text[match.end():]
+
+# The named group "prefix" keeps the original indentation and directive text;
+# the replacement callbacks read it back via m.group("prefix").
+server_pattern = re.compile(r'^(?P<prefix>\s*set \$server\s+)".*?";\s*$', re.M)
+port_pattern = re.compile(r'^(?P<prefix>\s*set \$port\s+)\d+;\s*$', re.M)
+
+def replace_server(text, host, index):
+    return replace_nth(text, server_pattern, lambda m: f'{m.group("prefix")}"{host}";', index)
+
+def replace_port(text, port, index):
+    return replace_nth(text, port_pattern, lambda m: f'{m.group("prefix")}{port};', index)
+
+con = sqlite3.connect(db_path)
+rows = list(con.execute("select id, domain_names from proxy_host where is_deleted = 0"))
+app_ids = [row_id for row_id, domains in rows if has_domain(domains, app_domain)]
+api_ids = [row_id for row_id, domains in rows if has_domain(domains, api_domain)]
+if len(app_ids) != 1 or len(api_ids) != 1:
+    raise SystemExit(f"Expected one app and one API proxy host, found app={app_ids} api={api_ids}")
+
+api_conf = Path(npm_root) /
"data/nginx/proxy_host" / f"{api_ids[0]}.conf" +app_conf = Path(npm_root) / "data/nginx/proxy_host" / f"{app_ids[0]}.conf" + +if api_conf.exists(): + text = api_conf.read_text() + text = replace_server(text, api_host, 1) + text = replace_port(text, int(api_port), 1) + api_conf.write_text(text) + +if app_conf.exists(): + text = app_conf.read_text() + text = replace_server(text, web_host, 1) + text = replace_port(text, int(web_port), 1) + text = replace_server(text, api_host, 2) + text = replace_port(text, int(api_port), 2) + app_conf.write_text(text) +PY + reloaded=0 + for _ in 1 2 3 4 5; do + if docker exec "$npm_container" nginx -s reload >/dev/null 2>&1; then + reloaded=1 + break + fi + sleep 1 + done + if [[ "$reloaded" == "1" ]]; then + echo "Reloaded nginx-proxy-manager" + else + echo "Warning: nginx-proxy-manager reload did not succeed after restart; verify the container is healthy." >&2 + fi +fi diff --git a/deployment/native/systemd/system/islandflow-clickhouse.service b/deployment/native/systemd/system/islandflow-clickhouse.service new file mode 100644 index 0000000..79f8ed2 --- /dev/null +++ b/deployment/native/systemd/system/islandflow-clickhouse.service @@ -0,0 +1,17 @@ +[Unit] +Description=Islandflow ClickHouse +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +ExecStart=/usr/bin/env clickhouse-server --config-file=/etc/clickhouse-server/config.xml +Restart=always +RestartSec=5 +User=clickhouse +Group=clickhouse +StateDirectory=clickhouse +LimitNOFILE=262144 + +[Install] +WantedBy=multi-user.target diff --git a/deployment/native/systemd/system/islandflow-nats.service b/deployment/native/systemd/system/islandflow-nats.service new file mode 100644 index 0000000..a23eefc --- /dev/null +++ b/deployment/native/systemd/system/islandflow-nats.service @@ -0,0 +1,18 @@ +[Unit] +Description=Islandflow NATS JetStream +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple 
+ExecStart=/usr/sbin/nats-server -js -sd /var/lib/islandflow/nats -a 127.0.0.1 -p 4222 -m 8222 +Restart=always +RestartSec=2 +User=nats +Group=nats +RuntimeDirectory=islandflow-nats +StateDirectory=islandflow/nats +LimitNOFILE=1048576 + +[Install] +WantedBy=multi-user.target diff --git a/deployment/native/systemd/system/islandflow-redis.service b/deployment/native/systemd/system/islandflow-redis.service new file mode 100644 index 0000000..3e63d74 --- /dev/null +++ b/deployment/native/systemd/system/islandflow-redis.service @@ -0,0 +1,18 @@ +[Unit] +Description=Islandflow Redis +After=network-online.target +Wants=network-online.target + +[Service] +Type=notify +ExecStart=/usr/bin/env redis-server /etc/islandflow/redis.conf --supervised systemd --daemonize no +Restart=always +RestartSec=2 +User=redis +Group=redis +RuntimeDirectory=islandflow-redis +StateDirectory=islandflow/redis +LimitNOFILE=65535 + +[Install] +WantedBy=multi-user.target diff --git a/deployment/native/systemd/user/islandflow-api.service b/deployment/native/systemd/user/islandflow-api.service index 5a74500..1e6cc99 100644 --- a/deployment/native/systemd/user/islandflow-api.service +++ b/deployment/native/systemd/user/islandflow-api.service @@ -6,6 +6,8 @@ Wants=network-online.target [Service] Type=simple WorkingDirectory=/home/delta/islandflow +Environment=API_HOST=0.0.0.0 +Environment=API_PORT=4000 EnvironmentFile=/home/delta/islandflow/.env ExecStart=/home/delta/.bun/bin/bun services/api/src/index.ts Restart=always diff --git a/deployment/native/systemd/user/islandflow-ingest-options.service b/deployment/native/systemd/user/islandflow-ingest-options.service index eac0a6c..10107b1 100644 --- a/deployment/native/systemd/user/islandflow-ingest-options.service +++ b/deployment/native/systemd/user/islandflow-ingest-options.service @@ -7,6 +7,7 @@ Wants=network-online.target Type=simple WorkingDirectory=/home/delta/islandflow EnvironmentFile=/home/delta/islandflow/.env 
+Environment=OPTIONS_INGEST_ADAPTER=synthetic ExecStart=/home/delta/.bun/bin/bun services/ingest-options/src/index.ts Restart=always RestartSec=2 diff --git a/deployment/native/systemd/user/islandflow-web.service b/deployment/native/systemd/user/islandflow-web.service index 6e79177..ce75e0b 100644 --- a/deployment/native/systemd/user/islandflow-web.service +++ b/deployment/native/systemd/user/islandflow-web.service @@ -6,8 +6,10 @@ Wants=network-online.target [Service] Type=simple WorkingDirectory=/home/delta/islandflow +Environment=WEB_HOST=0.0.0.0 +Environment=WEB_PORT=3000 EnvironmentFile=/home/delta/islandflow/.env -ExecStart=/home/delta/.bun/bin/bun --cwd apps/web run start +ExecStart=/bin/sh -lc 'cd /home/delta/islandflow/apps/web && exec /home/delta/.bun/bin/bun x next start -H "$WEB_HOST" -p "$WEB_PORT"' Restart=always RestartSec=2 KillSignal=SIGINT diff --git a/docs/turns/2026-05-18-native-public-edge-cutover.html b/docs/turns/2026-05-18-native-public-edge-cutover.html new file mode 100644 index 0000000..8d2d2b1 --- /dev/null +++ b/docs/turns/2026-05-18-native-public-edge-cutover.html @@ -0,0 +1,521 @@ + + + + + + Turn Document - Native Public Edge Cutover + + + +
+
+
Islandflow Turn Document
+

Native Public Edge Cutover

+

+ Completed the VPS native-first cutover for Islandflow infrastructure and app services while keeping Nginx + Proxy Manager as the outer edge and Docker as the rollback path. The final state now serves + flow.deltaisland.io and api.flow.deltaisland.io from the native web and API + processes, with verified public routing and a documented follow-up for the long-term API Cloudflare posture. +

+
+
+
Generated
+
2026-05-18 19:52 EDT
+
+
+
Primary Issue
+
islandflow-vvw
+
+
+
Follow-up
+
islandflow-fl5
+
+
+
Runtime State
+
Native active, Docker retained for rollback
+
+
+
+ +
+

Summary

+

+ The repository now contains the native infra units, native cutover scripts, Docker fallback adjustments, and + public-edge retargeting logic required to run Islandflow natively on the VPS. During validation, the live NPM + edge was switched from Docker container-name upstreams to native host ports, the host firewall was adjusted so + the NPM bridge could reach the native API, and the separate public API TLS problem was resolved by correcting + the Cloudflare DNS state for api.flow.deltaisland.io. +

+
+ +
+

Changes Made

+
    +
  • + Added checked-in native infra operations under deployment/native/, including + bootstrap-infra.sh, check-native-infra.sh, cutover.sh, + full-rollback.sh, start-infra.sh, and the native system units for NATS, Redis, + and ClickHouse. +
  • +
  • + Extended native app runtime units so the web and API bind on host-reachable interfaces, and forced the + native options ingest service to use the synthetic adapter during the cutover. +
  • +
  • + Updated services/api to support explicit host binding through API_HOST, and fixed + JetStream retention conversion in packages/bus so native services can start cleanly with the + configured max-age values. +
  • +
  • + Updated the Docker fallback assets to publish loopback web/API ports, share durable host data under + /var/lib/islandflow, and document the native-to-Docker rollback path. +
  • +
  • + Reworked deployment/native/switch-npm-edge.sh so it targets the NPM bridge gateway IP instead + of host.docker.internal, handles the root-owned NPM SQLite database, synchronizes generated + proxy_host configs, and reloads NPM deterministically after the edge switch. +
  • +
  • + Created Beads follow-up issue islandflow-fl5 for the remaining decision about whether + api.flow.deltaisland.io should remain DNS-only or be re-proxied through Cloudflare. +
  • +
+
+ +
+

Context

+

+ The migration started from a Docker-owned production baseline where NATS, Redis, ClickHouse, API, workers, and + web all ran in Compose, while NPM routed Islandflow traffic to Docker service names. That setup blocked a safe + native cutover for two reasons: the native services could not reach Docker-only infra reliably, and NPM could + not send public traffic to host-native processes without a deliberate upstream retarget. +

+

+ The runtime model for this work is exclusive ownership. Native and Docker are not allowed to run the same API + or worker scopes in parallel because JetStream durable consumers would conflict. The objective was therefore a + phased handoff, not a mixed soak for the same queues. +

+
+ +
+

Important Implementation Details

+
+
+

NPM edge targeting

+

+ NPM generates proxy_pass from a runtime-resolved $server variable, so the + Docker /etc/hosts alias for host.docker.internal was not sufficient. The switch + helper now detects the NPM bridge gateway and uses that IP for native upstreams. +

+
+
+

Firewall path

+

+ The host UFW policy already allowed port 3000 but not 4000. The live fix was a + source-scoped allow rule for the NPM bridge subnet so the containerized edge could reach the native API. +

+
+
+

Cloudflare API hostname

+

+ The API hostname failure was separate from the native cutover. The hostname is now a DNS-only + A record pointing at the VPS, which restored public TLS and health responses. +

+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Area | Implementation detail
Native API + services/api/src/index.ts now accepts API_HOST and passes it to + Bun.serve. The native unit sets API_HOST=0.0.0.0 and + API_PORT=4000. +
Native web + The native web unit now starts from apps/web with + bun x next start -H "$WEB_HOST" -p "$WEB_PORT", avoiding the earlier repo-root startup + failure and binding the service on 0.0.0.0:3000. +
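JetStream retention + Native startup exposed a retention-unit bug: stream max-age values are configured in milliseconds, but JetStream expects nanoseconds. The shared bus layer now converts the configured values with + nanos(...) and converts them back with millis(...) when formatting reports. +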
Docker fallback + Docker Compose now uses ISLANDFLOW_DATA_ROOT=/var/lib/islandflow, publishes loopback + ports, and keeps the fallback runtime compatible with the same durable data directories as the native + services. +
NPM switch helper + The helper now updates both the NPM database and the generated + /data/nginx/proxy_host/*.conf files, because a DB-only restart did not reliably rewrite the + live configs for Islandflow. +
+ +
sudo ufw allow proto tcp from 172.18.0.0/16 to any port 4000 comment 'npm bridge to native api'
+
+ +
+

Expected Impact for End-Users

+
    +
  • + Public web and API traffic now reaches the native Islandflow services, which removes Docker from the primary + live request path while keeping the outer edge unchanged. +
  • +
  • + Same-origin public API routes such as /prints, /history, /replay, + /nbbo, and /ws/live continue to resolve correctly through the main app hostname. +
  • +
  • + Rollback remains fast and explicit: NPM can be pointed back at Docker service names and the Docker runtime + can reclaim the same durable data directories if native operation needs to be abandoned. +
  • +
+
+ +
+

Validation

+
+
+
Static checks
+
    +
  • bun run check:docker-workspace
  • +
  • docker compose -f deployment/docker/docker-compose.yml config --quiet
  • +
  • docker compose -f /home/delta/nginx-proxy-manager/docker-compose.yml config --quiet
  • +
  • bash -n deployment/native/*.sh
  • +
  • systemd-analyze verify deployment/native/systemd/user/*.service deployment/native/systemd/system/*.service
  • +
  • bun build services/api/src/index.ts --target=bun
  • +
  • bun build scripts/deploy.ts --target=bun
  • +
+
+
+
Native runtime
+
    +
  • ./deployment/native/check-native-health.sh full
  • +
  • curl http://127.0.0.1:4000/health
  • +
  • curl -I http://127.0.0.1:3000/
  • +
+
+
+
Public edge
+
    +
  • curl -I -fksS https://flow.deltaisland.io
  • +
  • curl -fksS https://api.flow.deltaisland.io/health
  • +
  • bun run scripts/check-public-api-routes.ts https://flow.deltaisland.io
  • +
+
+
+
+ +
+

Issues, Limitations, and Mitigations

+
    +
  • + The native ingest-options service required an explicit synthetic-adapter override because the environment file + still pointed at an Alpaca adapter that was returning 401 responses. The service now starts + cleanly for native cutover, but production adapter selection remains an operational decision. +
  • +
  • + The NPM helper still relies on direct config synchronization because NPM did not reliably regenerate the + Islandflow proxy files from SQLite changes alone. This is mitigated by keeping the synchronization logic + checked in and by reloading NPM as part of the helper itself. +
  • +
  • + The final public API recovery currently leaves api.flow.deltaisland.io as a DNS-only hostname. + That change restored service, but it changes the edge posture relative to the web hostname and should be reviewed + deliberately. +
  • +
  • + A temporary Cloudflare API token was used to inspect and correct zone state during validation. That token + should be rotated outside this repository workflow. +
  • +
+
+ +
+

Follow-up Work

+
    +
  • + islandflow-fl5: decide whether api.flow.deltaisland.io should remain DNS-only or + be re-proxied through Cloudflare, then re-validate TLS, websocket, and operational behavior for the chosen + posture. +
  • +
  • + After operational soak, decide whether native should become the default production runtime or remain a + supported alternative with Docker as the preferred steady-state runtime. +
  • +
+
+
+ + diff --git a/packages/bus/src/jetstream.ts b/packages/bus/src/jetstream.ts index 2eaf6a0..04bfa85 100644 --- a/packages/bus/src/jetstream.ts +++ b/packages/bus/src/jetstream.ts @@ -9,7 +9,9 @@ import { type StreamUpdateConfig, JSONCodec, type JsMsg, - createInbox + createInbox, + nanos, + millis } from "nats"; import { getKnownStreamDefinitions, getStreamDefinition, type StreamRetentionClass } from "./streams"; @@ -164,13 +166,13 @@ export const resolveStreamRetention = ( ): Pick => { if (streamClass === "raw") { return { - max_age: parseBoundedNumber(env.STREAM_RAW_MAX_AGE_MS, 3_600_000), + max_age: nanos(parseBoundedNumber(env.STREAM_RAW_MAX_AGE_MS, 3_600_000)), max_bytes: parseBoundedNumber(env.STREAM_RAW_MAX_BYTES, 536_870_912) }; } return { - max_age: parseBoundedNumber(env.STREAM_DERIVED_MAX_AGE_MS, 43_200_000), + max_age: nanos(parseBoundedNumber(env.STREAM_DERIVED_MAX_AGE_MS, 43_200_000)), max_bytes: parseBoundedNumber(env.STREAM_DERIVED_MAX_BYTES, 268_435_456) }; }; @@ -417,7 +419,7 @@ const formatBytes = (value: number): string => { }; const formatRetentionSummary = (config: StreamConfig): string => { - return `age=${formatDurationMs(Number(config.max_age))} bytes=${formatBytes(config.max_bytes)} replicas=${config.num_replicas} retention=${config.retention} discard=${config.discard}`; + return `age=${formatDurationMs(millis(Number(config.max_age)))} bytes=${formatBytes(config.max_bytes)} replicas=${config.num_replicas} retention=${config.retention} discard=${config.discard}`; }; const formatReportLine = ( @@ -442,12 +444,12 @@ const formatReportLine = ( const details = report.retentionDrift .map((delta) => { const desiredValue = delta.field === "max_age" - ? formatDurationMs(Number(delta.desired)) + ? formatDurationMs(millis(Number(delta.desired))) : delta.field === "max_bytes" ? formatBytes(Number(delta.desired)) : formatStructuredValue(delta.desired); const currentValue = delta.field === "max_age" - ? formatDurationMs(Number(delta.current)) + ? 
formatDurationMs(millis(Number(delta.current))) : delta.field === "max_bytes" ? formatBytes(Number(delta.current)) : formatStructuredValue(delta.current); diff --git a/scripts/deploy.ts b/scripts/deploy.ts index 043122e..e6f3a5c 100644 --- a/scripts/deploy.ts +++ b/scripts/deploy.ts @@ -920,6 +920,10 @@ function remoteNativeVerification(scope: DeployScope, fast: boolean): void { const units = nativeUnitsForScope(scope).map((value) => shellEscape(value)).join(" "); const checks: string[] = []; + if (scope === "full" || scope === "api" || scope === "services" || scope === "workers") { + checks.push("./deployment/native/check-native-infra.sh"); + } + if (scopeIncludesApi(scope)) { checks.push('curl -fksS http://127.0.0.1:4000/health'); } @@ -954,10 +958,10 @@ function remoteVerification(runtime: DeployRuntime, scope: DeployScope, fast: bo function publicVerification(scope: DeployScope, fast: boolean): void { section("Public Verification"); - if (!fast || scopeIncludesWeb(scope)) { + if (scopeIncludesWeb(scope)) { runChecked("curl", ["-I", "-fksS", PUBLIC_APP_URL]); } else { - console.log("[deploy] Fast mode: skipping public app HEAD check because web scope is not included."); + console.log("[deploy] Skipping public app HEAD check because web scope is not included."); } if (scopeIncludesApi(scope) && PUBLIC_API_HEALTH_URL) { diff --git a/services/api/src/index.ts b/services/api/src/index.ts index 433222a..41761a7 100644 --- a/services/api/src/index.ts +++ b/services/api/src/index.ts @@ -138,6 +138,7 @@ const DeliverPolicySchema = z.enum(["new", "all", "last", "last_per_subject"]); const envSchema = z.object({ API_PORT: z.coerce.number().int().positive().default(4000), + API_HOST: z.string().min(1).default("127.0.0.1"), NATS_URL: z.string().default("nats://127.0.0.1:4222"), CLICKHOUSE_URL: z.string().default("http://127.0.0.1:8123"), CLICKHOUSE_DATABASE: z.string().default("default"), @@ -1313,6 +1314,7 @@ const run = async () => { }; const server = Bun.serve({ + 
hostname: env.API_HOST, port: env.API_PORT, fetch: async (req: Request, serverRef: any) => { const url = new URL(req.url); @@ -1995,7 +1997,7 @@ const run = async () => { } }); - logger.info("api listening", { port: server.port }); + logger.info("api listening", { host: env.API_HOST, port: server.port }); const shutdown = async (signal: string) => { if (state.shutdownPromise) {