Add flow packet clustering

This commit is contained in:
dirtydishes 2025-12-27 20:25:32 -05:00
parent a21d513f32
commit 6c376b26dc
8 changed files with 347 additions and 11 deletions

View file

@ -2,3 +2,5 @@ export const STREAM_OPTION_PRINTS = "OPTIONS_PRINTS";
export const SUBJECT_OPTION_PRINTS = "options.prints"; export const SUBJECT_OPTION_PRINTS = "options.prints";
export const STREAM_EQUITY_PRINTS = "EQUITY_PRINTS"; export const STREAM_EQUITY_PRINTS = "EQUITY_PRINTS";
export const SUBJECT_EQUITY_PRINTS = "equities.prints"; export const SUBJECT_EQUITY_PRINTS = "equities.prints";
// Stream name and subject for flow packets: the compute service creates a
// stream with this name bound to this subject and publishes packets on it.
export const STREAM_FLOW_PACKETS = "FLOW_PACKETS";
export const SUBJECT_FLOW_PACKETS = "flow.packets";

View file

@ -1,6 +1,6 @@
import { createClient, type ClickHouseClient } from "@clickhouse/client"; import { createClient, type ClickHouseClient } from "@clickhouse/client";
import { EquityPrintSchema, OptionPrintSchema } from "@islandflow/types"; import { EquityPrintSchema, FlowPacketSchema, OptionPrintSchema } from "@islandflow/types";
import type { EquityPrint, OptionPrint } from "@islandflow/types"; import type { EquityPrint, FlowPacket, OptionPrint } from "@islandflow/types";
import { import {
normalizeOptionPrint, normalizeOptionPrint,
optionPrintsTableDDL, optionPrintsTableDDL,
@ -11,6 +11,13 @@ import {
EQUITY_PRINTS_TABLE, EQUITY_PRINTS_TABLE,
normalizeEquityPrint normalizeEquityPrint
} from "./equity-prints"; } from "./equity-prints";
import {
FLOW_PACKETS_TABLE,
flowPacketsTableDDL,
fromFlowPacketRecord,
toFlowPacketRecord,
type FlowPacketRecord
} from "./flow-packets";
export type ClickHouseOptions = { export type ClickHouseOptions = {
url: string; url: string;
@ -44,6 +51,14 @@ export const ensureEquityPrintsTable = async (
}); });
}; };
/**
 * Executes the flow-packets table DDL against ClickHouse.
 *
 * @param client ClickHouse client used to run the statement.
 */
export const ensureFlowPacketsTable = async (
  client: ClickHouseClient
): Promise<void> => {
  const ddl = flowPacketsTableDDL();
  await client.exec({ query: ddl });
};
export const insertOptionPrint = async ( export const insertOptionPrint = async (
client: ClickHouseClient, client: ClickHouseClient,
print: OptionPrint print: OptionPrint
@ -68,6 +83,18 @@ export const insertEquityPrint = async (
}); });
}; };
/**
 * Inserts a single flow packet into the flow-packets table.
 *
 * The packet is first converted to its flat record form (nested objects
 * serialised to JSON strings) and sent as one JSONEachRow row.
 *
 * @param client ClickHouse client used for the insert.
 * @param packet Flow packet to persist.
 */
export const insertFlowPacket = async (
  client: ClickHouseClient,
  packet: FlowPacket
): Promise<void> => {
  const rows = [toFlowPacketRecord(packet)];
  await client.insert({
    table: FLOW_PACKETS_TABLE,
    values: rows,
    format: "JSONEachRow"
  });
};
const clampLimit = (limit: number): number => { const clampLimit = (limit: number): number => {
if (!Number.isFinite(limit)) { if (!Number.isFinite(limit)) {
return 100; return 100;
@ -149,6 +176,26 @@ const normalizeEquityRow = (row: unknown): unknown => {
return row; return row;
}; };
/**
 * Coerces a raw query row into a FlowPacketRecord.
 *
 * Returns null when the row is not an object. Numeric columns go through
 * coerceNumber, string columns default to "" (or "{}" for the JSON columns)
 * when missing, and `members` defaults to an empty array when it is not an
 * array.
 */
const normalizeFlowPacketRow = (row: unknown): FlowPacketRecord | null => {
  if (typeof row !== "object" || row === null) {
    return null;
  }

  const raw = row as Record<string, unknown>;

  const sourceTs = coerceNumber(raw.source_ts) as number;
  const ingestTs = coerceNumber(raw.ingest_ts) as number;
  const seq = coerceNumber(raw.seq) as number;
  const traceId = String(raw.trace_id ?? "");
  const id = String(raw.id ?? "");

  let members: string[] = [];
  if (Array.isArray(raw.members)) {
    members = raw.members.map((member) => String(member));
  }

  const featuresJson = String(raw.features_json ?? "{}");
  const joinQualityJson = String(raw.join_quality_json ?? "{}");

  return {
    source_ts: sourceTs,
    ingest_ts: ingestTs,
    seq,
    trace_id: traceId,
    id,
    members,
    features_json: featuresJson,
    join_quality_json: joinQualityJson
  };
};
export const fetchRecentOptionPrints = async ( export const fetchRecentOptionPrints = async (
client: ClickHouseClient, client: ClickHouseClient,
limit: number limit: number
@ -177,6 +224,24 @@ export const fetchRecentEquityPrints = async (
return EquityPrintSchema.array().parse(rows.map(normalizeEquityRow)); return EquityPrintSchema.array().parse(rows.map(normalizeEquityRow));
}; };
/**
 * Fetches the most recent flow packets, newest first.
 *
 * Rows are ordered by (source_ts, seq) descending, normalised, converted back
 * to FlowPacket objects and validated with FlowPacketSchema. Rows that cannot
 * be normalised (non-object rows) are dropped.
 *
 * @param client ClickHouse client used for the query.
 * @param limit Requested row count; passed through clampLimit before use.
 * @returns Validated flow packets.
 */
export const fetchRecentFlowPackets = async (
  client: ClickHouseClient,
  limit: number
): Promise<FlowPacket[]> => {
  const boundedLimit = clampLimit(limit);
  const sql = `SELECT * FROM ${FLOW_PACKETS_TABLE} ORDER BY source_ts DESC, seq DESC LIMIT ${boundedLimit}`;

  const resultSet = await client.query({ query: sql, format: "JSONEachRow" });
  const rawRows = await resultSet.json<unknown[]>();

  const packets: FlowPacket[] = [];
  for (const rawRow of rawRows) {
    const record = normalizeFlowPacketRow(rawRow);
    if (record !== null) {
      packets.push(fromFlowPacketRecord(record));
    }
  }

  return FlowPacketSchema.array().parse(packets);
};
export const fetchOptionPrintsAfter = async ( export const fetchOptionPrintsAfter = async (
client: ClickHouseClient, client: ClickHouseClient,
afterTs: number, afterTs: number,

View file

@ -0,0 +1,70 @@
import type { FlowPacket } from "@islandflow/types";
/** Name of the table that stores flow packets. */
export const FLOW_PACKETS_TABLE = "flow_packets";
/**
 * Flat row form of a FlowPacket as written to and read from storage.
 * The packet's `features` and `join_quality` objects are carried as JSON
 * strings in `features_json` and `join_quality_json`.
 */
export type FlowPacketRecord = {
source_ts: number;
ingest_ts: number;
seq: number;
trace_id: string;
id: string;
members: string[];
// JSON.stringify output of the packet's `features` object.
features_json: string;
// JSON.stringify output of the packet's `join_quality` object.
join_quality_json: string;
};
/**
 * Builds the CREATE TABLE IF NOT EXISTS statement for the flow-packets table.
 *
 * The table uses the MergeTree engine ordered by (source_ts, seq); the nested
 * packet objects are stored in String columns as JSON.
 *
 * @returns The DDL statement as a string.
 */
export const flowPacketsTableDDL = (): string => {
  const columns = [
    "source_ts UInt64",
    "ingest_ts UInt64",
    "seq UInt64",
    "trace_id String",
    "id String",
    "members Array(String)",
    "features_json String",
    "join_quality_json String"
  ];

  return `\nCREATE TABLE IF NOT EXISTS ${FLOW_PACKETS_TABLE} (\n${columns.join(",\n")}\n)\nENGINE = MergeTree\nORDER BY (source_ts, seq)\n`;
};
/**
 * Converts a FlowPacket into its flat storage record.
 *
 * Envelope fields are copied as-is (the `members` array is shared, not
 * cloned); `features` and `join_quality` are serialised with JSON.stringify.
 *
 * @param packet Flow packet to flatten.
 * @returns The record form of the packet.
 */
export const toFlowPacketRecord = (packet: FlowPacket): FlowPacketRecord => {
  const { source_ts, ingest_ts, seq, trace_id, id, members, features, join_quality } = packet;

  return {
    source_ts,
    ingest_ts,
    seq,
    trace_id,
    id,
    members,
    features_json: JSON.stringify(features),
    join_quality_json: JSON.stringify(join_quality)
  };
};
/**
 * Parses a JSON string that is expected to hold a plain object.
 *
 * @param value Raw JSON text.
 * @param fallback Value returned when `value` is not valid JSON or does not
 *   decode to a non-null, non-array object.
 * @returns The decoded object, or `fallback`.
 */
const safeJson = (value: string, fallback: Record<string, unknown>): Record<string, unknown> => {
  try {
    const parsed: unknown = JSON.parse(value);
    // `typeof [] === "object"`, so arrays must be excluded explicitly;
    // otherwise a JSON array would be handed back typed as a key/value record.
    if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
      return parsed as Record<string, unknown>;
    }
  } catch {
    // Malformed JSON: fall through and return the fallback.
  }
  return fallback;
};
/**
 * Rebuilds a FlowPacket from its flat storage record.
 *
 * Envelope fields are copied as-is; the two JSON columns are decoded with
 * safeJson, which yields an empty object when a column cannot be decoded.
 *
 * @param record Storage record to expand.
 * @returns The reconstructed flow packet.
 */
export const fromFlowPacketRecord = (record: FlowPacketRecord): FlowPacket => {
  const features = safeJson(record.features_json, {});
  const joinQuality = safeJson(record.join_quality_json, {}) as Record<string, number>;

  return {
    source_ts: record.source_ts,
    ingest_ts: record.ingest_ts,
    seq: record.seq,
    trace_id: record.trace_id,
    id: record.id,
    members: record.members,
    features,
    join_quality: joinQuality
  };
};

View file

@ -1,3 +1,4 @@
export * from "./clickhouse"; export * from "./clickhouse";
export * from "./flow-packets";
export * from "./equity-prints"; export * from "./equity-prints";
export * from "./option-prints"; export * from "./option-prints";

View file

@ -0,0 +1,39 @@
import { describe, expect, it } from "bun:test";
import {
flowPacketsTableDDL,
FLOW_PACKETS_TABLE,
fromFlowPacketRecord,
toFlowPacketRecord
} from "../src/flow-packets";
// Shared fixture: a flow packet with two members, a small features object
// and a single join-quality metric, used by the tests below.
const packet = {
source_ts: 10,
ingest_ts: 20,
seq: 1,
trace_id: "fp-1",
id: "fp-1",
members: ["p1", "p2"],
features: {
option_contract_id: "SPY-2025-01-17-450-C",
count: 2,
total_size: 30
},
join_quality: {
nbbo_age_ms: 5
}
};
// Tests for the flow-packet storage helpers: DDL generation and the
// packet <-> record conversion pair.
describe("flow-packets storage helpers", () => {
  it("includes the correct table name in the DDL", () => {
    const ddl = flowPacketsTableDDL();
    expect(ddl).toContain(FLOW_PACKETS_TABLE);
    expect(ddl).toContain("CREATE TABLE IF NOT EXISTS");
  });

  it("round-trips flow packet records", () => {
    const record = toFlowPacketRecord(packet);

    // The nested objects must be flattened to JSON strings in the record.
    expect(typeof record.features_json).toBe("string");
    expect(typeof record.join_quality_json).toBe("string");

    const restored = fromFlowPacketRecord(record);
    expect(restored.features).toEqual(packet.features);
    expect(restored.join_quality).toEqual(packet.join_quality);
    // Envelope fields (timestamps, seq, ids, members) must survive as well,
    // so compare the whole packet rather than only the JSON-backed fields.
    expect(restored).toEqual(packet);
  });
});

View file

@ -13,7 +13,9 @@ import {
import { import {
createClickHouseClient, createClickHouseClient,
ensureEquityPrintsTable, ensureEquityPrintsTable,
ensureFlowPacketsTable,
ensureOptionPrintsTable, ensureOptionPrintsTable,
fetchRecentFlowPackets,
fetchEquityPrintsAfter, fetchEquityPrintsAfter,
fetchRecentEquityPrints, fetchRecentEquityPrints,
fetchOptionPrintsAfter, fetchOptionPrintsAfter,
@ -141,6 +143,7 @@ const run = async () => {
await ensureOptionPrintsTable(clickhouse); await ensureOptionPrintsTable(clickhouse);
await ensureEquityPrintsTable(clickhouse); await ensureEquityPrintsTable(clickhouse);
await ensureFlowPacketsTable(clickhouse);
const optionSubscription = await subscribeJson( const optionSubscription = await subscribeJson(
js, js,
@ -208,6 +211,12 @@ const run = async () => {
return jsonResponse({ data }); return jsonResponse({ data });
} }
if (req.method === "GET" && url.pathname === "/flow/packets") {
const limit = parseLimit(url.searchParams.get("limit"));
const data = await fetchRecentFlowPackets(clickhouse, limit);
return jsonResponse({ data });
}
if (req.method === "GET" && url.pathname === "/replay/options") { if (req.method === "GET" && url.pathname === "/replay/options") {
const { afterTs, afterSeq, limit } = parseReplayParams(url); const { afterTs, afterSeq, limit } = parseReplayParams(url);
const data = await fetchOptionPrintsAfter(clickhouse, afterTs, afterSeq, limit); const data = await fetchOptionPrintsAfter(clickhouse, afterTs, afterSeq, limit);

View file

@ -9,6 +9,7 @@
"@islandflow/bus": "workspace:*", "@islandflow/bus": "workspace:*",
"@islandflow/config": "workspace:*", "@islandflow/config": "workspace:*",
"@islandflow/observability": "workspace:*", "@islandflow/observability": "workspace:*",
"@islandflow/storage": "workspace:*",
"@islandflow/types": "workspace:*", "@islandflow/types": "workspace:*",
"zod": "^3.23.8" "zod": "^3.23.8"
} }

View file

@ -1,25 +1,137 @@
import { readEnv } from "@islandflow/config"; import { readEnv } from "@islandflow/config";
import { createLogger } from "@islandflow/observability"; import { createLogger } from "@islandflow/observability";
import { import {
SUBJECT_FLOW_PACKETS,
SUBJECT_OPTION_PRINTS, SUBJECT_OPTION_PRINTS,
STREAM_FLOW_PACKETS,
STREAM_OPTION_PRINTS, STREAM_OPTION_PRINTS,
buildDurableConsumer, buildDurableConsumer,
connectJetStreamWithRetry, connectJetStreamWithRetry,
ensureStream, ensureStream,
publishJson,
subscribeJson subscribeJson
} from "@islandflow/bus"; } from "@islandflow/bus";
import { OptionPrintSchema } from "@islandflow/types"; import {
createClickHouseClient,
ensureFlowPacketsTable,
insertFlowPacket
} from "@islandflow/storage";
import { FlowPacketSchema, OptionPrintSchema, type FlowPacket, type OptionPrint } from "@islandflow/types";
import { z } from "zod"; import { z } from "zod";
const service = "compute"; const service = "compute";
const logger = createLogger({ service }); const logger = createLogger({ service });
const envSchema = z.object({ const envSchema = z.object({
NATS_URL: z.string().default("nats://localhost:4222") NATS_URL: z.string().default("nats://localhost:4222"),
CLICKHOUSE_URL: z.string().default("http://localhost:8123"),
CLICKHOUSE_DATABASE: z.string().default("default"),
CLUSTER_WINDOW_MS: z.coerce.number().int().positive().default(500)
}); });
const env = readEnv(envSchema); const env = readEnv(envSchema);
// Running aggregate for one open cluster of option prints on a single
// contract. Created by buildCluster from the first print and mutated in place
// by updateCluster for each further print.
type ClusterState = {
// option_contract_id of the prints in this cluster.
contractId: string;
// ts of the print that opened the cluster.
startTs: number;
// Largest ts seen among the cluster's prints.
endTs: number;
// source_ts of the print that opened the cluster.
startSourceTs: number;
// Largest ingest_ts seen among the cluster's prints.
endIngestTs: number;
// Largest seq seen among the cluster's prints.
endSeq: number;
// trace_id of every print added to the cluster, in arrival order.
members: string[];
// Sum of print sizes.
totalSize: number;
// Sum of price * size over the cluster's prints.
totalPremium: number;
// Price of the print that opened the cluster.
firstPrice: number;
// Price recorded as the cluster's last price; maintained by updateCluster.
lastPrice: number;
};
// Open clusters, keyed by option contract id.
const clusters = new Map<string, ClusterState>();
/**
 * Opens a new cluster seeded from a single option print.
 *
 * Start and end markers both take the print's values, the member list holds
 * only the print's trace_id, and premium is price * size.
 *
 * @param print First print of the cluster.
 * @returns A fresh cluster state.
 */
const buildCluster = (print: OptionPrint): ClusterState => {
  const { option_contract_id, ts, source_ts, ingest_ts, seq, trace_id, size, price } = print;

  return {
    contractId: option_contract_id,
    startTs: ts,
    endTs: ts,
    startSourceTs: source_ts,
    endIngestTs: ingest_ts,
    endSeq: seq,
    members: [trace_id],
    totalSize: size,
    totalPremium: price * size,
    firstPrice: price,
    lastPrice: price
  };
};
/**
 * Folds one more option print into an existing cluster, mutating it in place.
 *
 * End markers (ts, ingest_ts, seq) only ever move forward, so a print that
 * arrives out of order still contributes its size, premium and trace_id but
 * cannot pull the markers backwards. `lastPrice` follows the same rule: it is
 * replaced only when the print is at least as recent as the cluster's current
 * end timestamp.
 *
 * NOTE(review): `startTs` and `firstPrice` are not adjusted when a print older
 * than the cluster start arrives — confirm whether that case needs handling.
 *
 * @param cluster Cluster to update (mutated).
 * @param print Print to add.
 * @returns The same cluster instance.
 */
const updateCluster = (cluster: ClusterState, print: OptionPrint): ClusterState => {
  // Must be decided before endTs is advanced below.
  const isLatestPrint = print.ts >= cluster.endTs;

  cluster.endTs = Math.max(cluster.endTs, print.ts);
  cluster.endIngestTs = Math.max(cluster.endIngestTs, print.ingest_ts);
  cluster.endSeq = Math.max(cluster.endSeq, print.seq);
  cluster.members.push(print.trace_id);
  cluster.totalSize += print.size;
  cluster.totalPremium += print.price * print.size;
  if (isLatestPrint) {
    cluster.lastPrice = print.price;
  }
  return cluster;
};
/**
 * Emits a cluster as a flow packet.
 *
 * Builds the packet from the cluster's aggregates, validates it with
 * FlowPacketSchema, inserts it into ClickHouse, publishes it on the flow
 * packets subject, and logs the emission. The insert happens before the
 * publish; a failure in either step propagates to the caller.
 *
 * @param clickhouse ClickHouse client used for the insert.
 * @param js JetStream handle used for the publish.
 * @param cluster Cluster to emit; it is read, not modified.
 */
const flushCluster = async (
  clickhouse: ReturnType<typeof createClickHouseClient>,
  js: Awaited<ReturnType<typeof connectJetStreamWithRetry>>["js"],
  cluster: ClusterState
): Promise<void> => {
  const { contractId, startTs, endTs, members } = cluster;

  // The same identifier serves as both the packet id and its trace id.
  const packetId = `flowpacket:${contractId}:${startTs}:${endTs}`;
  // Round the accumulated premium to four decimal places.
  const roundedPremium = Number(cluster.totalPremium.toFixed(4));

  const features = {
    option_contract_id: contractId,
    count: members.length,
    total_size: cluster.totalSize,
    total_premium: roundedPremium,
    first_price: cluster.firstPrice,
    last_price: cluster.lastPrice,
    start_ts: startTs,
    end_ts: endTs,
    window_ms: env.CLUSTER_WINDOW_MS
  };

  const candidate: FlowPacket = {
    source_ts: cluster.startSourceTs,
    ingest_ts: cluster.endIngestTs,
    seq: cluster.endSeq,
    trace_id: packetId,
    id: packetId,
    members,
    features,
    join_quality: {}
  };

  const packet = FlowPacketSchema.parse(candidate);

  await insertFlowPacket(clickhouse, packet);
  await publishJson(js, SUBJECT_FLOW_PACKETS, packet);

  logger.info("emitted flow packet", {
    id: packet.id,
    contract: contractId,
    count: members.length
  });
};
/**
 * Flushes every open cluster, other than the one for `skipContractId`, whose
 * last print is more than CLUSTER_WINDOW_MS older than `currentTs`.
 *
 * Each expired cluster is removed from the map before it is flushed, and the
 * clusters are flushed one at a time.
 *
 * @param clickhouse ClickHouse client passed through to flushCluster.
 * @param js JetStream handle passed through to flushCluster.
 * @param currentTs Timestamp against which cluster age is measured.
 * @param skipContractId Contract whose cluster is left untouched.
 */
const flushEligibleClusters = async (
  clickhouse: ReturnType<typeof createClickHouseClient>,
  js: Awaited<ReturnType<typeof connectJetStreamWithRetry>>["js"],
  currentTs: number,
  skipContractId: string
): Promise<void> => {
  for (const [key, candidate] of clusters) {
    const isSkipped = key === skipContractId;
    const isExpired = currentTs - candidate.endTs > env.CLUSTER_WINDOW_MS;
    if (isSkipped || !isExpired) {
      continue;
    }

    // Remove first so the cluster is no longer reachable through the map
    // while the flush is awaiting.
    clusters.delete(key);
    await flushCluster(clickhouse, js, candidate);
  }
};
const run = async () => { const run = async () => {
logger.info("service starting"); logger.info("service starting");
@ -44,13 +156,42 @@ const run = async () => {
num_replicas: 1 num_replicas: 1
}); });
const opts = buildDurableConsumer("compute-option-prints"); await ensureStream(jsm, {
name: STREAM_FLOW_PACKETS,
subjects: [SUBJECT_FLOW_PACKETS],
retention: "limits",
storage: "file",
discard: "old",
max_msgs_per_subject: -1,
max_msgs: -1,
max_bytes: -1,
max_age: 0,
num_replicas: 1
});
const subscription = await subscribeJson(js, SUBJECT_OPTION_PRINTS, opts); const clickhouse = createClickHouseClient({
url: env.CLICKHOUSE_URL,
database: env.CLICKHOUSE_DATABASE
});
await ensureFlowPacketsTable(clickhouse);
const subscription = await subscribeJson(
js,
SUBJECT_OPTION_PRINTS,
buildDurableConsumer("compute-option-prints")
);
const shutdown = async (signal: string) => { const shutdown = async (signal: string) => {
logger.info("service stopping", { signal }); logger.info("service stopping", { signal });
for (const cluster of clusters.values()) {
await flushCluster(clickhouse, js, cluster);
}
clusters.clear();
await nc.drain(); await nc.drain();
await clickhouse.close();
process.exit(0); process.exit(0);
}; };
@ -60,11 +201,19 @@ const run = async () => {
for await (const msg of subscription.messages) { for await (const msg of subscription.messages) {
try { try {
const print = OptionPrintSchema.parse(subscription.decode(msg)); const print = OptionPrintSchema.parse(subscription.decode(msg));
logger.info("received option print", { await flushEligibleClusters(clickhouse, js, print.ts, print.option_contract_id);
trace_id: print.trace_id,
seq: print.seq, const existing = clusters.get(print.option_contract_id);
option_contract_id: print.option_contract_id if (!existing) {
}); clusters.set(print.option_contract_id, buildCluster(print));
} else if (print.ts - existing.startTs <= env.CLUSTER_WINDOW_MS) {
updateCluster(existing, print);
} else {
clusters.delete(print.option_contract_id);
await flushCluster(clickhouse, js, existing);
clusters.set(print.option_contract_id, buildCluster(print));
}
msg.ack(); msg.ack();
} catch (error) { } catch (error) {
logger.error("failed to process option print", { logger.error("failed to process option print", {