Add smart money replay evaluation harness

This commit is contained in:
dirtydishes 2026-05-05 02:08:16 -04:00
parent de6d25f046
commit 19a499d33c
4 changed files with 398 additions and 3 deletions

View file

@ -5,7 +5,7 @@
{"_type":"issue","id":"islandflow-0v6","title":"Fix tape freshness, NBBO coverage, pause controls, and filter popup","description":"Implement the tape fixes requested for synthetic options notional sizing, strict live freshness, live-mode pause/resume behavior, stronger NBBO snapshot coverage, and moving flow filters behind a popup. Includes server-side live cache changes, web terminal state/UI changes, and tests for synthetic pricing, live snapshot freshness/NBBO retention, and live pause/filter interactions.","status":"closed","priority":1,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-04-28T21:02:52Z","created_by":"dirtydishes","updated_at":"2026-04-28T21:13:38Z","started_at":"2026-04-28T21:02:57Z","closed_at":"2026-04-28T21:13:38Z","close_reason":"Completed","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-0v6","title":"Fix tape freshness, NBBO coverage, pause controls, and filter popup","description":"Implement the tape fixes requested for synthetic options notional sizing, strict live freshness, live-mode pause/resume behavior, stronger NBBO snapshot coverage, and moving flow filters behind a popup. Includes server-side live cache changes, web terminal state/UI changes, and tests for synthetic pricing, live snapshot freshness/NBBO retention, and live pause/filter interactions.","status":"closed","priority":1,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-04-28T21:02:52Z","created_by":"dirtydishes","updated_at":"2026-04-28T21:13:38Z","started_at":"2026-04-28T21:02:57Z","closed_at":"2026-04-28T21:13:38Z","close_reason":"Completed","dependency_count":0,"dependent_count":0,"comment_count":0}
{"_type":"issue","id":"islandflow-e4r","title":"Implement smart-money flow filtering and synthetic firehose modes","description":"Implement the approved multi-surface plan for named synthetic market profiles, options raw-vs-signal filtering, live/API filter contracts, Tape page client-side flow filters, firehose-readiness improvements, tests, and README updates.","status":"closed","priority":1,"issue_type":"feature","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-04-28T20:10:49Z","created_by":"dirtydishes","updated_at":"2026-04-28T20:29:29Z","started_at":"2026-04-28T20:10:53Z","closed_at":"2026-04-28T20:29:29Z","close_reason":"Implemented synthetic market profiles, options signal-path filtering, signal-aware API/replay contracts, Tape page filters, tests, and README updates. Follow-up tracked in islandflow-biq.","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-e4r","title":"Implement smart-money flow filtering and synthetic firehose modes","description":"Implement the approved multi-surface plan for named synthetic market profiles, options raw-vs-signal filtering, live/API filter contracts, Tape page client-side flow filters, firehose-readiness improvements, tests, and README updates.","status":"closed","priority":1,"issue_type":"feature","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-04-28T20:10:49Z","created_by":"dirtydishes","updated_at":"2026-04-28T20:29:29Z","started_at":"2026-04-28T20:10:53Z","closed_at":"2026-04-28T20:29:29Z","close_reason":"Implemented synthetic market profiles, options signal-path filtering, signal-aware API/replay contracts, Tape page filters, tests, and README updates. Follow-up tracked in islandflow-biq.","dependency_count":0,"dependent_count":0,"comment_count":0}
{"_type":"issue","id":"islandflow-b6d","title":"Finish smart-money event-calendar enrichment","description":"Finish the smart-money event-calendar provider layer in services/refdata and connect days-to-event / expiry-after-event enrichment into compute using timestamp-available data only.","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-04T21:35:26Z","created_by":"dirtydishes","updated_at":"2026-05-04T23:21:09Z","started_at":"2026-05-04T23:18:29Z","closed_at":"2026-05-04T23:21:09Z","close_reason":"Completed event-calendar provider and compute enrichment","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-b6d","title":"Finish smart-money event-calendar enrichment","description":"Finish the smart-money event-calendar provider layer in services/refdata and connect days-to-event / expiry-after-event enrichment into compute using timestamp-available data only.","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-04T21:35:26Z","created_by":"dirtydishes","updated_at":"2026-05-04T23:21:09Z","started_at":"2026-05-04T23:18:29Z","closed_at":"2026-05-04T23:21:09Z","close_reason":"Completed event-calendar provider and compute enrichment","dependency_count":0,"dependent_count":0,"comment_count":0}
{"_type":"issue","id":"islandflow-e60","title":"Add smart-money replay evaluation harness","description":"Add replay-style live-vs-batch consistency tests plus evaluation utilities for parent-event precision/recall, calibration, abstention rate, and economic sanity checks.","status":"open","priority":2,"issue_type":"task","owner":"dishes@dpdrm.com","created_at":"2026-05-04T21:35:25Z","created_by":"dirtydishes","updated_at":"2026-05-04T21:35:25Z","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-e60","title":"Add smart-money replay evaluation harness","description":"Add replay-style live-vs-batch consistency tests plus evaluation utilities for parent-event precision/recall, calibration, abstention rate, and economic sanity checks.","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-04T21:35:25Z","created_by":"dirtydishes","updated_at":"2026-05-05T06:08:08Z","started_at":"2026-05-05T06:07:22Z","closed_at":"2026-05-05T06:08:08Z","close_reason":"Completed smart-money replay consistency harness and evaluation utilities.","dependency_count":0,"dependent_count":0,"comment_count":0}
{"_type":"issue","id":"islandflow-020","title":"Rebuild synthetic smart-money scenarios","description":"Rework services/ingest-options synthetic generation around labeled parent-event templates for the six core smart-money profiles plus neutral background noise, with deterministic test/demo modes and hidden labels for tests.","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-04T21:35:24Z","created_by":"dirtydishes","updated_at":"2026-05-05T05:29:27Z","started_at":"2026-05-05T05:25:39Z","closed_at":"2026-05-05T05:29:27Z","close_reason":"Completed Phase 5 synthetic smart-money scenario rebuild","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-020","title":"Rebuild synthetic smart-money scenarios","description":"Rework services/ingest-options synthetic generation around labeled parent-event templates for the six core smart-money profiles plus neutral background noise, with deterministic test/demo modes and hidden labels for tests.","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-04T21:35:24Z","created_by":"dirtydishes","updated_at":"2026-05-05T05:29:27Z","started_at":"2026-05-05T05:25:39Z","closed_at":"2026-05-05T05:29:27Z","close_reason":"Completed Phase 5 synthetic smart-money scenario rebuild","dependency_count":0,"dependent_count":0,"comment_count":0}
{"_type":"issue","id":"islandflow-zs0","title":"Migrate terminal UI to smart-money profiles","description":"Migrate apps/web terminal rendering to consume SmartMoneyEvent directly: primary profile, probability ladder, reason codes, and suppression/abstention state, while preserving legacy alert/classifier displays during the bridge.","status":"closed","priority":2,"issue_type":"task","owner":"dishes@dpdrm.com","created_at":"2026-05-04T21:35:23Z","created_by":"dirtydishes","updated_at":"2026-05-05T05:39:58Z","closed_at":"2026-05-05T05:39:58Z","close_reason":"Completed terminal smart-money profile migration","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-zs0","title":"Migrate terminal UI to smart-money profiles","description":"Migrate apps/web terminal rendering to consume SmartMoneyEvent directly: primary profile, probability ladder, reason codes, and suppression/abstention state, while preserving legacy alert/classifier displays during the bridge.","status":"closed","priority":2,"issue_type":"task","owner":"dishes@dpdrm.com","created_at":"2026-05-04T21:35:23Z","created_by":"dirtydishes","updated_at":"2026-05-05T05:39:58Z","closed_at":"2026-05-05T05:39:58Z","close_reason":"Completed terminal smart-money profile migration","dependency_count":0,"dependent_count":0,"comment_count":0}
{"_type":"issue","id":"islandflow-igk","title":"Add plan mode","description":"Implement a user-facing plan mode in the application so users can switch into planning before taking action. Scope to be clarified from existing app patterns.","status":"closed","priority":2,"issue_type":"feature","owner":"dishes@dpdrm.com","created_at":"2026-05-04T04:22:37Z","created_by":"dirtydishes","updated_at":"2026-05-04T04:26:18Z","started_at":"2026-05-04T04:22:40Z","closed_at":"2026-05-04T04:26:18Z","close_reason":"Implemented as a global pi extension toggled with Shift+P","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-igk","title":"Add plan mode","description":"Implement a user-facing plan mode in the application so users can switch into planning before taking action. Scope to be clarified from existing app patterns.","status":"closed","priority":2,"issue_type":"feature","owner":"dishes@dpdrm.com","created_at":"2026-05-04T04:22:37Z","created_by":"dirtydishes","updated_at":"2026-05-04T04:26:18Z","started_at":"2026-05-04T04:22:40Z","closed_at":"2026-05-04T04:26:18Z","close_reason":"Implemented as a global pi extension toggled with Shift+P","dependency_count":0,"dependent_count":0,"comment_count":0}

View file

@ -52,8 +52,8 @@ Acceptance: old classifier and alert endpoints still work while `/flow/smart-mon
### Phase 7: Evaluation and Replay ### Phase 7: Evaluation and Replay
- [x] Add deterministic unit tests for parent-event scoring and storage. - [x] Add deterministic unit tests for parent-event scoring and storage.
- [ ] Add replay-style live-vs-batch consistency tests. - [x] Add replay-style live-vs-batch consistency tests.
- [ ] Add evaluation utilities for calibration, abstention rate, and economic sanity checks. - [x] Add evaluation utilities for calibration, abstention rate, and economic sanity checks.
## Migration Notes ## Migration Notes

View file

@ -0,0 +1,242 @@
import type { FlowPacket, SmartMoneyDirection, SmartMoneyEvent, SmartMoneyProfileId } from "@islandflow/types";
import { buildSmartMoneyEventFromPacket, type SmartMoneyParentEventOptions } from "./parent-events";
/**
 * Ground-truth annotation for a single smart-money event.
 * Labels are joined to events by `event_id` during evaluation.
 */
export type SmartMoneyLabel = {
/** Identifier of the event this label applies to. */
event_id: string;
/** Expected profile. NOTE(review): `null` presumably means "no profile expected" — confirm. */
profile_id: SmartMoneyProfileId | null;
/** Expected direction. Not read by any evaluation helper in this module. */
direction?: Exclude<SmartMoneyDirection, "unknown">;
/** Realized return in basis points; the economic sanity check multiplies it by the predicted direction's sign. */
realized_return_bps?: number;
};
/**
 * One disagreement between live and batch replay outputs for a given event id.
 */
export type ReplayConsistencyMismatch = {
event_id: string;
/**
 * `missing_live`: the id appears only in the batch output.
 * `missing_batch`: the id appears only in the live output.
 * `signature`: both sides have the id but their serialized signatures differ.
 */
field: "missing_live" | "missing_batch" | "signature";
/** Live-side signature; absent for `missing_live`. */
live?: SmartMoneyEventSignature;
/** Batch-side signature; absent for `missing_batch`. */
batch?: SmartMoneyEventSignature;
};
/**
 * Summary returned by `compareSmartMoneyReplayOutputs`.
 */
export type ReplayConsistencyReport = {
/** Length of the live input array (duplicates by id are not removed from this count). */
live_count: number;
/** Length of the batch input array (duplicates by id are not removed from this count). */
batch_count: number;
/** Number of distinct event ids across both sides minus the number of mismatches. */
matched_count: number;
mismatches: ReplayConsistencyMismatch[];
/** True when `mismatches` is empty. */
consistent: boolean;
};
/**
 * Reduced, comparable projection of a `SmartMoneyEvent` used to compare live and batch outputs.
 * Produced by `smartMoneyEventSignature`.
 */
export type SmartMoneyEventSignature = {
event_id: string;
primary_profile_id: SmartMoneyProfileId | null;
primary_direction: SmartMoneyDirection;
abstained: boolean;
/** Copy of the event's suppressed reasons, sorted so that ordering differences do not register as mismatches. */
suppressed_reasons: string[];
/** Per-profile scores in the event's original order, with probability rounded to 6 decimal places. */
profile_scores: Array<{
profile_id: SmartMoneyProfileId;
probability: number;
confidence_band: SmartMoneyEvent["profile_scores"][number]["confidence_band"];
direction: SmartMoneyDirection;
}>;
};
/**
 * One equal-width probability bucket of the calibration table built by `buildCalibration`.
 */
export type CalibrationBucket = {
/** Lower edge of the bucket (`index / bucketCount`, rounded). */
min_probability: number;
/** Upper edge of the bucket (`(index + 1) / bucketCount`, rounded). */
max_probability: number;
/** Number of labeled events whose primary-profile probability fell into this bucket. */
count: number;
/** Mean probability of the bucket's events; 0 when the bucket is empty. */
average_probability: number;
/** Share of the bucket's events that were not abstained and matched the labeled profile; `null` when empty. */
accuracy: number | null;
};
/**
 * Aggregate evaluation metrics returned by `evaluateSmartMoneyEvents`.
 */
export type SmartMoneyEvaluationReport = {
/** Total number of events passed in. */
sample_count: number;
/** Number of events that have a label with a matching `event_id`. */
labeled_count: number;
/** Number of events that are not abstained and have a primary profile. */
emitted_count: number;
/** Number of events flagged `abstained`. */
abstained_count: number;
/** `abstained_count / sample_count`, rounded; 0 when there are no events. */
abstention_rate: number;
/** Per-profile precision over labeled events; `null` when the profile was never predicted. */
profile_precision: Partial<Record<SmartMoneyProfileId, number | null>>;
/** Per-profile recall over labeled events; `null` when no label carries the profile. */
profile_recall: Partial<Record<SmartMoneyProfileId, number | null>>;
calibration: CalibrationBucket[];
economic_sanity: {
/** Labeled events with a bullish/bearish primary direction and a finite realized return. */
directional_count: number;
/** Share of directional events whose signed return is strictly positive; `null` when there are none. */
direction_hit_rate: number | null;
/** Mean of `sign * realized_return_bps`, rounded to 2 decimals; `null` when there are none. */
average_signed_return_bps: number | null;
};
};
// Profiles for which `evaluateSmartMoneyEvents` reports precision and recall.
// NOTE(review): presumably this should list every SmartMoneyProfileId — confirm against @islandflow/types.
const PROFILES: SmartMoneyProfileId[] = [
"institutional_directional",
"retail_whale",
"event_driven",
"vol_seller",
"arbitrage",
"hedge_reactive"
];
/**
 * Converts a direction into a numeric sign.
 *
 * @param direction - Predicted direction of an event.
 * @returns `1` for "bullish", `-1` for "bearish", and `0` for any other value.
 */
const directionalSign = (direction: SmartMoneyDirection): number => {
  switch (direction) {
    case "bullish":
      return 1;
    case "bearish":
      return -1;
    default:
      return 0;
  }
};
/**
 * Rounds a number to a fixed count of decimal places.
 *
 * @param value - Number to round.
 * @param digits - Decimal places to keep (defaults to 4).
 * @returns The rounded value, or `0` when `value` is NaN or infinite.
 */
const round = (value: number, digits = 4): number =>
  Number.isFinite(value) ? Number(value.toFixed(digits)) : 0;
/**
 * Projects an event onto the fields used for live-vs-batch comparison.
 *
 * Suppressed reasons are copied and sorted so their order never causes a
 * mismatch; profile scores keep the event's order and have their probability
 * rounded to 6 decimal places.
 *
 * @param event - Event to summarize; it is not mutated.
 * @returns The comparable signature for `event`.
 */
export const smartMoneyEventSignature = (event: SmartMoneyEvent): SmartMoneyEventSignature => {
  const sortedReasons = event.suppressed_reasons.slice().sort();
  const scores = event.profile_scores.map(({ profile_id, probability, confidence_band, direction }) => ({
    profile_id,
    probability: round(probability, 6),
    confidence_band,
    direction
  }));

  // Key order is kept stable because signatures are compared via JSON.stringify.
  return {
    event_id: event.event_id,
    primary_profile_id: event.primary_profile_id,
    primary_direction: event.primary_direction,
    abstained: event.abstained,
    suppressed_reasons: sortedReasons,
    profile_scores: scores
  };
};
/**
 * Builds smart-money events from packets in a deterministic replay order.
 *
 * Packets are copied (the input array is not mutated) and ordered by
 * `source_ts`, then `seq`, then `id`. The id tiebreak uses plain code-unit
 * comparison rather than `localeCompare`, so the order does not depend on the
 * host's locale or ICU data.
 *
 * @param packets - Flow packets in any order.
 * @param optionsByPacketId - Optional per-packet build options keyed by packet id.
 * @returns One event per packet, in replay order.
 */
export const buildSmartMoneyEventsForReplay = (
  packets: FlowPacket[],
  optionsByPacketId: Record<string, SmartMoneyParentEventOptions | undefined> = {}
): SmartMoneyEvent[] => {
  // Locale-independent string ordering for the final tiebreak.
  const compareIds = (left: string, right: string): number => {
    if (left < right) {
      return -1;
    }
    return left > right ? 1 : 0;
  };

  return packets
    .slice()
    .sort((a, b) => a.source_ts - b.source_ts || a.seq - b.seq || compareIds(a.id, b.id))
    .map((packet) => buildSmartMoneyEventFromPacket(packet, optionsByPacketId[packet.id]));
};
/**
 * Compares live and batch replay outputs event by event.
 *
 * Each side is indexed by `event_id` (when an id repeats on one side, the last
 * occurrence's signature is the one compared). Every distinct id, visited in
 * sorted order, produces at most one mismatch: missing on the live side,
 * missing on the batch side, or differing serialized signatures.
 *
 * @param liveEvents - Events produced by the live path.
 * @param batchEvents - Events produced by the batch/replay path.
 * @returns Counts, the list of mismatches, and an overall consistency flag.
 */
export const compareSmartMoneyReplayOutputs = (
  liveEvents: SmartMoneyEvent[],
  batchEvents: SmartMoneyEvent[]
): ReplayConsistencyReport => {
  const indexSignatures = (events: SmartMoneyEvent[]) => {
    const index = new Map<SmartMoneyEvent["event_id"], SmartMoneyEventSignature>();
    for (const event of events) {
      index.set(event.event_id, smartMoneyEventSignature(event));
    }
    return index;
  };

  const liveSignatures = indexSignatures(liveEvents);
  const batchSignatures = indexSignatures(batchEvents);
  const allIds = Array.from(new Set([...liveSignatures.keys(), ...batchSignatures.keys()])).sort();

  const mismatches = allIds.flatMap((id): ReplayConsistencyMismatch[] => {
    const live = liveSignatures.get(id);
    const batch = batchSignatures.get(id);
    if (!live) {
      return [{ event_id: id, field: "missing_live", batch }];
    }
    if (!batch) {
      return [{ event_id: id, field: "missing_batch", live }];
    }
    // Signatures are built by the same function, so key order is identical on both sides.
    return JSON.stringify(live) === JSON.stringify(batch)
      ? []
      : [{ event_id: id, field: "signature", live, batch }];
  });

  return {
    live_count: liveEvents.length,
    batch_count: batchEvents.length,
    matched_count: allIds.length - mismatches.length,
    mismatches,
    consistent: mismatches.length === 0
  };
};
/**
 * Scores a set of smart-money events against ground-truth labels.
 *
 * Events are joined to labels by `event_id`; events without a label only
 * contribute to the sample, emitted and abstention counts. Precision and
 * recall treat an event as a prediction for a profile only when it was not
 * abstained, matching how `emitted_count` and calibration correctness are
 * defined.
 *
 * @param events - Events to evaluate.
 * @param labels - Ground-truth labels; when an id repeats, the last label wins.
 * @param bucketCount - Number of calibration buckets. Floored and clamped to at
 *   least 1; non-finite values fall back to the default of 5.
 * @returns Aggregate counts, per-profile precision/recall, calibration buckets
 *   and economic sanity metrics.
 */
export const evaluateSmartMoneyEvents = (
  events: SmartMoneyEvent[],
  labels: SmartMoneyLabel[],
  bucketCount = 5
): SmartMoneyEvaluationReport => {
  const labelsById = new Map(labels.map((label) => [label.event_id, label]));
  const labeledEvents = events.flatMap((event) => {
    const label = labelsById.get(event.event_id);
    return label ? [{ event, label }] : [];
  });

  const emittedCount = events.filter((event) => !event.abstained && event.primary_profile_id).length;
  const abstainedCount = events.filter((event) => event.abstained).length;

  const profilePrecision: SmartMoneyEvaluationReport["profile_precision"] = {};
  const profileRecall: SmartMoneyEvaluationReport["profile_recall"] = {};
  for (const profile of PROFILES) {
    // An abstained event made no call, so it must not count as a prediction.
    const predicted = labeledEvents.filter(
      ({ event }) => !event.abstained && event.primary_profile_id === profile
    );
    const actualCount = labeledEvents.filter(({ label }) => label.profile_id === profile).length;
    const truePositive = predicted.filter(({ label }) => label.profile_id === profile).length;
    profilePrecision[profile] = predicted.length > 0 ? round(truePositive / predicted.length) : null;
    profileRecall[profile] = actualCount > 0 ? round(truePositive / actualCount) : null;
  }

  // NaN/Infinity would otherwise yield an empty or unallocatable bucket array.
  const safeBucketCount = Number.isFinite(bucketCount) ? Math.max(1, Math.floor(bucketCount)) : 5;

  return {
    sample_count: events.length,
    labeled_count: labeledEvents.length,
    emitted_count: emittedCount,
    abstained_count: abstainedCount,
    abstention_rate: events.length > 0 ? round(abstainedCount / events.length) : 0,
    profile_precision: profilePrecision,
    profile_recall: profileRecall,
    calibration: buildCalibration(labeledEvents, safeBucketCount),
    economic_sanity: buildEconomicSanity(labeledEvents)
  };
};
/**
 * Builds an equal-width calibration table over the primary-profile probability.
 *
 * Every labeled event lands in exactly one bucket. Events with no primary
 * profile score (or a non-finite probability) are treated as probability 0,
 * and out-of-range probabilities are clamped into the first/last bucket
 * instead of indexing outside the table. Abstained events are still counted
 * in their bucket but never as correct.
 *
 * @param entries - Labeled events to bucket.
 * @param bucketCount - Number of buckets; expected to be a positive integer.
 * @returns One `CalibrationBucket` per bucket, in ascending probability order.
 */
const buildCalibration = (
  entries: Array<{ event: SmartMoneyEvent; label: SmartMoneyLabel }>,
  bucketCount: number
): CalibrationBucket[] => {
  const buckets = Array.from({ length: bucketCount }, (_, index) => ({
    min_probability: round(index / bucketCount),
    max_probability: round((index + 1) / bucketCount),
    probabilities: [] as number[],
    correct: 0
  }));

  for (const { event, label } of entries) {
    const rawProbability =
      event.profile_scores.find((entry) => entry.profile_id === event.primary_profile_id)?.probability ?? 0;
    const probability = Number.isFinite(rawProbability) ? rawProbability : 0;
    // Clamp on both sides: probability 1 belongs to the last bucket, negatives to the first.
    const index = Math.max(0, Math.min(bucketCount - 1, Math.floor(probability * bucketCount)));
    const bucket = buckets[index];
    if (!bucket) {
      // Only reachable when the table is empty (invalid bucketCount).
      continue;
    }
    bucket.probabilities.push(probability);
    if (!event.abstained && event.primary_profile_id === label.profile_id) {
      bucket.correct += 1;
    }
  }

  return buckets.map((bucket) => {
    const count = bucket.probabilities.length;
    const total = bucket.probabilities.reduce((sum, value) => sum + value, 0);
    return {
      min_probability: bucket.min_probability,
      max_probability: bucket.max_probability,
      count,
      average_probability: count > 0 ? round(total / count) : 0,
      accuracy: count > 0 ? round(bucket.correct / count) : null
    };
  });
};
/**
 * Checks whether predicted directions agree with realized returns.
 *
 * Only entries with a bullish/bearish primary direction and a finite
 * `realized_return_bps` are considered. Each contributes
 * `sign * realized_return_bps`; a strictly positive value counts as a hit.
 *
 * @param entries - Labeled events to inspect.
 * @returns The directional sample size, hit rate and mean signed return
 *   (rounded to 2 decimals); both rates are `null` when no entry qualifies.
 */
const buildEconomicSanity = (
  entries: Array<{ event: SmartMoneyEvent; label: SmartMoneyLabel }>
): SmartMoneyEvaluationReport["economic_sanity"] => {
  const signedReturns: number[] = [];
  for (const { event, label } of entries) {
    const sign = directionalSign(event.primary_direction);
    const realized = label.realized_return_bps;
    if (sign === 0 || typeof realized !== "number" || !Number.isFinite(realized)) {
      continue;
    }
    signedReturns.push(sign * realized);
  }

  const sampleSize = signedReturns.length;
  if (sampleSize === 0) {
    return {
      directional_count: 0,
      direction_hit_rate: null,
      average_signed_return_bps: null
    };
  }

  let hits = 0;
  let total = 0;
  for (const value of signedReturns) {
    if (value > 0) {
      hits += 1;
    }
    total += value;
  }

  return {
    directional_count: sampleSize,
    direction_hit_rate: round(hits / sampleSize),
    average_signed_return_bps: round(total / sampleSize, 2)
  };
};

View file

@ -0,0 +1,153 @@
import { describe, expect, it } from "bun:test";
import { buildSmartMoneyEventFromPacket } from "../src/parent-events";
import {
buildSmartMoneyEventsForReplay,
compareSmartMoneyReplayOutputs,
evaluateSmartMoneyEvents
} from "../src/smart-money-evaluation";
import { buildFlowPacket } from "./helpers";
// Fixture: SPY call packet with high NBBO coverage and a strongly buy-side aggressive ratio.
// The evaluation test labels the resulting event as "institutional_directional" / bullish.
// Its seq (2) and timestamp place it second in replay order.
const institutionalPacket = buildFlowPacket({
id: "flowpacket:eval-institutional",
seq: 2,
source_ts: Date.parse("2025-01-15T15:00:01Z"),
features: {
option_contract_id: "SPY-2025-02-21-450-C",
underlying_id: "SPY",
count: 8,
window_ms: 450,
total_size: 2200,
total_premium: 180_000,
total_notional: 18_000_000,
nbbo_coverage_ratio: 0.92,
nbbo_aggressive_ratio: 0.82,
nbbo_aggressive_buy_ratio: 0.78,
nbbo_aggressive_sell_ratio: 0.04,
nbbo_inside_ratio: 0.08,
underlying_mid: 448
}
});
// Fixture: single-print AAPL call packet paired with an earnings entry in `calendarOptions`.
// The evaluation test labels the resulting event as "event_driven" / bullish.
// It has the earliest timestamp and seq (1), so it sorts first in replay order.
const eventDrivenPacket = buildFlowPacket({
id: "flowpacket:eval-event-driven",
seq: 1,
source_ts: Date.parse("2025-01-15T15:00:00Z"),
features: {
option_contract_id: "AAPL-2025-02-07-225-C",
underlying_id: "AAPL",
count: 1,
window_ms: 450,
total_size: 1800,
total_premium: 160_000,
total_notional: 16_000_000,
nbbo_coverage_ratio: 0.5,
nbbo_aggressive_ratio: 0.4,
nbbo_aggressive_buy_ratio: 0.4,
nbbo_aggressive_sell_ratio: 0.1,
nbbo_inside_ratio: 0.08,
underlying_mid: 224
}
});
// Fixture: SPY packet with very low NBBO coverage (0.1) and NBBO missing for all 8 prints.
// The evaluation test labels it with `profile_id: null`.
// NOTE(review): presumably this is the event expected to abstain — confirm against parent-events scoring.
const stalePacket = buildFlowPacket({
id: "flowpacket:eval-stale",
seq: 3,
source_ts: Date.parse("2025-01-15T15:00:02Z"),
features: {
option_contract_id: "SPY-2025-02-21-450-C",
underlying_id: "SPY",
count: 8,
window_ms: 450,
total_size: 2200,
total_premium: 180_000,
nbbo_coverage_ratio: 0.1,
nbbo_missing_count: 8
}
});
// Per-packet build options: only the event-driven packet gets an event-calendar match.
const calendarOptionEntries = {
  "flowpacket:eval-event-driven": {
    eventCalendarMatch: {
      underlying_id: "AAPL",
      event_ts: Date.parse("2025-01-31T21:00:00Z"),
      event_kind: "earnings",
      announced_ts: Date.parse("2024-12-20T21:00:00Z"),
      days_to_event: 16.25
    }
  }
};

// Exposed under the replay helper's record type so the suite can index it with
// any packet id (`calendarOptions[packet.id]`) without an implicit-any error
// under strict type checking; ids without an entry resolve to `undefined`.
const calendarOptions: NonNullable<Parameters<typeof buildSmartMoneyEventsForReplay>[1]> =
  calendarOptionEntries;
// Suite covering the replay consistency comparison and the evaluation report helpers.
describe("smart money evaluation utilities", () => {
// Live path: events built one by one in arrival order. Batch path: the same packets
// supplied in a different order and sorted by the replay helper. Both must agree.
it("compares replay-style live and batch outputs with stable event signatures", () => {
const liveEvents = [institutionalPacket, eventDrivenPacket, stalePacket].map((packet) =>
buildSmartMoneyEventFromPacket(packet, calendarOptions[packet.id])
);
const batchEvents = buildSmartMoneyEventsForReplay(
[stalePacket, institutionalPacket, eventDrivenPacket],
calendarOptions
);
const report = compareSmartMoneyReplayOutputs(liveEvents, batchEvents);
expect(report.consistent).toBe(true);
expect(report.live_count).toBe(3);
expect(report.batch_count).toBe(3);
expect(report.matched_count).toBe(3);
expect(report.mismatches).toEqual([]);
});
// Forces a divergence by overriding the primary profile on a copy of the live event.
it("reports signature mismatches when live and batch scoring diverge", () => {
const liveEvent = buildSmartMoneyEventFromPacket(institutionalPacket);
const batchEvent = {
...liveEvent,
primary_profile_id: "retail_whale" as const
};
const report = compareSmartMoneyReplayOutputs([liveEvent], [batchEvent]);
expect(report.consistent).toBe(false);
expect(report.mismatches).toHaveLength(1);
expect(report.mismatches[0]?.field).toBe("signature");
});
// End-to-end check of the evaluation report using one label per fixture event.
it("summarizes precision, recall, calibration, abstention rate, and economic sanity", () => {
const events = buildSmartMoneyEventsForReplay(
[institutionalPacket, eventDrivenPacket, stalePacket],
calendarOptions
);
const report = evaluateSmartMoneyEvents(
events,
[
{
event_id: "smartmoney:single_leg_event:flowpacket:eval-institutional",
profile_id: "institutional_directional",
direction: "bullish",
realized_return_bps: 42
},
{
event_id: "smartmoney:single_leg_event:flowpacket:eval-event-driven",
profile_id: "event_driven",
direction: "bullish",
realized_return_bps: 18
},
{
event_id: "smartmoney:single_leg_event:flowpacket:eval-stale",
profile_id: null,
realized_return_bps: -12
}
],
4
);
expect(report.sample_count).toBe(3);
expect(report.labeled_count).toBe(3);
expect(report.emitted_count).toBe(2);
expect(report.abstained_count).toBe(1);
expect(report.abstention_rate).toBeCloseTo(1 / 3);
expect(report.profile_precision.institutional_directional).toBe(1);
expect(report.profile_recall.event_driven).toBe(1);
// Four buckets were requested, and every labeled event must land in exactly one of them.
expect(report.calibration).toHaveLength(4);
expect(report.calibration.reduce((sum, bucket) => sum + bucket.count, 0)).toBe(3);
// Two directional events with returns 42 and 18 bps: both hits, mean signed return 30.
expect(report.economic_sanity.directional_count).toBe(2);
expect(report.economic_sanity.direction_hit_rate).toBe(1);
expect(report.economic_sanity.average_signed_return_bps).toBe(30);
});
});