From 19a499d33c7d15876ddcde749262468ba433ef7c Mon Sep 17 00:00:00 2001 From: dirtydishes Date: Tue, 5 May 2026 02:08:16 -0400 Subject: [PATCH] Add smart money replay evaluation harness --- .beads/issues.jsonl | 2 +- SMART_MONEY_REBUILD_PLAN.md | 4 +- .../compute/src/smart-money-evaluation.ts | 242 ++++++++++++++++++ .../tests/smart-money-evaluation.test.ts | 153 +++++++++++ 4 files changed, 398 insertions(+), 3 deletions(-) create mode 100644 services/compute/src/smart-money-evaluation.ts create mode 100644 services/compute/tests/smart-money-evaluation.test.ts diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index c21246b..74fca47 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -5,7 +5,7 @@ {"_type":"issue","id":"islandflow-0v6","title":"Fix tape freshness, NBBO coverage, pause controls, and filter popup","description":"Implement the tape fixes requested for synthetic options notional sizing, strict live freshness, live-mode pause/resume behavior, stronger NBBO snapshot coverage, and moving flow filters behind a popup. Includes server-side live cache changes, web terminal state/UI changes, and tests for synthetic pricing, live snapshot freshness/NBBO retention, and live pause/filter interactions.","status":"closed","priority":1,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-04-28T21:02:52Z","created_by":"dirtydishes","updated_at":"2026-04-28T21:13:38Z","started_at":"2026-04-28T21:02:57Z","closed_at":"2026-04-28T21:13:38Z","close_reason":"Completed","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-e4r","title":"Implement smart-money flow filtering and synthetic firehose modes","description":"Implement the approved multi-surface plan for named synthetic market profiles, options raw-vs-signal filtering, live/API filter contracts, Tape page client-side flow filters, firehose-readiness improvements, tests, and README updates.","status":"closed","priority":1,"issue_type":"feature","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-04-28T20:10:49Z","created_by":"dirtydishes","updated_at":"2026-04-28T20:29:29Z","started_at":"2026-04-28T20:10:53Z","closed_at":"2026-04-28T20:29:29Z","close_reason":"Implemented synthetic market profiles, options signal-path filtering, signal-aware API/replay contracts, Tape page filters, tests, and README updates. Follow-up tracked in islandflow-biq.","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-b6d","title":"Finish smart-money event-calendar enrichment","description":"Finish the smart-money event-calendar provider layer in services/refdata and connect days-to-event / expiry-after-event enrichment into compute using timestamp-available data only.","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-04T21:35:26Z","created_by":"dirtydishes","updated_at":"2026-05-04T23:21:09Z","started_at":"2026-05-04T23:18:29Z","closed_at":"2026-05-04T23:21:09Z","close_reason":"Completed event-calendar provider and compute enrichment","dependency_count":0,"dependent_count":0,"comment_count":0} -{"_type":"issue","id":"islandflow-e60","title":"Add smart-money replay evaluation harness","description":"Add replay-style live-vs-batch consistency tests plus evaluation utilities for parent-event precision/recall, calibration, abstention rate, and economic sanity checks.","status":"open","priority":2,"issue_type":"task","owner":"dishes@dpdrm.com","created_at":"2026-05-04T21:35:25Z","created_by":"dirtydishes","updated_at":"2026-05-04T21:35:25Z","dependency_count":0,"dependent_count":0,"comment_count":0} +{"_type":"issue","id":"islandflow-e60","title":"Add smart-money replay evaluation harness","description":"Add replay-style live-vs-batch consistency tests plus evaluation utilities for parent-event precision/recall, calibration, abstention rate, and economic sanity checks.","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-04T21:35:25Z","created_by":"dirtydishes","updated_at":"2026-05-05T06:08:08Z","started_at":"2026-05-05T06:07:22Z","closed_at":"2026-05-05T06:08:08Z","close_reason":"Completed smart-money replay consistency harness and evaluation utilities.","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-020","title":"Rebuild synthetic smart-money scenarios","description":"Rework services/ingest-options synthetic generation around labeled parent-event templates for the six core smart-money profiles plus neutral background noise, with deterministic test/demo modes and hidden labels for tests.","status":"closed","priority":2,"issue_type":"task","assignee":"dirtydishes","owner":"dishes@dpdrm.com","created_at":"2026-05-04T21:35:24Z","created_by":"dirtydishes","updated_at":"2026-05-05T05:29:27Z","started_at":"2026-05-05T05:25:39Z","closed_at":"2026-05-05T05:29:27Z","close_reason":"Completed Phase 5 synthetic smart-money scenario rebuild","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-zs0","title":"Migrate terminal UI to smart-money profiles","description":"Migrate apps/web terminal rendering to consume SmartMoneyEvent directly: primary profile, probability ladder, reason codes, and suppression/abstention state, while preserving legacy alert/classifier displays during the bridge.","status":"closed","priority":2,"issue_type":"task","owner":"dishes@dpdrm.com","created_at":"2026-05-04T21:35:23Z","created_by":"dirtydishes","updated_at":"2026-05-05T05:39:58Z","closed_at":"2026-05-05T05:39:58Z","close_reason":"Completed terminal smart-money profile migration","dependency_count":0,"dependent_count":0,"comment_count":0} {"_type":"issue","id":"islandflow-igk","title":"Add plan mode","description":"Implement a user-facing plan mode in the application so users can switch into planning before taking action. Scope to be clarified from existing app patterns.","status":"closed","priority":2,"issue_type":"feature","owner":"dishes@dpdrm.com","created_at":"2026-05-04T04:22:37Z","created_by":"dirtydishes","updated_at":"2026-05-04T04:26:18Z","started_at":"2026-05-04T04:22:40Z","closed_at":"2026-05-04T04:26:18Z","close_reason":"Implemented as a global pi extension toggled with Shift+P","dependency_count":0,"dependent_count":0,"comment_count":0} diff --git a/SMART_MONEY_REBUILD_PLAN.md b/SMART_MONEY_REBUILD_PLAN.md index ae250d8..0f41ba1 100644 --- a/SMART_MONEY_REBUILD_PLAN.md +++ b/SMART_MONEY_REBUILD_PLAN.md @@ -52,8 +52,8 @@ Acceptance: old classifier and alert endpoints still work while `/flow/smart-mon ### Phase 7: Evaluation and Replay - [x] Add deterministic unit tests for parent-event scoring and storage. -- [ ] Add replay-style live-vs-batch consistency tests. -- [ ] Add evaluation utilities for calibration, abstention rate, and economic sanity checks. +- [x] Add replay-style live-vs-batch consistency tests. +- [x] Add evaluation utilities for calibration, abstention rate, and economic sanity checks. ## Migration Notes diff --git a/services/compute/src/smart-money-evaluation.ts b/services/compute/src/smart-money-evaluation.ts new file mode 100644 index 0000000..f2c4271 --- /dev/null +++ b/services/compute/src/smart-money-evaluation.ts @@ -0,0 +1,242 @@ +import type { FlowPacket, SmartMoneyDirection, SmartMoneyEvent, SmartMoneyProfileId } from "@islandflow/types"; +import { buildSmartMoneyEventFromPacket, type SmartMoneyParentEventOptions } from "./parent-events"; + +export type SmartMoneyLabel = { + event_id: string; + profile_id: SmartMoneyProfileId | null; + direction?: Exclude; + realized_return_bps?: number; +}; + +export type ReplayConsistencyMismatch = { + event_id: string; + field: "missing_live" | "missing_batch" | "signature"; + live?: SmartMoneyEventSignature; + batch?: SmartMoneyEventSignature; +}; + +export type ReplayConsistencyReport = { + live_count: number; + batch_count: number; + matched_count: number; + mismatches: ReplayConsistencyMismatch[]; + consistent: boolean; +}; + +export type SmartMoneyEventSignature = { + event_id: string; + primary_profile_id: SmartMoneyProfileId | null; + primary_direction: SmartMoneyDirection; + abstained: boolean; + suppressed_reasons: string[]; + profile_scores: Array<{ + profile_id: SmartMoneyProfileId; + probability: number; + confidence_band: SmartMoneyEvent["profile_scores"][number]["confidence_band"]; + direction: SmartMoneyDirection; + }>; +}; + +export type CalibrationBucket = { + min_probability: number; + max_probability: number; + count: number; + average_probability: number; + accuracy: number | null; +}; + +export type SmartMoneyEvaluationReport = { + sample_count: number; + labeled_count: number; + emitted_count: number; + abstained_count: number; + abstention_rate: number; + profile_precision: Partial>; + profile_recall: Partial>; + calibration: CalibrationBucket[]; + economic_sanity: { + directional_count: number; + direction_hit_rate: number | null; + average_signed_return_bps: number | null; + }; +}; + +const PROFILES: SmartMoneyProfileId[] = [ + "institutional_directional", + "retail_whale", + "event_driven", + "vol_seller", + "arbitrage", + "hedge_reactive" +]; + +const directionalSign = (direction: SmartMoneyDirection): number => { + if (direction === "bullish") { + return 1; + } + if (direction === "bearish") { + return -1; + } + return 0; +}; + +const round = (value: number, digits = 4): number => { + if (!Number.isFinite(value)) { + return 0; + } + return Number(value.toFixed(digits)); +}; + +export const smartMoneyEventSignature = (event: SmartMoneyEvent): SmartMoneyEventSignature => ({ + event_id: event.event_id, + primary_profile_id: event.primary_profile_id, + primary_direction: event.primary_direction, + abstained: event.abstained, + suppressed_reasons: [...event.suppressed_reasons].sort(), + profile_scores: event.profile_scores.map((entry) => ({ + profile_id: entry.profile_id, + probability: round(entry.probability, 6), + confidence_band: entry.confidence_band, + direction: entry.direction + })) +}); + +export const buildSmartMoneyEventsForReplay = ( + packets: FlowPacket[], + optionsByPacketId: Record = {} +): SmartMoneyEvent[] => { + return packets + .slice() + .sort((a, b) => a.source_ts - b.source_ts || a.seq - b.seq || a.id.localeCompare(b.id)) + .map((packet) => buildSmartMoneyEventFromPacket(packet, optionsByPacketId[packet.id])); +}; + +export const compareSmartMoneyReplayOutputs = ( + liveEvents: SmartMoneyEvent[], + batchEvents: SmartMoneyEvent[] +): ReplayConsistencyReport => { + const liveById = new Map(liveEvents.map((event) => [event.event_id, smartMoneyEventSignature(event)])); + const batchById = new Map(batchEvents.map((event) => [event.event_id, smartMoneyEventSignature(event)])); + const ids = [...new Set([...liveById.keys(), ...batchById.keys()])].sort(); + const mismatches: ReplayConsistencyMismatch[] = []; + + for (const id of ids) { + const live = liveById.get(id); + const batch = batchById.get(id); + if (!live) { + mismatches.push({ event_id: id, field: "missing_live", batch }); + continue; + } + if (!batch) { + mismatches.push({ event_id: id, field: "missing_batch", live }); + continue; + } + if (JSON.stringify(live) !== JSON.stringify(batch)) { + mismatches.push({ event_id: id, field: "signature", live, batch }); + } + } + + return { + live_count: liveEvents.length, + batch_count: batchEvents.length, + matched_count: ids.length - mismatches.length, + mismatches, + consistent: mismatches.length === 0 + }; +}; + +export const evaluateSmartMoneyEvents = ( + events: SmartMoneyEvent[], + labels: SmartMoneyLabel[], + bucketCount = 5 +): SmartMoneyEvaluationReport => { + const labelsById = new Map(labels.map((label) => [label.event_id, label])); + const labeledEvents = events + .map((event) => ({ event, label: labelsById.get(event.event_id) })) + .filter((entry): entry is { event: SmartMoneyEvent; label: SmartMoneyLabel } => Boolean(entry.label)); + + const emitted = events.filter((event) => !event.abstained && event.primary_profile_id); + const profilePrecision: SmartMoneyEvaluationReport["profile_precision"] = {}; + const profileRecall: SmartMoneyEvaluationReport["profile_recall"] = {}; + + for (const profile of PROFILES) { + const predicted = labeledEvents.filter((entry) => entry.event.primary_profile_id === profile); + const actual = labeledEvents.filter((entry) => entry.label.profile_id === profile); + const truePositive = predicted.filter((entry) => entry.label.profile_id === profile).length; + profilePrecision[profile] = predicted.length > 0 ? round(truePositive / predicted.length) : null; + profileRecall[profile] = actual.length > 0 ? round(truePositive / actual.length) : null; + } + + const calibration = buildCalibration(labeledEvents, Math.max(1, Math.floor(bucketCount))); + const economic = buildEconomicSanity(labeledEvents); + + return { + sample_count: events.length, + labeled_count: labeledEvents.length, + emitted_count: emitted.length, + abstained_count: events.filter((event) => event.abstained).length, + abstention_rate: events.length > 0 ? round(events.filter((event) => event.abstained).length / events.length) : 0, + profile_precision: profilePrecision, + profile_recall: profileRecall, + calibration, + economic_sanity: economic + }; +}; + +const buildCalibration = ( + entries: Array<{ event: SmartMoneyEvent; label: SmartMoneyLabel }>, + bucketCount: number +): CalibrationBucket[] => { + const buckets = Array.from({ length: bucketCount }, (_, index) => ({ + min_probability: round(index / bucketCount), + max_probability: round((index + 1) / bucketCount), + probabilities: [] as number[], + correct: 0 + })); + + for (const { event, label } of entries) { + const probability = event.profile_scores.find((entry) => entry.profile_id === event.primary_profile_id)?.probability ?? 0; + const index = Math.min(bucketCount - 1, Math.floor(probability * bucketCount)); + buckets[index].probabilities.push(probability); + if (!event.abstained && event.primary_profile_id === label.profile_id) { + buckets[index].correct += 1; + } + } + + return buckets.map((bucket) => ({ + min_probability: bucket.min_probability, + max_probability: bucket.max_probability, + count: bucket.probabilities.length, + average_probability: + bucket.probabilities.length > 0 + ? round(bucket.probabilities.reduce((sum, value) => sum + value, 0) / bucket.probabilities.length) + : 0, + accuracy: bucket.probabilities.length > 0 ? round(bucket.correct / bucket.probabilities.length) : null + })); +}; + +const buildEconomicSanity = ( + entries: Array<{ event: SmartMoneyEvent; label: SmartMoneyLabel }> +): SmartMoneyEvaluationReport["economic_sanity"] => { + const directional = entries + .map(({ event, label }) => ({ + sign: directionalSign(event.primary_direction), + realized: label.realized_return_bps + })) + .filter((entry): entry is { sign: number; realized: number } => entry.sign !== 0 && Number.isFinite(entry.realized)); + + if (directional.length === 0) { + return { + directional_count: 0, + direction_hit_rate: null, + average_signed_return_bps: null + }; + } + + const signedReturns = directional.map((entry) => entry.sign * entry.realized); + return { + directional_count: directional.length, + direction_hit_rate: round(signedReturns.filter((value) => value > 0).length / directional.length), + average_signed_return_bps: round(signedReturns.reduce((sum, value) => sum + value, 0) / signedReturns.length, 2) + }; +}; diff --git a/services/compute/tests/smart-money-evaluation.test.ts b/services/compute/tests/smart-money-evaluation.test.ts new file mode 100644 index 0000000..fac7ff7 --- /dev/null +++ b/services/compute/tests/smart-money-evaluation.test.ts @@ -0,0 +1,153 @@ +import { describe, expect, it } from "bun:test"; +import { buildSmartMoneyEventFromPacket } from "../src/parent-events"; +import { + buildSmartMoneyEventsForReplay, + compareSmartMoneyReplayOutputs, + evaluateSmartMoneyEvents +} from "../src/smart-money-evaluation"; +import { buildFlowPacket } from "./helpers"; + +const institutionalPacket = buildFlowPacket({ + id: "flowpacket:eval-institutional", + seq: 2, + source_ts: Date.parse("2025-01-15T15:00:01Z"), + features: { + option_contract_id: "SPY-2025-02-21-450-C", + underlying_id: "SPY", + count: 8, + window_ms: 450, + total_size: 2200, + total_premium: 180_000, + total_notional: 18_000_000, + nbbo_coverage_ratio: 0.92, + nbbo_aggressive_ratio: 0.82, + nbbo_aggressive_buy_ratio: 0.78, + nbbo_aggressive_sell_ratio: 0.04, + nbbo_inside_ratio: 0.08, + underlying_mid: 448 + } +}); + +const eventDrivenPacket = buildFlowPacket({ + id: "flowpacket:eval-event-driven", + seq: 1, + source_ts: Date.parse("2025-01-15T15:00:00Z"), + features: { + option_contract_id: "AAPL-2025-02-07-225-C", + underlying_id: "AAPL", + count: 1, + window_ms: 450, + total_size: 1800, + total_premium: 160_000, + total_notional: 16_000_000, + nbbo_coverage_ratio: 0.5, + nbbo_aggressive_ratio: 0.4, + nbbo_aggressive_buy_ratio: 0.4, + nbbo_aggressive_sell_ratio: 0.1, + nbbo_inside_ratio: 0.08, + underlying_mid: 224 + } +}); + +const stalePacket = buildFlowPacket({ + id: "flowpacket:eval-stale", + seq: 3, + source_ts: Date.parse("2025-01-15T15:00:02Z"), + features: { + option_contract_id: "SPY-2025-02-21-450-C", + underlying_id: "SPY", + count: 8, + window_ms: 450, + total_size: 2200, + total_premium: 180_000, + nbbo_coverage_ratio: 0.1, + nbbo_missing_count: 8 + } +}); + +const calendarOptions = { + "flowpacket:eval-event-driven": { + eventCalendarMatch: { + underlying_id: "AAPL", + event_ts: Date.parse("2025-01-31T21:00:00Z"), + event_kind: "earnings", + announced_ts: Date.parse("2024-12-20T21:00:00Z"), + days_to_event: 16.25 + } + } +}; + +describe("smart money evaluation utilities", () => { + it("compares replay-style live and batch outputs with stable event signatures", () => { + const liveEvents = [institutionalPacket, eventDrivenPacket, stalePacket].map((packet) => + buildSmartMoneyEventFromPacket(packet, calendarOptions[packet.id]) + ); + const batchEvents = buildSmartMoneyEventsForReplay( + [stalePacket, institutionalPacket, eventDrivenPacket], + calendarOptions + ); + + const report = compareSmartMoneyReplayOutputs(liveEvents, batchEvents); + expect(report.consistent).toBe(true); + expect(report.live_count).toBe(3); + expect(report.batch_count).toBe(3); + expect(report.matched_count).toBe(3); + expect(report.mismatches).toEqual([]); + }); + + it("reports signature mismatches when live and batch scoring diverge", () => { + const liveEvent = buildSmartMoneyEventFromPacket(institutionalPacket); + const batchEvent = { + ...liveEvent, + primary_profile_id: "retail_whale" as const + }; + + const report = compareSmartMoneyReplayOutputs([liveEvent], [batchEvent]); + expect(report.consistent).toBe(false); + expect(report.mismatches).toHaveLength(1); + expect(report.mismatches[0]?.field).toBe("signature"); + }); + + it("summarizes precision, recall, calibration, abstention rate, and economic sanity", () => { + const events = buildSmartMoneyEventsForReplay( + [institutionalPacket, eventDrivenPacket, stalePacket], + calendarOptions + ); + const report = evaluateSmartMoneyEvents( + events, + [ + { + event_id: "smartmoney:single_leg_event:flowpacket:eval-institutional", + profile_id: "institutional_directional", + direction: "bullish", + realized_return_bps: 42 + }, + { + event_id: "smartmoney:single_leg_event:flowpacket:eval-event-driven", + profile_id: "event_driven", + direction: "bullish", + realized_return_bps: 18 + }, + { + event_id: "smartmoney:single_leg_event:flowpacket:eval-stale", + profile_id: null, + realized_return_bps: -12 + } + ], + 4 + ); + + expect(report.sample_count).toBe(3); + expect(report.labeled_count).toBe(3); + expect(report.emitted_count).toBe(2); + expect(report.abstained_count).toBe(1); + expect(report.abstention_rate).toBeCloseTo(1 / 3); + expect(report.profile_precision.institutional_directional).toBe(1); + expect(report.profile_recall.event_driven).toBe(1); + expect(report.calibration).toHaveLength(4); + expect(report.calibration.reduce((sum, bucket) => sum + bucket.count, 0)).toBe(3); + expect(report.economic_sanity.directional_count).toBe(2); + expect(report.economic_sanity.direction_hit_rate).toBe(1); + expect(report.economic_sanity.average_signed_return_bps).toBe(30); + }); +});