Code · src/agents/baseline/normalization/index.ts

src/agents/baseline/normalization/index.ts 12,775 bytes · typescript
/**
 * Normalization agent — runtime entry point.
 *
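 * Walks the matrix-defined runbook (Std 6). Steps 1, 5, 6, 7 are
 * deterministic. Step 2 (entities) is a seeded/learned lookup. Step 3
 * (terminology) falls back to the LLM for novel labels, and step 4
 * (units) is an identity pass when units match and falls back to the
 * LLM when they differ (Std 3 — judgment is explicit).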
 */

import {
  type AgentResult,
  type HITLEscalation,
  LOW_CONFIDENCE_THRESHOLD,
  makeConfidence,
} from '../../../standards.js';
import type {
  ExecutionContext,
  FailureObject,
  Handoff,
  JobRequest,
  Lineage,
  UnresolvedIssue,
} from '../../../types.js';
import { nowIso } from '../../../types.js';
import {
  AGENT_NAME,
  AGENT_VERSION,
  normalizationContract,
} from './matrix.js';
import {
  type NormalizationInput,
  type NormalizationOutput,
  type NormalizedRecord,
  type LearnedRule,
  normalizationInputSchema,
} from './schema.js';
import { LearnedMappingStore, normalizeKey } from './storage.js';
import { convertUnit, mapTerminology } from './llm.js';

export { normalizationContract } from './matrix.js';
export type { NormalizationOutput } from './schema.js';

/**
 * Out-of-band context passed to `runNormalization` alongside the raw input.
 *
 * `runNormalization` reads `jobRequest.entities`, `jobRequest.targetMetrics`
 * and the optional `jobRequest.seedMappings` to seed its lookups, and uses
 * `upstreamLineage` for failure objects, escalations, and as the basis of
 * the outgoing handoff lineage.
 */
export interface NormalizationSideContext {
  /** Job definition supplying entities (with aliases), target metrics and seed mappings. */
  readonly jobRequest: JobRequest;
  /** Lineage of the upstream payload whose values are being normalized. */
  readonly upstreamLineage: Lineage;
}

/**
 * Record one trace event for this agent.
 *
 * The event is appended to `ctx.trace` (stamped with the current time) and
 * the same information is echoed to the console as a single formatted line.
 *
 * @param ctx - Execution context whose `trace` array receives the event.
 * @param standard - Number of the standard the step relates to.
 * @param step - Name of the runbook step being traced.
 * @param detail - Free-form description of what happened.
 */
function trace(ctx: ExecutionContext, standard: number, step: string, detail: string): void {
  const at = nowIso();
  ctx.trace.push({ agent: AGENT_NAME, standard, step, detail, at });

  const consoleLine = `  [${AGENT_NAME}][Std ${standard}] ${step} — ${detail}`;
  // eslint-disable-next-line no-console
  console.log(consoleLine);
}

/**
 * Build a `FailureObject` attributed to this agent.
 *
 * Agent name/version come from the matrix constants; retry count and
 * recursion depth are copied from the execution context; the occurrence
 * time is taken at call time.
 *
 * @param ctx - Execution context supplying `retries` and `recursionDepth`.
 * @param category - Failure category to report.
 * @param reason - Human-readable explanation of the failure.
 * @param context - Arbitrary diagnostic payload attached to the failure.
 * @param lineage - Lineage to attach to the failure.
 * @returns The assembled failure object.
 */
function failure(
  ctx: ExecutionContext,
  category: FailureObject['category'],
  reason: string,
  context: Record<string, unknown>,
  lineage: Lineage,
): FailureObject {
  const { retries: attempts, recursionDepth } = ctx;
  const result: FailureObject = {
    agent: AGENT_NAME,
    agentVersion: AGENT_VERSION,
    category,
    reason,
    context,
    lineage,
    attempts,
    recursionDepth,
    occurredAt: nowIso(),
  };
  return result;
}

/**
 * Run the normalization agent over one batch of extracted values.
 *
 * Flow, as implemented below:
 *  1. Validate `rawInput` against `normalizationInputSchema`; on failure,
 *     return an `invalid-input` failure before the mapping store is touched.
 *  2. Load the learned-mapping store and seed it from `side.jobRequest`
 *     (entities + aliases, target metric keys, optional seed mappings).
 *  3. Per value: resolve the entity (lookup only), resolve the metric
 *     (learned lookup → confident upstream `metricKey` → LLM), normalize the
 *     unit (identity pass, or LLM conversion when units differ), and emit a
 *     `NormalizedRecord`. Values whose metric cannot be mapped are recorded
 *     as unresolved issues and skipped.
 *  4. Group records by entity/metric/period and flag groups whose non-null
 *     values deviate from the group's first non-null value.
 *  5. Drain newly learned rules from the store and persist it.
 *  6. Derive an overall confidence; if no record was produced, return an
 *     `unresolved-normalization` failure.
 *  7. Build the handoff addressed to `baseline.resolution`.
 *
 * @param rawInput - Untrusted input; validated with `safeParse` before use.
 * @param side - Job request and upstream lineage accompanying the input.
 * @param ctx - Execution context; receives trace events and supplies
 *   `analysisId`, `retries` and `recursionDepth`.
 * @returns `{ ok: true, handoff, escalations }` when at least one record was
 *   produced, otherwise `{ ok: false, failure, escalations }`.
 */
export async function runNormalization(
  rawInput: unknown,
  side: NormalizationSideContext,
  ctx: ExecutionContext,
): Promise<AgentResult<NormalizationOutput>> {
  /* Step 1 (Std 2): validate-input. */
  const parsed = normalizationInputSchema.safeParse(rawInput);
  if (!parsed.success) {
    return {
      ok: false,
      escalations: [],
      failure: failure(ctx, 'invalid-input', 'Normalization input failed schema validation.', { issues: parsed.error.issues }, side.upstreamLineage),
    };
  }
  const input: NormalizationInput = parsed.data;
  trace(ctx, 2, normalizationContract.runbook[0]!.name,
    `received ${input.values.length} value(s); ${input.comparabilityNotes.length} comparability note(s)`);

  /* Load learned mappings + JobRequest-supplied entities (Std 10). */
  const store = new LearnedMappingStore();
  await store.load();

  /* Seed entity lookups from JobRequest.entities: each id and each alias
   * resolves to the entity id. */
  for (const e of side.jobRequest.entities) {
    store.learnEntity(e.id, e.id, 'seed', 1);
    for (const a of e.aliases) store.learnEntity(a, e.id, 'seed', 1);
  }
  /* Seed metric lookups from JobRequest.targetMetrics + seedMappings. */
  for (const m of side.jobRequest.targetMetrics) {
    store.learnMetric(m.key, m.key, 'seed', 1);
  }
  for (const sm of side.jobRequest.seedMappings ?? []) {
    store.learnMetric(sm.sourceLabel, sm.targetKey, 'seed', 1);
  }

  const records: NormalizedRecord[] = [];
  const unresolved: UnresolvedIssue[] = [];
  const escalations: HITLEscalation[] = [];
  /* NOTE(review): a single target unit is taken from the FIRST target metric
   * and applied to every record regardless of its canonical metric. Empty
   * string means "no target unit" and disables conversion below. Confirm
   * that jobs with several metrics are expected to share one unit. */
  const targetUnit = side.jobRequest.targetMetrics[0]?.unit ?? '';
  /* Vocabulary handed to the LLM is the same for every value — build it once
   * instead of once per LLM call. */
  const vocabulary = side.jobRequest.targetMetrics.map(m => ({ key: m.key, definition: m.definition }));

  /* Values are processed sequentially on purpose: a mapping learned for one
   * value is written to the store and can be hit by a later value's lookup. */
  for (const v of input.values) {
    const appliedRules: string[] = [];
    /* Copy so per-record flags added here never mutate the validated input. */
    const flags = [...v.flags];

    /* Step 2 (Std 5): normalize-entities — seeded/learned lookup only. An
     * entity with no hit passes through under its raw name; no LLM call is
     * made for entities here. */
    let canonicalEntity = v.entity;
    const eHit = store.lookupEntity(v.entity);
    if (eHit) {
      canonicalEntity = eHit.canonical;
      appliedRules.push(`entity-resolver:${eHit.by}->${eHit.canonical}`);
    }

    /* Step 3 (Std 3 + Std 5): normalize-terminology — learned lookup first; LLM for novel. */
    let canonicalMetric: string | null = null;
    let metricConfidence = 0.9;
    const mHit = store.lookupMetric(v.rawLabel);
    if (mHit) {
      canonicalMetric = mHit.canonical;
      /* Seeds are reported at 0.95; any other stored rule is floored at 0.8.
       * NOTE(review): the floor means a label first mapped by the LLM below
       * the low-confidence threshold is reported at >= 0.8 on later hits —
       * confirm this is intended. */
      metricConfidence = mHit.by === 'seed' ? 0.95 : Math.max(0.8, mHit.confidence);
      appliedRules.push(`taxonomy:${mHit.by}->${mHit.canonical}`);
    } else if (
      /* Std 6 — cost-appropriate execution: when Source/Extraction has
       * already confidently tied this rawLabel to a target metric
       * (the upstream selected rawLabel specifically to satisfy that
       * metricKey), the mapping is established. Record the inherited
       * determination and proceed — no LLM call. Std 7 — confidence
       * inherits because this agent introduces no new uncertainty. */
      v.metricKey
      && v.confidence >= LOW_CONFIDENCE_THRESHOLD
      && side.jobRequest.targetMetrics.some(m => m.key === v.metricKey)
    ) {
      canonicalMetric = v.metricKey;
      metricConfidence = v.confidence;
      appliedRules.push(`taxonomy:inherited-from-source-extraction->${v.metricKey}`);
      store.learnMetric(
        v.rawLabel,
        v.metricKey,
        'lookup',
        v.confidence,
        `Inherited from Source/Extraction (Std 6/7): rawLabel "${v.rawLabel}" was chosen specifically to satisfy target metric "${v.metricKey}" at upstream confidence ${v.confidence.toFixed(2)}.`,
      );
    } else {
      /* Std 3: judgment step — defer to LLM with citation. Reserved
       * for genuinely novel or ambiguous labels (no learned rule and
       * no confident upstream determination). */
      const r = await mapTerminology({
        rawLabel: v.rawLabel,
        entity: v.entity,
        vocabulary,
      });
      if (r.ok) {
        canonicalMetric = r.value.canonical;
        metricConfidence = r.value.confidence;
        appliedRules.push(`ai-with-citation:${r.value.rationale}`);
        /* Only persist a rule when the LLM actually chose a metric. */
        if (canonicalMetric) {
          store.learnMetric(v.rawLabel, canonicalMetric, 'ai', metricConfidence, r.value.rationale);
        }
      } else {
        /* Every LLM failure is reported as an unmapped term; the failure
         * category only decides whether a human escalation is raised. */
        unresolved.push({
          category: 'unmapped-term',
          detail: `could not map "${v.rawLabel}" for ${v.entity}: ${r.failure.reason}`,
          blocking: false,
          context: { failure: r.failure },
        });
        if (r.failure.category === 'needs-api-key') {
          escalations.push({
            agent: AGENT_NAME,
            reason: 'unresolved-mapping',
            failureContext: `no learned rule for "${v.rawLabel}" and LLM unavailable`,
            lineage: side.upstreamLineage,
            validation: {
              status: 'review',
              confidence: makeConfidence(0, 'LLM unavailable'),
              checks: [{ name: 'mapping-available', passed: false }],
            },
            recommendedReviewer: 'domain-expert',
            raisedAt: nowIso(),
          });
        }
        /* No metric, no record: skip the rest of the pipeline for this value. */
        continue;
      }
    }
    /* Reached when the mapping succeeded technically but yielded no metric
     * (e.g. the LLM answered without choosing one). */
    if (!canonicalMetric) {
      unresolved.push({
        category: 'unmapped-term',
        detail: `LLM declined to map "${v.rawLabel}"`,
        blocking: false,
      });
      continue;
    }

    /* Step 4 (Std 5): normalize-units. Deterministic identity-pass when units match;
     * defer to LLM with citation only when they differ. */
    let canonicalValue: number | null = v.value;
    let canonicalUnit: string = v.rawUnit ?? targetUnit;
    if (v.value !== null && v.rawUnit) {
      if (v.rawUnit === targetUnit || targetUnit === '') {
        canonicalUnit = v.rawUnit;
        appliedRules.push(`unit:identity:${v.rawUnit}`);
      } else {
        const conv = await convertUnit({ rawValue: v.value, rawUnit: v.rawUnit, targetUnit });
        if (conv.ok && conv.value.convertedValue !== null) {
          canonicalValue = conv.value.convertedValue;
          canonicalUnit = targetUnit;
          appliedRules.push(`unit:llm:${conv.value.rationale}`);
        } else {
          /* Conversion failed: keep the record but null its value so an
           * unconverted number is never reported under the raw unit as if
           * it were comparable. */
          flags.push('unit-conversion-failed');
          canonicalValue = null;
          canonicalUnit = v.rawUnit;
          unresolved.push({
            category: 'ontology-conflict',
            detail: `cannot convert ${v.rawUnit} to ${targetUnit}: ${conv.ok ? conv.value.rationale : conv.failure.reason}`,
            blocking: false,
          });
        }
      }
    } else if (v.value === null) {
      flags.push('value-missing');
    }

    /* Std 8 triggers: per-record flags. */
    if (metricConfidence < LOW_CONFIDENCE_THRESHOLD) flags.push('low-confidence-mapping');

    records.push({
      canonicalEntity,
      canonicalMetric,
      period: v.period,
      value: canonicalValue,
      canonicalUnit,
      rawEntity: v.entity,
      rawLabel: v.rawLabel,
      rawValue: v.value,
      rawUnit: v.rawUnit,
      sourceUrl: v.sourceUrl,
      capturedAt: v.capturedAt,
      appliedRules,
      confidence: metricConfidence,
      flags,
    });
  }

  /* Step 5 (Std 8): resolve-duplicates — detect contradictions. Records are
   * grouped by canonical entity / metric / period. */
  const byKey = new Map<string, NormalizedRecord[]>();
  for (const r of records) {
    const k = `${r.canonicalEntity}::${r.canonicalMetric}::${r.period}`;
    if (!byKey.has(k)) byKey.set(k, []);
    byKey.get(k)!.push(r);
  }
  for (const [key, group] of byKey) {
    if (group.length <= 1) continue;
    const nonNull = group.filter(r => r.value !== null);
    if (nonNull.length <= 1) continue;
    /* Each value is compared against the first non-null value. The
     * denominator is clamped to at least 1, so for |first| < 1 the 0.02
     * tolerance acts as an absolute difference rather than a relative one
     * (and division by zero is avoided). */
    const first = nonNull[0]!.value!;
    const contradiction = nonNull.some(r => Math.abs(r.value! - first) / Math.max(1, Math.abs(first)) > 0.02);
    if (contradiction) {
      /* Flag the whole group, including members whose value is null. */
      for (const r of group) r.flags.push('contradictory-mapping');
      unresolved.push({
        category: 'contradictory-mapping',
        detail: `multiple values for ${key}: ${nonNull.map(r => r.value).join(', ')}`,
        blocking: false,
        context: { key, values: nonNull.map(r => ({ value: r.value, source: r.sourceUrl })) },
      });
    }
  }
  trace(ctx, 8, normalizationContract.runbook[4]!.name,
    `dedup complete; ${unresolved.filter(u => u.category === 'contradictory-mapping').length} contradiction(s)`);

  /* Std 10: drain learned rules for write-back. Seed entries are excluded
   * from the reported rules.
   * NOTE(review): the save is gated on the unfiltered drain, so it also runs
   * when only seed entries were drained — confirm that is intended. */
  const learnedDrain = store.drainLearned();
  const learnedRules: LearnedRule[] = learnedDrain
    .filter(e => e.by !== 'seed')
    .map(e => ({ key: e.key, value: e.canonical }));
  if (learnedDrain.length > 0) await store.save();
  trace(ctx, 10, normalizationContract.runbook[5]!.name,
    `${learnedRules.length} new rule(s) drained for write-back`);

  /* Step 6 (Std 7): validate-output. Overall confidence is the mean
   * per-record confidence, reduced by 0.1 per blocking issue and floored at 0. */
  const avgConfidence = records.length === 0
    ? 0
    : records.reduce((s, r) => s + r.confidence, 0) / records.length;
  const blocking = unresolved.filter(u => u.blocking).length;
  const confidence = makeConfidence(
    Math.max(0, avgConfidence - 0.1 * blocking),
    `avg per-record mapping confidence ${avgConfidence.toFixed(2)} with ${blocking} blocking issue(s)`,
  );

  /* Nothing usable was produced: fail, but still surface any escalations
   * gathered while processing the values. */
  if (records.length === 0) {
    return {
      ok: false,
      escalations,
      failure: failure(
        ctx,
        'unresolved-normalization',
        'No values could be normalized.',
        { input, unresolved },
        side.upstreamLineage,
      ),
    };
  }

  /* Step 7 (Std 11): handoff. The upstream list is the de-duplicated union
   * of the upstream lineage's entries and every record's source URL. */
  const lineage: Lineage = {
    sourceUrl: side.upstreamLineage.sourceUrl,
    capturedAt: nowIso(),
    effectiveAs: side.upstreamLineage.effectiveAs,
    agentVersion: AGENT_VERSION,
    upstream: Array.from(new Set([...side.upstreamLineage.upstream, ...records.map(r => r.sourceUrl)])),
  };
  /* Blocking issues force 'review'; otherwise low confidence yields 'flagged'. */
  const validationStatus =
    blocking > 0 ? 'review' : confidence.value < LOW_CONFIDENCE_THRESHOLD ? 'flagged' : 'passed';

  const handoff: Handoff<NormalizationOutput> = {
    fromAgent: AGENT_NAME,
    fromAgentVersion: AGENT_VERSION,
    toAgent: 'baseline.resolution',
    payload: { records, learnedRules },
    metadata: {
      analysisId: ctx.analysisId,
      capabilities: normalizationContract.capabilities,
      targetUnit,
    },
    confidence,
    validation: {
      status: validationStatus,
      checks: [
        { name: 'records-produced', passed: records.length > 0, detail: `${records.length}` },
        { name: 'no-contradictions', passed: records.every(r => !r.flags.includes('contradictory-mapping')) },
        { name: 'units-canonical', passed: targetUnit === '' || records.every(r => r.canonicalUnit === targetUnit || r.value === null) },
      ],
    },
    unresolvedIssues: unresolved,
    lineage,
    timestamp: nowIso(),
  };
  trace(ctx, 11, normalizationContract.runbook[6]!.name,
    `handoff → ${handoff.toAgent} (validation=${validationStatus} confidence=${confidence.tier})`);

  return { ok: true, handoff, escalations };
}