/**
* Source/Extraction agent — runtime entry point.
*
* The agent walks the matrix-defined runbook (Std 6). Steps 1 (validate
* input), 4 (structure/stamp provenance), 5 (validate output), and 6
* (handoff) are deterministic code in this file. Steps 2 (retrieve)
* and 3 (parse-and-extract) are delegated to the LLM through its
* tool-use loop in llm.ts — the model decides which SEC tools to call
* and how often (Std 3: judgment is explicit). Every tool invocation
* is recorded in the audit trail under Std 5 ("methods & tools").
*
* Std 12: if ANTHROPIC_API_KEY is missing, llm.ts returns a
* `needs-api-key` failure and the agent escalates instead of faking a
* run.
*/
import {
type AgentResult,
type HITLEscalation,
LOW_CONFIDENCE_THRESHOLD,
makeConfidence,
} from '../../../standards.js';
import type {
ExecutionContext,
FailureObject,
Handoff,
Lineage,
UnresolvedIssue,
} from '../../../types.js';
import { nowIso } from '../../../types.js';
import {
AGENT_NAME,
AGENT_VERSION,
sourceExtractionContract,
} from './matrix.js';
import {
type ComparabilityNote,
type ExtractedValue,
type SourceExtractionInput,
type SourceExtractionOutput,
sourceExtractionInputSchema,
} from './schema.js';
import {
extractWithSecTools,
MODEL_NAME,
SEC_TOOL_COUNT,
SEC_TOOL_NAMES,
type ToolCallTrace,
} from './llm.js';
export { sourceExtractionContract } from './matrix.js';
export type { SourceExtractionOutput } from './schema.js';
/**
 * Records one runbook step for this agent: appends an entry to `ctx.trace`
 * and echoes the same information to the console.
 *
 * @param ctx - Execution context whose `trace` array receives the entry.
 * @param standard - Standard number the step is attributed to.
 * @param step - Name of the step being recorded.
 * @param detail - Free-text description of what happened.
 */
function trace(ctx: ExecutionContext, standard: number, step: string, detail: string): void {
  const at = nowIso();
  ctx.trace.push({ agent: AGENT_NAME, standard, step, detail, at });

  const line = ` [${AGENT_NAME}][Std ${standard}] ${step} — ${detail}`;
  // eslint-disable-next-line no-console
  console.log(line);
}
/**
 * Builds a placeholder lineage for results that have no source yet: no
 * source URL, no effective date, no upstream entries, stamped with the
 * current time and this agent's version.
 */
function emptyLineage(): Lineage {
  const blank: Lineage = {
    sourceUrl: null,
    capturedAt: nowIso(),
    effectiveAs: null,
    agentVersion: AGENT_VERSION,
    upstream: [],
  };
  return blank;
}
/**
 * Assembles a `FailureObject` attributed to this agent.
 *
 * @param ctx - Execution context; supplies the retry count and recursion depth.
 * @param category - Failure category to report.
 * @param reason - Human-readable explanation of the failure.
 * @param context - Arbitrary diagnostic payload attached to the failure.
 * @param lineage - Lineage to attach; defaults to an empty lineage.
 * @returns The populated failure object, timestamped at call time.
 */
function failure(
  ctx: ExecutionContext,
  category: FailureObject['category'],
  reason: string,
  context: Record<string, unknown>,
  lineage: Lineage = emptyLineage(),
): FailureObject {
  const { retries: attempts, recursionDepth } = ctx;
  const report: FailureObject = {
    agent: AGENT_NAME,
    agentVersion: AGENT_VERSION,
    category,
    reason,
    context,
    lineage,
    attempts,
    recursionDepth,
    occurredAt: nowIso(),
  };
  return report;
}
/**
 * Runs the source/extraction runbook for one job request.
 *
 * Validates `rawInput` against `sourceExtractionInputSchema`, delegates
 * retrieval and extraction to `extractWithSecTools`, maps the returned hits
 * to `ExtractedValue`s, derives unresolved issues and a confidence score,
 * and packages a handoff addressed to `baseline.normalization`.
 *
 * @param rawInput - Unvalidated job request.
 * @param ctx - Execution context; trace entries are appended to `ctx.trace`.
 * @returns `ok: true` with the handoff, or `ok: false` with a failure object
 *   when the input is invalid, the LLM call fails, or no values come back.
 */
export async function runSourceExtraction(
  rawInput: unknown,
  ctx: ExecutionContext,
): Promise<AgentResult<SourceExtractionOutput>> {
  /* Step 1 (Std 2): validate-input. */
  const parsed = sourceExtractionInputSchema.safeParse(rawInput);
  if (!parsed.success) {
    return {
      ok: false,
      escalations: [],
      failure: failure(ctx, 'invalid-input', 'JobRequest failed schema validation.', { issues: parsed.error.issues }),
    };
  }
  const input: SourceExtractionInput = parsed.data;
  trace(ctx, 2, sourceExtractionContract.runbook[0]!.name,
    `validated request: ${input.entities.length} entity(ies), ${input.targetMetrics.length} metric(s), period=${input.period}, sources=[${input.sources.join(',')}]`);

  const unresolved: UnresolvedIssue[] = [];
  const escalations: HITLEscalation[] = [];

  /* Step 2 (Std 5): retrieve — delegated to the LLM via SEC tool use. */
  trace(ctx, 5, sourceExtractionContract.runbook[1]!.name,
    `delegating retrieval to LLM (${MODEL_NAME}) with ${SEC_TOOL_COUNT} SEC tool(s) available: [${SEC_TOOL_NAMES.join(', ')}]`);

  /* Step 3 (Std 3): parse-and-extract — the same tool-use loop drives extraction. */
  trace(ctx, 3, sourceExtractionContract.runbook[2]!.name,
    `starting tool-use extraction over ${input.entities.length} entity(ies) × ${input.targetMetrics.length} metric(s)`);

  const onToolCall = (t: ToolCallTrace): void => {
    // Build the description once so the audit-trail entry and the console
    // line cannot drift apart (a missing error message reads "unknown" in both).
    const outcome = t.ok ? t.resultSummary : `ERROR: ${t.errorMessage ?? 'unknown'}`;
    const detail = `${t.toolName}(${summarizeInput(t.toolName, t.input)}) → ${outcome}`;
    ctx.trace.push({
      agent: AGENT_NAME,
      standard: 5,
      step: 'tool-call',
      detail,
      at: t.at,
    });
    // eslint-disable-next-line no-console
    console.log(` [${AGENT_NAME}][Std 5] tool-call — ${detail}`);
  };

  const llm = await extractWithSecTools(
    {
      analysisId: ctx.analysisId,
      question: input.question,
      entities: input.entities,
      targetMetrics: input.targetMetrics,
      period: input.period,
      sources: input.sources,
    },
    onToolCall,
  );

  if (!llm.ok) {
    // A missing API key is routed to an engineer; any other LLM failure goes
    // to a data steward. Both are raised with the same escalation reason.
    const isKey = llm.failure.category === 'needs-api-key';
    escalations.push({
      agent: AGENT_NAME,
      reason: 'critical-validation-failure',
      failureContext: llm.failure.reason,
      lineage: emptyLineage(),
      validation: {
        status: 'review',
        confidence: makeConfidence(0, 'LLM unavailable or invalid response'),
        checks: [{ name: 'llm-available', passed: false, detail: llm.failure.hint }],
      },
      recommendedReviewer: isKey ? 'engineer' : 'data-steward',
      raisedAt: nowIso(),
    });
    return {
      ok: false,
      escalations,
      failure: failure(
        ctx,
        isKey ? 'tool-unavailable' : 'unrecoverable-extraction',
        llm.failure.reason,
        { llmFailure: llm.failure, input },
      ),
    };
  }

  /* Step 4 (Std 4): structure — stamp provenance, map to ExtractedValue. */
  const extracted: ExtractedValue[] = llm.value.hits.map(h => {
    const flags: string[] = [];
    if (h.value === null) flags.push('value-missing');
    if (h.rawUnit === null) flags.push('unit-missing');
    if (h.confidence < LOW_CONFIDENCE_THRESHOLD) flags.push('low-confidence-extraction');
    if (!h.sourceUrl) flags.push('missing-source-url');
    return {
      entity: h.entity,
      metricKey: h.metricKey,
      period: h.period,
      rawLabel: h.rawLabel,
      value: h.value,
      rawUnit: h.rawUnit,
      snippet: h.snippet,
      // Placeholder when the hit carries no URL; the hit is flagged above.
      sourceUrl: h.sourceUrl || 'sec-edgar',
      sourceConnector: 'sec-edgar',
      contentType: 'application/json',
      capturedAt: nowIso(),
      confidence: h.confidence,
      origin: 'llm',
      flags,
    };
  });
  const comparabilityNotes: ComparabilityNote[] = llm.value.comparabilityNotes.map(n => ({
    entities: [...n.entities],
    detail: n.detail,
  }));
  trace(ctx, 4, sourceExtractionContract.runbook[3]!.name,
    `structured ${extracted.length} value(s) with provenance from ${llm.value.toolCalls.length} tool call(s)`);

  /* Step 5 (Std 7): validate-output — completeness, lineage, confidence, validation status. */
  const requested = input.entities.length * input.targetMetrics.length;

  /* Std 8: per-metric mixed-unit triggers. */
  const unitsByMetric = new Map<string, Set<string>>();
  for (const v of extracted) {
    if (!v.rawUnit) continue;
    if (!unitsByMetric.has(v.metricKey)) unitsByMetric.set(v.metricKey, new Set());
    unitsByMetric.get(v.metricKey)!.add(v.rawUnit);
  }
  for (const [metric, units] of unitsByMetric) {
    if (units.size > 1) {
      comparabilityNotes.push({
        entities: Array.from(new Set(extracted.filter(v => v.metricKey === metric).map(v => v.entity))),
        detail: `metric "${metric}" appears in mixed units: ${[...units].join(', ')} — normalize before comparing`,
      });
    }
  }

  /* Std 8: duplicate detection. */
  const byKey = new Map<string, ExtractedValue[]>();
  for (const v of extracted) {
    const k = `${v.entity}::${v.metricKey}::${v.period}`;
    if (!byKey.has(k)) byKey.set(k, []);
    byKey.get(k)!.push(v);
  }
  for (const [k, group] of byKey) {
    if (group.length > 1) {
      unresolved.push({
        category: 'duplicate-records',
        detail: `${group.length} candidate values for ${k}`,
        blocking: false,
        context: { key: k },
      });
    }
  }

  /*
   * Std 8: missing-data triggers — flag entity/metric combos the LLM didn't return.
   * Coverage is counted in the same pass, per requested pair: a pair is
   * covered when at least one of its hits has a non-null value. Counting raw
   * hits instead would let duplicate candidates or unrequested pairs push
   * coverage above 100%.
   */
  let found = 0;
  for (const e of input.entities) {
    for (const m of input.targetMetrics) {
      const hits = extracted.filter(v => v.entity === e.id && v.metricKey === m.key);
      if (hits.some(v => v.value !== null)) found += 1;
      if (hits.length === 0) {
        unresolved.push({
          category: 'missing-data',
          detail: `no extracted value for entity="${e.id}" metric="${m.key}" period="${input.period}"`,
          blocking: false,
        });
      }
    }
  }
  const coverage = requested === 0 ? 0 : found / requested;

  const blocking = unresolved.filter(u => u.blocking).length;
  const confidence = makeConfidence(
    Math.max(0, coverage - 0.2 * blocking),
    `coverage ${(coverage * 100).toFixed(0)}% (${found}/${requested}) with ${blocking} blocking issue(s)`,
  );
  trace(ctx, 7, sourceExtractionContract.runbook[4]!.name,
    `validation: coverage=${(coverage * 100).toFixed(0)}% blocking=${blocking} confidence=${confidence.tier}`);

  if (extracted.length === 0) {
    return {
      ok: false,
      escalations,
      failure: failure(
        ctx,
        'unrecoverable-extraction',
        'LLM tool-use loop returned no values after completing the runbook.',
        { input, unresolved, toolCalls: llm.value.toolCalls },
      ),
    };
  }

  /* Step 6 (Std 11): handoff. */
  const lineage: Lineage = {
    sourceUrl: extracted[0]?.sourceUrl ?? null,
    capturedAt: nowIso(),
    effectiveAs: null,
    agentVersion: AGENT_VERSION,
    upstream: Array.from(new Set(extracted.map(v => v.sourceUrl))),
  };
  const validationStatus =
    blocking > 0 ? 'review' : confidence.value < LOW_CONFIDENCE_THRESHOLD ? 'flagged' : 'passed';

  // Check the hits' own URLs: every ExtractedValue.sourceUrl is non-empty
  // because of the 'sec-edgar' placeholder, so testing that field would
  // always pass.
  const provenanceStamped = llm.value.hits.every(h => Boolean(h.sourceUrl));

  const handoff: Handoff<SourceExtractionOutput> = {
    fromAgent: AGENT_NAME,
    fromAgentVersion: AGENT_VERSION,
    toAgent: 'baseline.normalization',
    payload: { values: extracted, comparabilityNotes },
    metadata: {
      analysisId: ctx.analysisId,
      capabilities: sourceExtractionContract.capabilities,
      requestedMetrics: input.targetMetrics.map(m => m.key),
      toolCallCount: llm.value.toolCalls.length,
      toolCalls: llm.value.toolCalls.map(t => ({
        toolName: t.toolName,
        ok: t.ok,
        input: t.input,
        resultSummary: t.resultSummary,
        errorMessage: t.errorMessage,
        at: t.at,
      })),
    },
    confidence,
    validation: {
      status: validationStatus,
      checks: [
        { name: 'coverage', passed: coverage >= 0.5, detail: `${found}/${requested}` },
        { name: 'no-blocking-issues', passed: blocking === 0, detail: `${blocking} blocking` },
        { name: 'provenance-stamped', passed: provenanceStamped },
      ],
    },
    unresolvedIssues: unresolved,
    lineage,
    timestamp: nowIso(),
  };
  trace(ctx, 11, sourceExtractionContract.runbook[5]!.name,
    `handoff → ${handoff.toAgent} (validation=${validationStatus} confidence=${confidence.tier})`);
  return { ok: true, handoff, escalations };
}
/**
 * Renders a tool call's input as a short, single-line string for the audit
 * trail and console output.
 *
 * The four known SEC tools get a hand-picked subset of fields; any other
 * tool falls back to `key=<JSON value>` pairs for every input field.
 *
 * @param toolName - Name of the tool that was invoked.
 * @param input - Raw input object passed to the tool.
 * @returns A compact description of the input.
 */
function summarizeInput(toolName: string, input: Record<string, unknown>): string {
  // Longest document URL shown in full; anything longer is cut and marked.
  const maxUrlLength = 80;

  switch (toolName) {
    case 'sec_edgar_companies':
      return `searchTerms="${input.searchTerms ?? ''}"`;

    case 'sec_financials': {
      const concepts = input.concepts ? `, concepts="${input.concepts}"` : '';
      return `cik="${input.cik ?? ''}"${concepts}`;
    }

    case 'sec_submissions': {
      const filingTypes = input.filingTypes ? `, filingTypes="${input.filingTypes}"` : '';
      return `cik="${input.cik ?? ''}"${filingTypes}`;
    }

    case 'sec_filing_document': {
      const url = String(input.documentUrl ?? '');
      // Only append the ellipsis when characters were actually dropped, so a
      // short URL is not misread as truncated.
      const shown = url.length > maxUrlLength ? `${url.slice(0, maxUrlLength)}…` : url;
      return `documentUrl="${shown}"`;
    }

    default:
      return Object.entries(input)
        .map(([k, v]) => `${k}=${JSON.stringify(v)}`)
        .join(', ');
  }
}