/**
* intake — turn a plain English question into a typed JobRequest.
*
* One small Anthropic Haiku call. Extracts entities (with aliases),
* decomposes derived metrics into their underlying source concepts
* plus a derivedMetrics spec that references a methodology_id from
* the SME library, and extracts the period. Sources default to
* ["sec-edgar"]. seedMappings are left empty — those are an SME hint
* (specific XBRL tag aliases) that intake is not asked to fabricate;
* the resolution agent + methodology library cover that path.
*
* The result is validated against jobRequestSchema before return, so
* the orchestrator only ever sees a well-typed request (Std 2).
*/
import Anthropic from '@anthropic-ai/sdk';
import type { TextBlock } from '@anthropic-ai/sdk/resources/messages.js';
import { loadLibrary, type Methodology } from '../src/intelligence/library.js';
import { recordUsage } from '../src/observability/usage.js';
import { jobRequestSchema, type JobRequest } from '../src/types.js';
/** Model id sent to the Anthropic Messages API for the intake call; the same
 * value is passed to recordUsage and echoed back as IntakeResult.modelUsed. */
const MODEL = 'claude-haiku-4-5';
/* The intake LLM only needs to pick a `methodology_id` for derived
* metrics it nominates, so the digest built by buildMethodologyDigest
* (below) is restricted to entries that describe how to compute a
* metric: those of type `metric_definition` whose status is `active`.
* Other types (comparison_method, insight_framework, normalization_rule)
* are not candidate methodology references for derivedMetrics. */
/**
 * Read the `field` name out of a methodology input descriptor.
 *
 * @param raw An arbitrary value; only non-null objects carrying a string
 *            `field` property yield a name.
 * @returns The `field` string, or the placeholder `'?'` for anything else.
 */
function inputFieldName(raw: unknown): string {
  const PLACEHOLDER = '?';
  // Reject everything that is not a non-null object before using `in`.
  if (typeof raw !== 'object' || raw === null) {
    return PLACEHOLDER;
  }
  if (!('field' in raw)) {
    return PLACEHOLDER;
  }
  const fieldValue = (raw as { field: unknown }).field;
  return typeof fieldValue === 'string' ? fieldValue : PLACEHOLDER;
}
/**
 * Render the methodology library as a plain-text digest for the system prompt.
 *
 * Only entries whose type is `metric_definition` and whose status is `active`
 * are listed. Each listed entry contributes six lines: id, name, domain,
 * formula, required input fields and triggers.
 *
 * @param methodologies Library entries to consider.
 * @returns The newline-joined digest, or a fixed parenthetical notice telling
 *          the model to use a null methodology when no entry qualifies.
 */
function buildMethodologyDigest(methodologies: readonly Methodology[]): string {
  const entries: string[] = [];

  for (const methodology of methodologies) {
    const isCandidate = methodology.type === 'metric_definition' && methodology.status === 'active';
    if (!isCandidate) continue;

    // `definition` is only probed structurally; a missing formula is spelled out.
    const def = methodology.definition;
    let formulaText = '(no formula recorded)';
    if (def && typeof def === 'object' && 'formula' in def) {
      formulaText = String((def as { formula: unknown }).formula);
    }

    const inputList = methodology.inputs.map(inputFieldName).join(', ');
    const triggerList = methodology.applies_to.triggers.join(' | ');

    const entryLines = [
      ` - methodology_id: "${methodology.methodology_id}"`,
      ` name: ${methodology.name}`,
      ` domain: ${methodology.domain}`,
      ` formula: ${formulaText}`,
      ` required input fields: ${inputList}`,
      ` triggers: ${triggerList}`,
    ];
    entries.push(entryLines.join('\n'));
  }

  if (entries.length === 0) {
    return '(no metric_definition methodologies are encoded in the library yet — set methodology to null for any derived metric you nominate.)';
  }
  return entries.join('\n');
}
/**
 * Assemble the system prompt for the intake call.
 *
 * The prompt is a single template literal containing, in order: the task
 * statement, the JSON output shape, the decomposition rules, the methodology
 * digest (interpolated under "Available methodologies"), and the entity,
 * period and general rules.
 *
 * Everything between the backticks is sent to the model as-is, including
 * line breaks and leading whitespace, so reformatting the literal changes
 * the prompt.
 *
 * @param methodologies Library entries handed to buildMethodologyDigest,
 *                      which decides which of them are listed.
 * @returns The complete system prompt string.
 */
function buildSystemPrompt(methodologies: readonly Methodology[]): string {
const digest = buildMethodologyDigest(methodologies);
return `You convert a single user question about company financial data into a structured JobRequest for the BID analytical framework. Reply with ONE JSON object and nothing else.
Output shape:
{
"analysisId": "demo-<short-slug-derived-from-question>",
"question": "<the user's question, copied verbatim>",
"entities": [
{ "id": "<canonical legal name>", "aliases": ["<ticker>", "<short name>", "<other obvious aliases>"] }
],
"targetMetrics": [
{ "key": "<snake_case_source_concept_key>", "definition": "<one-sentence definition naming what the source line item measures and where it appears in financial filings>", "unit": "<USD | ratio | percent | count | etc.>" }
],
"derivedMetrics": [
{ "key": "<snake_case_computed_metric_key>", "definition": "<one-sentence definition of the computed metric, naming its numerator and denominator or the formula>", "unit": "<ratio | percent | USD | etc.>", "methodology": "<methodology_id from the library below, or null if none matches>" }
],
"period": "<FY-YYYY | QN-YYYY | latest-annual | latest-quarter>"
}
Decomposition rules — THIS IS THE CORE OF INTAKE:
- If the question asks for a DIRECT source line item (e.g., "total revenue", "net income", "noninterest expense"), put it ONLY in targetMetrics. OMIT the derivedMetrics field entirely.
- If the question implies a DERIVED metric — anything with "efficiency", "ratio", "rate", "margin", "growth", "per", "%", "share of", "as a fraction of", etc. — DECOMPOSE it:
1. Put the underlying source concepts (numerator and denominator, or each component of a sum/diff) in targetMetrics as separate entries. These are what the Source/Extraction agent must fetch from filings.
2. Add ONE entry to derivedMetrics describing the computed metric. Set its "methodology" to a matching methodology_id from the library digest below — match on domain, formula intent, and triggers. If nothing matches, set methodology to null (downstream agents will escalate per Standard 9).
3. When you nominate a methodology, the targetMetrics keys MUST match the methodology's "required input fields" exactly so downstream can map them.
- Never put a ratio/efficiency/growth metric directly in targetMetrics. targetMetrics are always raw source concepts the agent can fetch by name from a filing.
Available methodologies (metric_definition entries from the SME library):
${digest}
Entity rules:
- Resolve obvious ticker aliases to the canonical legal name (JPM -> JPMorgan Chase & Co., BAC -> Bank of America Corporation, WFC -> Wells Fargo & Company, C -> Citigroup Inc., GS -> The Goldman Sachs Group, Inc., MS -> Morgan Stanley). Put the ticker AND the short name in aliases.
- If the user gives only a ticker, the canonical id is the legal name; the ticker goes in aliases.
Period rules:
- "FY 2024" / "fiscal 2024" / "for 2024" -> "FY-2024". "Q3 2024" -> "Q3-2024". If unspecified, use "latest-annual".
General rules:
- analysisId: lowercase, hyphen-separated, <= 40 chars, prefixed "demo-".
- Do not invent entities or metrics not implied by the question.
- Do not include sources, seedMappings, or any field outside the shape above. Do not add commentary.`;
}
/**
 * Find the index of the `}` that closes the `{` at `start`.
 *
 * Braces inside double-quoted JSON strings (including after backslash
 * escapes) are ignored, so a `}` inside a string value does not end the span.
 *
 * @param s     Text to scan.
 * @param start Index of an opening `{` in `s`.
 * @returns Index of the matching `}`, or -1 if the span never closes.
 */
function findMatchingBrace(s: string, start: number): number {
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < s.length; i++) {
    const ch = s[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === '{') {
      depth++;
    } else if (ch === '}') {
      depth--;
      if (depth === 0) return i;
    }
  }
  return -1;
}

/**
 * Parse the model's reply into a JSON value, tolerating Markdown code fences
 * and surrounding commentary.
 *
 * Steps:
 *  1. Strip a leading code fence (optionally tagged `json`) and a trailing
 *     fence, then try to parse what is left as-is.
 *  2. Otherwise walk the top-level brace-balanced `{...}` spans left to right
 *     and return the first one that parses. A greedy "first `{` to last `}`"
 *     match is deliberately avoided: it breaks whenever commentary around the
 *     object contains braces of its own.
 *
 * If an opening brace is never closed (e.g. truncated output) the function
 * gives up instead of returning a nested fragment of the incomplete object.
 *
 * @param text Raw text returned by the model.
 * @returns The parsed value, or `null` when no parseable JSON is found.
 */
function parseJsonResponse(text: string): unknown {
  const cleaned = text.replace(/^```(?:json)?\s*/i, '').replace(/```\s*$/i, '').trim();
  try {
    return JSON.parse(cleaned);
  } catch {
    /* not pure JSON — fall through to embedded-object extraction */
  }

  let open = cleaned.indexOf('{');
  while (open !== -1) {
    const close = findMatchingBrace(cleaned, open);
    if (close === -1) {
      // Unterminated object: anything after this point is nested inside it.
      return null;
    }
    try {
      return JSON.parse(cleaned.slice(open, close + 1));
    } catch {
      /* balanced but not JSON (e.g. "{placeholder}") — try the next span */
    }
    open = cleaned.indexOf('{', close + 1);
  }
  return null;
}
/** Outcome of one intake run, as returned by buildJobRequestFromQuestion. */
export interface IntakeResult {
/** The request after it has passed jobRequestSchema validation. */
jobRequest: JobRequest;
/** Model id that was used for the intake call. */
modelUsed: string;
/** Input token count reported in the API response's usage. */
inputTokens: number;
/** Output token count reported in the API response's usage. */
outputTokens: number;
/** Number of active `metric_definition` entries found in the library. */
methodologiesShown: number;
}
/**
 * Turn a plain-English question into a schema-validated JobRequest using a
 * single Anthropic Messages API call.
 *
 * Flow: load the methodology library, build the system prompt, call the
 * model, record token usage, parse the reply as JSON, pin the fields intake
 * owns (`question`, `sources`), and validate against jobRequestSchema.
 *
 * @param question The user's question, sent to the model as the sole user
 *                 message and stored verbatim on the resulting request.
 * @returns The validated request plus model and token-usage metadata.
 * @throws Error if ANTHROPIC_API_KEY is unset, the reply has no text, the
 *         reply is not a JSON object, or the candidate fails schema
 *         validation. Errors from loadLibrary or the SDK call propagate
 *         unchanged.
 */
export async function buildJobRequestFromQuestion(question: string): Promise<IntakeResult> {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    throw new Error('ANTHROPIC_API_KEY is not set. Intake uses a small Claude Haiku call and will not fake the run (Std 12).');
  }
  const client = new Anthropic({ apiKey });

  const library = await loadLibrary();
  // Filter once and hand the same list to the prompt builder, so the count
  // reported in methodologiesShown describes the list the prompt was built from.
  const methodologies = library.filter(m => m.type === 'metric_definition' && m.status === 'active');
  const systemPrompt = buildSystemPrompt(methodologies);

  const resp = await client.messages.create({
    model: MODEL,
    max_tokens: 1000,
    system: systemPrompt,
    messages: [{ role: 'user', content: question }],
  });
  // Record usage before any validation so failed runs are still accounted for.
  recordUsage('intake', MODEL, resp.usage.input_tokens, resp.usage.output_tokens);

  const textBlock = resp.content.find((b): b is TextBlock => b.type === 'text');
  const text = textBlock ? textBlock.text : '';
  if (!text.trim()) {
    throw new Error('Intake LLM returned an empty response.');
  }

  const raw = parseJsonResponse(text);
  // Arrays are objects too; spreading one would produce index keys, so
  // reject them here rather than letting schema validation report noise.
  if (!raw || typeof raw !== 'object' || Array.isArray(raw)) {
    // A reply cut off at the token limit is the likeliest cause of broken
    // JSON; say so instead of leaving the reader to guess.
    const hint = resp.stop_reason === 'max_tokens' ? ' (output was truncated at max_tokens)' : '';
    throw new Error(`Intake LLM response was not valid JSON${hint}. Raw:\n${text}`);
  }

  const candidate = {
    ...(raw as Record<string, unknown>),
    // The prompt asks for a verbatim copy of the question; use the caller's
    // string so a paraphrased or trimmed echo cannot reach the orchestrator.
    // NOTE(review): assumes jobRequestSchema has a `question` field, as the
    // prompt's output shape implies — confirm against src/types.
    question,
    sources: ['sec-edgar'],
  };
  const parsed = jobRequestSchema.safeParse(candidate);
  if (!parsed.success) {
    throw new Error(
      `Intake produced a JobRequest that failed schema validation:\n` +
        JSON.stringify(parsed.error.format(), null, 2) +
        `\n\nRaw LLM output:\n${text}`,
    );
  }

  return {
    jobRequest: parsed.data,
    modelUsed: MODEL,
    inputTokens: resp.usage.input_tokens,
    outputTokens: resp.usage.output_tokens,
    methodologiesShown: methodologies.length,
  };
}