Code · src/tools/retrieval/connectors/sec-edgar.ts

src/tools/retrieval/connectors/sec-edgar.ts 27,295 bytes · typescript
/**
 * SEC EDGAR retrieval connector.
 *
 * The connector ships in two shapes:
 *
 *   1. Four standalone functions —
 *        secEdgarCompanies(searchTerms)
 *        secFinancials(cik, concepts?)
 *        secSubmissions(cik, filingTypes?)
 *        secFilingDocument(documentUrl, maxChars?)
 *      These are the surfaces agents call (typically as Anthropic
 *      tools via SEC_TOOLS / executeSecTool below). They return
 *      structured JSON, not raw bytes, because their job is to make
 *      EDGAR usable for an LLM — not to dump untouched HTML.
 *
 *   2. SecEdgarConnector — a RetrievalConnector that registers the
 *      `sec-edgar` source name with the dispatcher so JobRequest
 *      `sources: ["sec-edgar"]` is a known source. Its `fetch()` is a
 *      thin wrapper around secFinancials, kept for legacy
 *      compatibility with the original dispatchFetch path.
 *
 * All outbound requests share one rate limiter (SEC's published public
 * limit is 10 req/s) and the User-Agent
 * `MR mitchell.roy@sia-partners.com`. Every failure is translated to
 * RetrievalError so the dispatcher hands callers a structured
 * DispatchResult (Std 12).
 */

import {
  RetrievalError,
  type FetchParams,
  type RawPayload,
  type RetrievalConnector,
} from '../interface.js';
import { httpGet, HttpError } from '../http-client.js';
import { RateLimiter } from '../rate-limiter.js';

/**
 * Shared User-Agent + rate limiter, exported so sibling SEC connector
 * modules (e.g. sec-edgar-filings.ts) can reuse them instead of
 * standing up parallel limiters that would silently double the
 * outbound rate against SEC.
 */
export const SEC_USER_AGENT = 'MR mitchell.roy@sia-partners.com';
/** Request headers that ask for a JSON response. */
export const SEC_HEADERS_JSON = { 'User-Agent': SEC_USER_AGENT, Accept: 'application/json' } as const;
/** Request headers that accept any content type. */
export const SEC_HEADERS_ANY = { 'User-Agent': SEC_USER_AGENT, Accept: '*/*' } as const;

/* Endpoints used by the functions in this file. */
const TICKERS_URL = 'https://www.sec.gov/files/company_tickers.json';
const COMPANY_FACTS_BASE = 'https://data.sec.gov/api/xbrl/companyfacts';
const SUBMISSIONS_BASE = 'https://data.sec.gov/submissions';

/* Per-endpoint response size caps, passed to httpGet as `maxBodyBytes`. */
const COMPANY_FACTS_MAX_BYTES = 64 * 1024 * 1024;
const SUBMISSIONS_MAX_BYTES = 8 * 1024 * 1024;
const TICKERS_MAX_BYTES = 8 * 1024 * 1024;
const FILING_DOC_MAX_BYTES = 32 * 1024 * 1024;

/* Character limit applied to stripped filing text when the caller
 * supplies no usable maxChars. */
const DEFAULT_FILING_TRUNCATE_CHARS = 50_000;

/** The single limiter every outbound SEC request in this file acquires before sending. */
export const secLimiter = new RateLimiter({ requestsPerSecond: 10, burstSize: 10 });

/* File-local short names for the shared limiter and the JSON header set. */
const limiter = secLimiter;
const SEC_HEADERS = SEC_HEADERS_JSON;

/* ------------------------------------------------------------------ *
 * Tool 1 — secEdgarCompanies(searchTerms)
 * Resolves a free-form search string (one or more tickers / names,
 * comma-separated) to {cik, ticker, name} records. Backed by
 * company_tickers.json, cached for the process lifetime.
 * ------------------------------------------------------------------ */

/** One company resolved by secEdgarCompanies(). */
export interface SecCompanyMatch {
  /** CIK as normalized by parseCik (zero-padded to 10 digits). */
  readonly cik: string;
  /** Ticker from the index; '' when a CIK search term was not found in the index. */
  readonly ticker: string;
  /** Company title from the index; '' when a CIK search term was not found in the index. */
  readonly name: string;
}

/** One parsed record of company_tickers.json, as held in the in-memory index. */
interface TickerEntry {
  /** CIK as normalized by parseCik (zero-padded to 10 digits). */
  readonly cik: string;
  /** Ticker, trimmed and upper-cased at load time. */
  readonly ticker: string;
  /** Upper-cased copy of `name`, precomputed for case-insensitive matching. */
  readonly nameUpper: string;
  /** Company title, trimmed ('' when the record had no string title). */
  readonly name: string;
}

/** Parsed ticker index; set once a load succeeds and reused from then on. */
let tickerIndex: readonly TickerEntry[] | null = null;
/** The in-flight (or settled-successfully) load, shared so concurrent callers trigger one download. */
let tickerIndexPromise: Promise<readonly TickerEntry[]> | null = null;

/**
 * Downloads company_tickers.json and converts it to TickerEntry records.
 * Performs no caching itself; loadTickerIndex() owns the cache state.
 *
 * @returns the non-empty list of records that had both a parseable CIK
 *          and a ticker.
 * @throws RetrievalError — the translated HTTP failure, `internal` when
 *         the body is not valid JSON, or `no-content` when no record
 *         could be resolved.
 */
async function fetchTickerIndex(): Promise<readonly TickerEntry[]> {
  await limiter.acquire();
  let res;
  try {
    res = await httpGet(TICKERS_URL, { headers: SEC_HEADERS, maxBodyBytes: TICKERS_MAX_BYTES });
  } catch (err) {
    throw translateHttpError(err, TICKERS_URL);
  }
  let parsed: unknown;
  try {
    parsed = JSON.parse(res.body);
  } catch (err) {
    throw new RetrievalError(
      'internal',
      `SEC company_tickers.json invalid JSON: ${err instanceof Error ? err.message : String(err)}`,
      { url: TICKERS_URL },
    );
  }
  const out: TickerEntry[] = [];
  if (parsed && typeof parsed === 'object') {
    for (const raw of Object.values(parsed as Record<string, unknown>)) {
      if (!raw || typeof raw !== 'object') continue;
      const e = raw as { cik_str?: unknown; ticker?: unknown; title?: unknown };
      // cik_str is accepted as either a number or a string.
      const cikRaw =
        typeof e.cik_str === 'number'
          ? String(e.cik_str)
          : typeof e.cik_str === 'string'
            ? e.cik_str
            : '';
      const cik = parseCik(cikRaw);
      const ticker = typeof e.ticker === 'string' ? e.ticker.trim().toUpperCase() : '';
      const name = typeof e.title === 'string' ? e.title.trim() : '';
      // A record without a usable CIK or ticker cannot be matched; skip it.
      if (!cik || !ticker) continue;
      out.push({ cik, ticker, nameUpper: name.toUpperCase(), name });
    }
  }
  if (out.length === 0) {
    throw new RetrievalError('no-content', 'company_tickers.json contained no resolvable records.', {
      url: TICKERS_URL,
    });
  }
  return out;
}

/**
 * Returns the ticker index, downloading it on first use.
 *
 * A successful load is cached in `tickerIndex`. While a load is in flight,
 * concurrent callers share the same promise. Any failure — including a
 * rejection from the rate limiter — clears the cached promise so the next
 * call retries instead of replaying a stale rejection.
 *
 * @throws whatever fetchTickerIndex() rejects with.
 */
async function loadTickerIndex(): Promise<readonly TickerEntry[]> {
  if (tickerIndex) return tickerIndex;
  if (tickerIndexPromise) return tickerIndexPromise;
  const pending = fetchTickerIndex().then(
    entries => {
      tickerIndex = entries;
      return entries;
    },
    (err: unknown) => {
      // Single reset point for every failure path.
      tickerIndexPromise = null;
      throw err;
    },
  );
  tickerIndexPromise = pending;
  return pending;
}

/**
 * Resolves a free-form search string to company records.
 *
 * The input is split on commas, semicolons and newlines. Each term is
 * tried, in order, as:
 *   1. a CIK (anything parseCik accepts) — returned even when the CIK is
 *      not in the ticker index, with empty ticker/name;
 *   2. an exact, case-insensitive ticker or company-name match;
 *   3. a case-insensitive substring match against company names.
 * Results are de-duplicated by CIK across all terms.
 *
 * @param searchTerms - one or more tickers / names / CIKs.
 * @returns matches in term order; may be empty when nothing matched.
 * @throws RetrievalError `invalid-request` when no usable term is
 *         supplied, plus anything loadTickerIndex() rejects with.
 */
export async function secEdgarCompanies(searchTerms: string): Promise<SecCompanyMatch[]> {
  if (typeof searchTerms !== 'string' || !searchTerms.trim()) {
    throw new RetrievalError('invalid-request', 'secEdgarCompanies: searchTerms must be a non-empty string.');
  }
  const terms = searchTerms
    .split(/[,;\n]/)
    .map(t => t.trim())
    .filter(t => t.length > 0);
  if (terms.length === 0) {
    throw new RetrievalError('invalid-request', 'secEdgarCompanies: no usable search terms after splitting.');
  }
  const index = await loadTickerIndex();
  // The substring scan stops once the total result count reaches this budget.
  const maxMatches = terms.length * 8;
  const seenCiks = new Set<string>();
  const matches: SecCompanyMatch[] = [];
  const record = (m: SecCompanyMatch): void => {
    if (seenCiks.has(m.cik)) return;
    seenCiks.add(m.cik);
    matches.push(m);
  };

  for (const term of terms) {
    const direct = parseCik(term);
    if (direct) {
      const hit = index.find(r => r.cik === direct);
      record(
        hit
          ? { cik: hit.cik, ticker: hit.ticker, name: hit.name }
          : { cik: direct, ticker: '', name: '' },
      );
      continue;
    }

    const norm = term.toUpperCase();
    const exact = index.find(r => r.ticker === norm || r.nameUpper === norm);
    if (exact) {
      // An exact hit settles this term even when its CIK was already
      // returned (repeated term, or a second share-class ticker of the
      // same company). Falling through to the substring scan here would
      // add unrelated companies.
      record({ cik: exact.cik, ticker: exact.ticker, name: exact.name });
      continue;
    }

    // No exact ticker/name match exists, so only the name-substring test remains.
    for (const r of index) {
      if (!r.nameUpper.includes(norm) || seenCiks.has(r.cik)) continue;
      record({ cik: r.cik, ticker: r.ticker, name: r.name });
      if (matches.length >= maxMatches) break;
    }
  }
  return matches;
}

/* ------------------------------------------------------------------ *
 * Tool 2 — secFinancials(cik, concepts?)
 *
 * Returns the company-facts JSON for a CIK. Connector design
 * principle (Std 5 — narrow-first): a tool that can return a large
 * response body MUST default to a summary describing the available
 * data elements; full data is returned only when the caller names
 * specific elements. Here that means:
 *   - secFinancials(cik)                   → summary: { entity, availableConcepts, conceptCountsByTaxonomy, summary: true } — facts omitted.
 *   - secFinancials(cik, "Revenues,Assets") → targeted: full filtered facts time series for the named concepts.
 * This keeps the LLM from accidentally pulling 900+ XBRL concepts
 * into context when it only needs two.
 * ------------------------------------------------------------------ */

/**
 * Result of secFinancials(). Summary responses carry `summary: true` and
 * no `facts`; targeted responses carry `facts` and `conceptsFilter`.
 */
export interface SecCompanyFacts {
  /** CIK as normalized by parseCik (zero-padded to 10 digits). */
  readonly cik: string;
  /** `entityName` from the response, or '' when it is absent or not a string. */
  readonly entityName: string;
  /** Omitted in summary mode (no `concepts` argument). Populated only when
   *  the caller names specific concepts. Keyed by taxonomy, then by concept
   *  name; taxonomies with no matching concept are left out. */
  readonly facts?: Record<string, Record<string, unknown>>;
  /** True when this is a summary response (no `concepts` was specified).
   *  Use the listed availableConcepts to drive a targeted re-call. */
  readonly summary?: boolean;
  /** Sorted, de-duplicated concept names across all taxonomies. */
  readonly availableConcepts?: readonly string[];
  /** Per-taxonomy count of available concepts, e.g. {"us-gaap": 870, "dei": 35}. */
  readonly conceptCountsByTaxonomy?: Readonly<Record<string, number>>;
  /** The trimmed concept names the caller asked for (targeted mode only). */
  readonly conceptsFilter?: readonly string[];
  /** The `url` reported on the HTTP response. */
  readonly sourceUrl: string;
  /** ISO-8601 timestamp taken when the result object was built. */
  readonly capturedAt: string;
}

/**
 * Fetches the XBRL company-facts document for a CIK.
 *
 * With no `concepts` (or only blank names) the result is a summary: the
 * entity name, the available concept names and per-taxonomy counts, with
 * the facts body omitted. With `concepts`, the result carries the facts
 * for the named concepts only; names are matched case-insensitively
 * across every taxonomy.
 *
 * @param cik - any form parseCik accepts.
 * @param concepts - optional comma-separated concept names.
 * @throws RetrievalError `invalid-request` for an unparseable CIK, the
 *         translated HTTP failure, or `internal` when the body is not a
 *         JSON object.
 */
export async function secFinancials(cik: string, concepts?: string): Promise<SecCompanyFacts> {
  const padded = parseCik(cik);
  if (!padded) {
    throw new RetrievalError('invalid-request', `secFinancials: not a valid CIK: "${cik}"`);
  }
  const url = `${COMPANY_FACTS_BASE}/CIK${padded}.json`;
  await limiter.acquire();
  let res;
  try {
    res = await httpGet(url, { headers: SEC_HEADERS, maxBodyBytes: COMPANY_FACTS_MAX_BYTES });
  } catch (err) {
    throw translateHttpError(err, url);
  }
  let parsed: unknown;
  try {
    parsed = JSON.parse(res.body);
  } catch (err) {
    throw new RetrievalError(
      'internal',
      `companyfacts JSON parse failed: ${err instanceof Error ? err.message : String(err)}`,
      { url },
    );
  }
  // Valid JSON is not necessarily an object: a literal `null` would make
  // the property reads below throw a raw TypeError. Report it as a
  // RetrievalError like every other failure.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new RetrievalError('internal', 'companyfacts response was not a JSON object.', { url });
  }
  const doc = parsed as { entityName?: unknown; facts?: unknown };

  const entityName = typeof doc.entityName === 'string' ? doc.entityName : '';
  const allFacts =
    doc.facts && typeof doc.facts === 'object' && !Array.isArray(doc.facts)
      ? (doc.facts as Record<string, Record<string, unknown>>)
      : {};

  const conceptList = (concepts ?? '')
    .split(',')
    .map(s => s.trim())
    .filter(s => s.length > 0);

  const availableConcepts = collectConceptNames(allFacts);
  const conceptCountsByTaxonomy: Record<string, number> = {};
  for (const [taxonomy, conceptMap] of Object.entries(allFacts)) {
    if (conceptMap && typeof conceptMap === 'object') {
      conceptCountsByTaxonomy[taxonomy] = Object.keys(conceptMap).length;
    }
  }

  /* Summary mode — narrow-first connector design (Std 5). The full
   * facts body is *not* returned; instead the caller gets the menu
   * of available concepts and must re-call with explicit names. */
  if (conceptList.length === 0) {
    return {
      cik: padded,
      entityName,
      summary: true,
      availableConcepts,
      conceptCountsByTaxonomy,
      sourceUrl: res.url,
      capturedAt: new Date().toISOString(),
    };
  }

  /* Targeted mode — the caller named specific concepts; return the
   * full filtered time series for each one. */
  const filtered: Record<string, Record<string, unknown>> = {};
  const wanted = new Set(conceptList.map(c => c.toLowerCase()));
  for (const [taxonomy, conceptMap] of Object.entries(allFacts)) {
    if (!conceptMap || typeof conceptMap !== 'object') continue;
    const subset: Record<string, unknown> = {};
    for (const [name, val] of Object.entries(conceptMap)) {
      if (wanted.has(name.toLowerCase())) subset[name] = val;
    }
    // Taxonomies with no requested concept are left out entirely.
    if (Object.keys(subset).length > 0) filtered[taxonomy] = subset;
  }

  return {
    cik: padded,
    entityName,
    facts: filtered,
    availableConcepts,
    conceptCountsByTaxonomy,
    conceptsFilter: conceptList,
    sourceUrl: res.url,
    capturedAt: new Date().toISOString(),
  };
}

/**
 * Lists every concept name that appears under any taxonomy.
 *
 * @param allFacts - taxonomy → (concept name → value) map; taxonomy
 *        entries that are not objects are ignored.
 * @returns the distinct concept names in default `Array.prototype.sort` order.
 */
function collectConceptNames(allFacts: Record<string, Record<string, unknown>>): string[] {
  const names = Object.values(allFacts).flatMap(perTaxonomy =>
    perTaxonomy && typeof perTaxonomy === 'object' ? Object.keys(perTaxonomy) : [],
  );
  // A Set drops names that are reported under more than one taxonomy.
  const distinct = Array.from(new Set(names));
  distinct.sort();
  return distinct;
}

/* ------------------------------------------------------------------ *
 * Tool 3 — secSubmissions(cik, filingTypes?)
 * Returns the company's filing history (recent filings list),
 * optionally narrowed to specific filing types (e.g. "10-K,10-Q").
 * ------------------------------------------------------------------ */

/** One row of a company's recent-filings list, as assembled by secSubmissions(). */
export interface SecFilingHit {
  /** Accession number exactly as it appears in the response. */
  readonly accessionNumber: string;
  /** `filingDate` value for this row. */
  readonly filingDate: string;
  /** `reportDate` value for this row ('' when the source value was null). */
  readonly reportDate: string;
  /** Form type for this row. */
  readonly form: string;
  /** Primary document file name; may be ''. */
  readonly primaryDocument: string;
  /** Archive URL built from the unpadded CIK, the accession number with
   *  dashes removed, and `primaryDocument`; '' when `primaryDocument` is empty. */
  readonly primaryDocumentUrl: string;
}

/** Result of secSubmissions(): company header fields plus the filing list. */
export interface SecSubmissions {
  /** CIK as normalized by parseCik (zero-padded to 10 digits). */
  readonly cik: string;
  /** `name` from the response, or '' when absent or not a string. */
  readonly entityName: string;
  /** String entries of the response's `tickers` array (non-strings dropped). */
  readonly tickers: readonly string[];
  /** `sic` from the response, or '' when absent or not a string. */
  readonly sic: string;
  /** `sicDescription` from the response, or '' when absent or not a string. */
  readonly sicDescription: string;
  /** Rows read from `filings.recent`, after the optional form-type filter. */
  readonly filings: readonly SecFilingHit[];
  /** Upper-cased form types that were applied as a filter; undefined when none. */
  readonly filingTypesFilter?: readonly string[];
  /** The `url` reported on the HTTP response. */
  readonly sourceUrl: string;
  /** ISO-8601 timestamp taken when the result object was built. */
  readonly capturedAt: string;
}

/**
 * Fetches a company's submissions document and returns its recent
 * filings, optionally narrowed to specific form types.
 *
 * Only the `filings.recent` column arrays are read. Rows are built up to
 * the length of the shortest of the five columns used.
 *
 * @param cik - any form parseCik accepts.
 * @param filingTypes - optional comma-separated form types (e.g.
 *        "10-K,10-Q"); compared case-insensitively for exact equality.
 * @throws RetrievalError `invalid-request` for an unparseable CIK, the
 *         translated HTTP failure, or `internal` when the body is not a
 *         JSON object.
 */
export async function secSubmissions(cik: string, filingTypes?: string): Promise<SecSubmissions> {
  const padded = parseCik(cik);
  if (!padded) {
    throw new RetrievalError('invalid-request', `secSubmissions: not a valid CIK: "${cik}"`);
  }
  const url = `${SUBMISSIONS_BASE}/CIK${padded}.json`;
  await limiter.acquire();
  let res;
  try {
    res = await httpGet(url, { headers: SEC_HEADERS, maxBodyBytes: SUBMISSIONS_MAX_BYTES });
  } catch (err) {
    throw translateHttpError(err, url);
  }
  let parsed: unknown;
  try {
    parsed = JSON.parse(res.body);
  } catch (err) {
    throw new RetrievalError(
      'internal',
      `submissions JSON parse failed: ${err instanceof Error ? err.message : String(err)}`,
      { url },
    );
  }
  // Valid JSON is not necessarily an object: a literal `null` would make
  // the property reads below throw a raw TypeError. Report it as a
  // RetrievalError like every other failure.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new RetrievalError('internal', 'submissions response was not a JSON object.', { url });
  }
  const doc = parsed as Record<string, unknown>;

  const entityName = typeof doc.name === 'string' ? doc.name : '';
  const tickersRaw = Array.isArray(doc.tickers) ? doc.tickers : [];
  const tickers = tickersRaw.filter((t): t is string => typeof t === 'string');
  const sic = typeof doc.sic === 'string' ? doc.sic : '';
  const sicDescription = typeof doc.sicDescription === 'string' ? doc.sicDescription : '';

  const filingsNode = doc.filings;
  const recent =
    filingsNode !== null && typeof filingsNode === 'object'
      ? (filingsNode as Record<string, unknown>).recent
      : undefined;
  const filings: SecFilingHit[] = [];
  if (recent !== null && typeof recent === 'object') {
    const columns = recent as Record<string, unknown>;
    const accs = pickStrings(columns.accessionNumber);
    const dates = pickStrings(columns.filingDate);
    const reports = pickStrings(columns.reportDate);
    const forms = pickStrings(columns.form);
    const docs = pickStrings(columns.primaryDocument);
    // The response is column-oriented; stop at the shortest column so
    // every row has all five fields.
    const n = Math.min(accs.length, dates.length, forms.length, docs.length, reports.length);
    // Archive paths use the CIK without its leading zeros.
    const cikNumber = parseInt(padded, 10);
    for (let i = 0; i < n; i++) {
      const accession = accs[i]!;
      const accNoDash = accession.replace(/-/g, '');
      const primary = docs[i]!;
      const primaryUrl = primary
        ? `https://www.sec.gov/Archives/edgar/data/${cikNumber}/${accNoDash}/${primary}`
        : '';
      filings.push({
        accessionNumber: accession,
        filingDate: dates[i]!,
        reportDate: reports[i]!,
        form: forms[i]!,
        primaryDocument: primary,
        primaryDocumentUrl: primaryUrl,
      });
    }
  }

  const formList = (filingTypes ?? '')
    .split(',')
    .map(s => s.trim().toUpperCase())
    .filter(s => s.length > 0);
  const wantedForms = new Set(formList);
  const filtered =
    formList.length === 0 ? filings : filings.filter(f => wantedForms.has(f.form.toUpperCase()));

  return {
    cik: padded,
    entityName,
    tickers,
    sic,
    sicDescription,
    filings: filtered,
    filingTypesFilter: formList.length > 0 ? formList : undefined,
    sourceUrl: res.url,
    capturedAt: new Date().toISOString(),
  };
}

/**
 * Coerces an unknown value into an array of strings.
 *
 * @param v - expected to be an array; anything else yields [].
 * @returns one string per element: strings pass through, null/undefined
 *          become '', everything else goes through String().
 */
function pickStrings(v: unknown): readonly string[] {
  const toText = (item: unknown): string => {
    if (typeof item === 'string') return item;
    if (item === null || item === undefined) return '';
    return String(item);
  };
  return Array.isArray(v) ? v.map(toText) : [];
}

/* ------------------------------------------------------------------ *
 * Tool 4 — secFilingDocument(documentUrl, maxChars?)
 * Downloads a filing document, strips HTML tags + entities, and
 * truncates at maxChars. URL is whitelisted to sec.gov / data.sec.gov.
 * ------------------------------------------------------------------ */

/** Result of secFilingDocument(): a filing body reduced to plain text. */
export interface SecFilingDocument {
  /** The `url` reported on the HTTP response. */
  readonly url: string;
  /** The `contentType` reported on the HTTP response. */
  readonly contentType: string;
  /** HTML-stripped text, cut to the effective character limit. */
  readonly text: string;
  /** Set from `res.body.length` before stripping.
   *  NOTE(review): if the body is a decoded string this is a character
   *  count, not a byte count — confirm against http-client. */
  readonly originalBytes: number;
  /** True when the stripped text was longer than the limit and was cut. */
  readonly truncated: boolean;
  /** ISO-8601 timestamp taken when the result object was built. */
  readonly capturedAt: string;
}

/**
 * Downloads a filing document from an SEC host, strips it to plain text
 * and truncates the text to a character limit.
 *
 * @param documentUrl - absolute http(s) URL whose host is sec.gov or a
 *        subdomain of it.
 * @param maxChars - optional character limit; normalized by clampChars().
 * @throws RetrievalError `invalid-request` for a missing or malformed
 *         URL, a non-SEC host, or a non-HTTP(S) scheme; otherwise the
 *         translated HTTP failure.
 */
export async function secFilingDocument(documentUrl: string, maxChars?: number): Promise<SecFilingDocument> {
  if (typeof documentUrl !== 'string' || !documentUrl) {
    throw new RetrievalError('invalid-request', 'secFilingDocument: documentUrl is required.');
  }
  let parsedUrl: URL;
  try {
    parsedUrl = new URL(documentUrl);
  } catch {
    throw new RetrievalError('invalid-request', `secFilingDocument: invalid URL "${documentUrl}".`);
  }
  if (!/(^|\.)sec\.gov$/i.test(parsedUrl.hostname)) {
    throw new RetrievalError(
      'invalid-request',
      `secFilingDocument: refusing non-SEC host "${parsedUrl.hostname}". Only sec.gov hosts are allowed.`,
    );
  }
  // The host check alone lets through URLs such as file://sec.gov/... or
  // ftp://sec.gov/...; only web URLs are meaningful here.
  if (parsedUrl.protocol !== 'https:' && parsedUrl.protocol !== 'http:') {
    throw new RetrievalError(
      'invalid-request',
      `secFilingDocument: unsupported URL scheme "${parsedUrl.protocol}". Only http(s) URLs are allowed.`,
    );
  }
  const limit = clampChars(maxChars);
  await limiter.acquire();
  let res;
  try {
    res = await httpGet(documentUrl, {
      // Filing bodies are HTML or text rather than JSON, so use the shared any-type header set.
      headers: SEC_HEADERS_ANY,
      maxBodyBytes: FILING_DOC_MAX_BYTES,
    });
  } catch (err) {
    throw translateHttpError(err, documentUrl);
  }
  // Measured on the raw body, before any stripping.
  const originalBytes = res.body.length;
  const stripped = stripHtml(res.body);
  const truncated = stripped.length > limit;
  const text = truncated ? stripped.slice(0, limit) : stripped;
  return {
    url: res.url,
    contentType: res.contentType,
    text,
    originalBytes,
    truncated,
    capturedAt: new Date().toISOString(),
  };
}

/**
 * Normalizes a caller-supplied character limit.
 *
 * Anything that is not a finite number greater than zero yields
 * DEFAULT_FILING_TRUNCATE_CHARS. Usable values are rounded down and
 * capped at 400,000; a value between 0 and 1 therefore rounds down to 0.
 */
function clampChars(n?: number): number {
  if (typeof n !== 'number') return DEFAULT_FILING_TRUNCATE_CHARS;
  if (!Number.isFinite(n) || n <= 0) return DEFAULT_FILING_TRUNCATE_CHARS;
  const whole = Math.floor(n);
  return whole < 400_000 ? whole : 400_000;
}

/** Named character references that are decoded; any other named reference becomes a space. */
const NAMED_HTML_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
]);

/**
 * Replacement callback for one character reference matched by stripHtml().
 * Exactly one of `dec`, `hex`, `name` is defined for any match.
 */
function decodeHtmlEntity(match: string, dec?: string, hex?: string, name?: string): string {
  if (name !== undefined) {
    return NAMED_HTML_ENTITIES.get(name.toLowerCase()) ?? ' ';
  }
  const code = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex ?? '', 16);
  const isScalar =
    Number.isInteger(code) && code > 0 && code <= 0x10ffff && !(code >= 0xd800 && code <= 0xdfff);
  // Leave references that do not name a valid code point untouched.
  if (!isScalar) return match;
  // A numeric no-break space is treated like &nbsp;.
  return code === 0xa0 ? ' ' : String.fromCodePoint(code);
}

/**
 * Reduces an HTML document to plain text.
 *
 * Script and style elements, comments and all remaining tags are
 * replaced by a space. Character references are then decoded in a single
 * pass, so text such as `&amp;lt;` becomes `&lt;` and is never decoded a
 * second time. Decimal and hexadecimal numeric references are decoded
 * too. Finally, runs of horizontal whitespace collapse to one space,
 * whitespace around line breaks collapses to a single newline, and the
 * result is trimmed.
 *
 * @param input - raw document text.
 * @returns the stripped text.
 */
export function stripHtml(input: string): string {
  return input
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ')
    .replace(/<!--[\s\S]*?-->/g, ' ')
    .replace(/<[^>]+>/g, ' ')
    // Tags are removed before decoding so a decoded '<' is never mistaken for markup.
    .replace(/&(?:#(\d+)|#x([0-9a-f]+)|([a-z]+));/gi, decodeHtmlEntity)
    .replace(/[ \t\r\f\v]+/g, ' ')
    .replace(/\s*\n\s*/g, '\n')
    .trim();
}

/* ------------------------------------------------------------------ *
 * Anthropic tool descriptors. Pulled in by agents that want to expose
 * SEC retrieval as LLM tools. Names use snake_case per Anthropic
 * convention; the executor maps them back to the camelCase functions
 * above.
 * ------------------------------------------------------------------ */

/** Shape of one entry in SEC_TOOLS: a tool name, its description and a flat JSON-schema for its input. */
export interface SecToolDescriptor {
  /** snake_case tool name; executeSecTool() dispatches on this value. */
  readonly name: string;
  /** Natural-language description of what the tool does. */
  readonly description: string;
  /** Input schema: an object with named, described properties. */
  readonly input_schema: {
    readonly type: 'object';
    readonly properties: Record<string, { type: string; description: string }>;
    /** Names of the properties that must be present. */
    readonly required: readonly string[];
  };
}

/**
 * Descriptors for the four SEC tools. Each `name` corresponds to a case
 * in executeSecTool(), and each property name matches the input key that
 * executeSecTool() reads for that tool.
 */
export const SEC_TOOLS: readonly SecToolDescriptor[] = [
  // Company lookup → secEdgarCompanies(searchTerms)
  {
    name: 'sec_edgar_companies',
    description:
      'Resolve company tickers or names to SEC CIK numbers. Accepts a comma-separated search string ' +
      '(e.g. "JPM, Bank of America, GS"). Returns an array of {cik, ticker, name} matches. ' +
      'Use this FIRST to obtain the CIK before calling sec_financials or sec_submissions.',
    input_schema: {
      type: 'object',
      properties: {
        searchTerms: {
          type: 'string',
          description: 'One or more tickers / company names / CIKs, comma- or newline-separated.',
        },
      },
      required: ['searchTerms'],
    },
  },
  // XBRL company facts → secFinancials(cik, concepts?)
  {
    name: 'sec_financials',
    description:
      'Fetch XBRL company facts from SEC EDGAR for a CIK. Two modes:\n' +
      ' - Summary mode (no `concepts` arg): returns ONLY the entity name and the list of available ' +
      'concept names plus per-taxonomy counts; the facts body is omitted. Use this to discover which ' +
      'concepts a company reports, then re-call in targeted mode.\n' +
      ' - Targeted mode (with `concepts`): returns the full time series (values, periods, units, ' +
      'filing accessions) for the named concepts only.\n' +
      'When the JobRequest already names the concepts you need (e.g. via targetMetrics or a ' +
      'methodology\'s required input fields), call targeted mode directly. Do not call summary mode ' +
      'first if you already know the concept names.',
    input_schema: {
      type: 'object',
      properties: {
        cik: {
          type: 'string',
          description: '10-digit padded CIK (or any numeric form; will be padded). Required.',
        },
        concepts: {
          type: 'string',
          description:
            'Comma-separated XBRL concept names to fetch (e.g. "Revenues,OperatingExpenses"). ' +
            'Omit to receive only the summary (entity + available concept names).',
        },
      },
      required: ['cik'],
    },
  },
  // Filing history → secSubmissions(cik, filingTypes?)
  {
    name: 'sec_submissions',
    description:
      'Fetch a company\'s filing history (recent filings) for a CIK from SEC EDGAR. ' +
      'Optionally filter by a comma-separated list of filing types (e.g. "10-K,10-Q,8-K"). ' +
      'Returns {accessionNumber, filingDate, reportDate, form, primaryDocument, primaryDocumentUrl} per filing.',
    input_schema: {
      type: 'object',
      properties: {
        cik: { type: 'string', description: '10-digit padded CIK. Required.' },
        filingTypes: {
          type: 'string',
          description: 'Optional comma-separated filing types to filter (e.g. "10-K,10-Q").',
        },
      },
      required: ['cik'],
    },
  },
  // Filing body as plain text → secFilingDocument(documentUrl, maxChars?)
  {
    name: 'sec_filing_document',
    description:
      'Download a SEC filing document (10-K, 10-Q, 8-K HTML body) by its primaryDocumentUrl. ' +
      'Strips HTML to plain text and truncates at maxChars (default 50,000). ' +
      'Only sec.gov URLs are allowed.',
    input_schema: {
      type: 'object',
      properties: {
        documentUrl: { type: 'string', description: 'A primaryDocumentUrl from sec_submissions.' },
        maxChars: {
          type: 'number',
          description: 'Maximum characters of stripped text to return (default 50,000, max 400,000).',
        },
      },
      required: ['documentUrl'],
    },
  },
];

/** Envelope returned by executeSecTool(): either a result or a categorized error. */
export interface SecToolResult {
  /** True when the tool ran to completion; false when `error` is set. */
  readonly ok: boolean;
  /** The called function's return value (present when `ok` is true). */
  readonly result?: unknown;
  /** Failure details: a RetrievalError category, 'unknown-tool', or 'internal'. */
  readonly error?: { readonly category: string; readonly message: string };
}

/**
 * Reads the `cik` tool argument. Strings pass through unchanged. A
 * non-negative integer number is stringified, because a caller may send
 * a CIK as a JSON number even though the schema asks for a string.
 * Anything else yields '', which the called function rejects as an
 * invalid CIK.
 */
function readCikInput(input: Record<string, unknown>): string {
  const raw = input.cik;
  if (typeof raw === 'string') return raw;
  if (typeof raw === 'number' && Number.isSafeInteger(raw) && raw >= 0) return String(raw);
  return '';
}

/**
 * Execute a tool by Anthropic tool name. Returns a JSON-safe object.
 *
 * Never throws: a RetrievalError is reported with its own category, any
 * other exception as category 'internal', and an unrecognized tool name
 * as category 'unknown-tool'.
 *
 * @param name - one of the names listed in SEC_TOOLS.
 * @param rawInput - the tool input; non-object values are treated as {}.
 */
export async function executeSecTool(name: string, rawInput: unknown): Promise<SecToolResult> {
  const input = (rawInput && typeof rawInput === 'object') ? (rawInput as Record<string, unknown>) : {};
  try {
    switch (name) {
      case 'sec_edgar_companies': {
        const searchTerms = typeof input.searchTerms === 'string' ? input.searchTerms : '';
        const result = await secEdgarCompanies(searchTerms);
        return { ok: true, result };
      }
      case 'sec_financials': {
        const concepts = typeof input.concepts === 'string' ? input.concepts : undefined;
        const result = await secFinancials(readCikInput(input), concepts);
        return { ok: true, result };
      }
      case 'sec_submissions': {
        const filingTypes = typeof input.filingTypes === 'string' ? input.filingTypes : undefined;
        const result = await secSubmissions(readCikInput(input), filingTypes);
        return { ok: true, result };
      }
      case 'sec_filing_document': {
        const documentUrl = typeof input.documentUrl === 'string' ? input.documentUrl : '';
        const maxChars = typeof input.maxChars === 'number' ? input.maxChars : undefined;
        const result = await secFilingDocument(documentUrl, maxChars);
        return { ok: true, result };
      }
      default:
        return { ok: false, error: { category: 'unknown-tool', message: `unknown SEC tool "${name}"` } };
    }
  } catch (err) {
    if (err instanceof RetrievalError) {
      return { ok: false, error: { category: err.category, message: err.message } };
    }
    return {
      ok: false,
      error: { category: 'internal', message: err instanceof Error ? err.message : String(err) },
    };
  }
}

/* ------------------------------------------------------------------ *
 * Legacy RetrievalConnector wrapper — registers `sec-edgar` with the
 * dispatcher so JobRequest.sources can include it. fetch() resolves
 * the CIK from the entity, then calls secFinancials(cik) with no
 * concept filter, so the payload it returns is the summary-mode
 * response (entity name, available concept names, per-taxonomy
 * counts) rather than the full facts body. It is kept so the old
 * dispatch path still works for any caller that uses it. Modern
 * agents prefer the four-function surface above.
 * ------------------------------------------------------------------ */

/**
 * RetrievalConnector registered under the source name `sec-edgar`.
 * fetch() resolves a CIK for the requested entity and returns the
 * serialized result of secFinancials(cik) as a RawPayload.
 */
export class SecEdgarConnector implements RetrievalConnector {
  readonly name = 'sec-edgar';
  readonly authRequired = false;
  readonly rateLimit = { requestsPerSecond: 10, burstSize: 10 };

  /** Always resolves to true; no probe request is made. */
  async isAvailable(): Promise<boolean> {
    return true;
  }

  /**
   * Resolves the entity to a CIK and fetches its company-facts payload.
   *
   * The entity id and aliases are first tried as literal CIKs. If none
   * parses, they are joined into one search string for
   * secEdgarCompanies() and the first match is used.
   *
   * @throws RetrievalError `no-content` when no CIK can be resolved, plus
   *         anything secEdgarCompanies() or secFinancials() throws.
   */
  async fetch(params: FetchParams): Promise<RawPayload> {
    const { entity } = params;
    const candidates = [entity.id, ...entity.aliases];

    let cik: string | undefined = candidates
      .map(candidate => parseCik(candidate))
      .find((parsed): parsed is string => !!parsed);

    if (!cik) {
      // No candidate is a literal CIK; fall back to a ticker/name lookup.
      const nonBlank = candidates.filter(c => c && c.trim().length > 0);
      const found = await secEdgarCompanies(nonBlank.join(', '));
      cik = found[0]?.cik;
    }

    if (!cik) {
      const aliasText = entity.aliases.join(', ') || '<none>';
      throw new RetrievalError(
        'no-content',
        `unable to resolve CIK for entity "${entity.id}" (aliases: ${aliasText})`,
        { entity: entity.id, aliases: entity.aliases },
      );
    }

    const facts = await secFinancials(cik);
    const metadata = {
      cik,
      entity: entity.id,
      period: params.period,
      endpoint: 'companyfacts',
    };
    return {
      source: this.name,
      sourceUrl: facts.sourceUrl,
      capturedAt: facts.capturedAt,
      contentType: 'application/json',
      rawContent: JSON.stringify(facts),
      metadata,
    };
  }
}

/* ------------------------------------------------------------------ *
 * Helpers
 * ------------------------------------------------------------------ */

/**
 * Normalizes a CIK to its 10-digit, zero-padded form.
 *
 * Surrounding whitespace is ignored, and an optional case-insensitive
 * `CIK` prefix (with optional whitespace after it) is dropped. What
 * remains must be 1–10 ASCII digits.
 *
 * @param value - candidate CIK text, e.g. "320193", "CIK0000320193".
 * @returns the padded CIK, or null when `value` is not a CIK.
 */
export function parseCik(value: string): string | null {
  if (typeof value !== 'string') return null;
  // Trim before stripping the prefix so leading whitespace cannot hide it from the ^ anchor.
  const digits = value.trim().replace(/^CIK/i, '').trim();
  if (!/^\d{1,10}$/.test(digits)) return null;
  return digits.padStart(10, '0');
}

/**
 * Converts a failure raised by httpGet into a RetrievalError.
 *
 * HttpError categories map as follows:
 *   timeout / network / aborted → 'unavailable'
 *   body-too-large              → 'internal'
 *   status 404                  → 'no-content'
 *   status 401 / 403            → 'auth-failed'
 *   status 429                  → 'rate-limited'
 *   status 400                  → 'invalid-request'
 *   any other status            → 'internal'
 * Anything that is not an HttpError becomes 'internal', carrying the
 * original message.
 *
 * @param err - the caught value.
 * @param url - the request URL, attached to the error details.
 */
export function translateHttpError(err: unknown, url: string): RetrievalError {
  if (err instanceof HttpError) {
    const { category } = err;
    if (category === 'timeout' || category === 'network' || category === 'aborted') {
      return new RetrievalError('unavailable', err.message, { url });
    }
    if (category === 'body-too-large') {
      return new RetrievalError('internal', err.message, { url });
    }
    if (category === 'status') {
      switch (err.status) {
        case 404:
          return new RetrievalError('no-content', err.message, { url });
        case 401:
        case 403:
          return new RetrievalError('auth-failed', err.message, { url });
        case 429:
          return new RetrievalError('rate-limited', err.message, { url });
        case 400:
          return new RetrievalError('invalid-request', err.message, { url });
        default:
          return new RetrievalError('internal', err.message, { url });
      }
    }
  }
  // Non-HttpError values, and any HttpError category not handled above.
  const message = err instanceof Error ? err.message : String(err);
  return new RetrievalError('internal', message, { url });
}