BID · Console
Baseline · Intelligence · Decision
src/tools/retrieval/connectors/sec-edgar-filings.ts 21,706 bytes · typescript
/**
 * SEC EDGAR filings connector — unstructured retrieval surface.
 *
 * Three pure-retrieval functions for working with the full text of
 * SEC filings (10-K, 10-Q, 8-K, S-1, …). None of them needs an LLM:
 *
 *   1. secFilingIndex(cik, accessionNumber)
 *        Returns every file inside a filing (primary document plus
 *        all exhibits / data files), with type, size, and URL.
 *
 *   2. secFilingSections(documentUrl, items?)
 *        Downloads a 10-K / 10-Q (or any HTML filing body), strips
 *        HTML to plain text, and splits the text into the standard
 *        SEC Items (Item 1, 1A, 7, 7A, 8, …) using a deterministic
 *        regex — no model in the loop. Pass `items` to narrow the
 *        response to a specific subset (e.g. "7,7A,1A").
 *
 *   3. secFullTextSearch(query, opts?)
 *        Hits the EDGAR full-text search API (efts.sec.gov) to find
 *        filings whose body contains a phrase. Optionally filter by
 *        forms, CIK, or date range.
 *
 * All three share the User-Agent and 10 req/s rate limiter exported
 * by sec-edgar.ts so there is no risk of SEC seeing a doubled
 * outbound rate. Every failure is translated to RetrievalError so the
 * dispatcher / agent never receives a raw exception (Std 12).
 *
 * The Anthropic tool descriptors at the bottom (SEC_FILING_TOOLS,
 * executeSecFilingTool) match the shape used by the existing SEC_TOOLS
 * in sec-edgar.ts so agents can expose all seven SEC tools in one
 * combined array.
 */

import {
  RetrievalError,
  type FetchParams as _FetchParams,
} from '../interface.js';
import { httpGet, type HttpResponse } from '../http-client.js';
import {
  SEC_USER_AGENT,
  SEC_HEADERS_JSON,
  SEC_HEADERS_ANY,
  secLimiter,
  parseCik,
  stripHtml,
  translateHttpError,
} from './sec-edgar.js';

/* Reference the otherwise-unused `FetchParams` import so tsc with
 * verbatimModuleSyntax stays happy. Remove this alias once
 * `FetchParams` is actually used in this file. */
type _Unused = _FetchParams;

/** One mebibyte; the response-size caps below are multiples of it. */
const BYTES_PER_MIB = 1024 * 1024;

/** `maxBodyBytes` cap for a filing's index.json manifest (4 MiB). */
const FILING_INDEX_MAX_BYTES = 4 * BYTES_PER_MIB;
/** `maxBodyBytes` cap for a full-text search response (8 MiB). */
const FULL_TEXT_SEARCH_MAX_BYTES = 8 * BYTES_PER_MIB;
/** `maxBodyBytes` cap for a downloaded filing body (64 MiB). */
const FILING_BODY_MAX_BYTES = 64 * BYTES_PER_MIB;

/** Sections shorter than this many characters are dropped unless the caller overrides it. */
const DEFAULT_SECTION_MIN_CHARS = 500;

/** Endpoint queried by secFullTextSearch. */
const FULL_TEXT_SEARCH_URL = 'https://efts.sec.gov/LATEST/search-index';

/* ------------------------------------------------------------------ *
 * Tool 5 — secFilingIndex(cik, accessionNumber)
 *
 * Returns every file inside a filing. SEC publishes a JSON manifest
 * at /Archives/edgar/data/{cikInt}/{accessionNoDash}/index.json that
 * lists each document (name, type, size, last-modified). The function
 * normalises the accession number (with or without dashes), builds the
 * canonical URL, and returns an array of FilingFile records ready for
 * downstream retrieval via secFilingDocument or secFilingSections.
 * ------------------------------------------------------------------ */

/**
 * One file listed in a filing's `index.json` manifest, as produced by
 * secFilingIndex.
 */
export interface SecFilingFile {
  /** File name from the manifest entry; entries without a string name are skipped. */
  readonly name: string;
  /** Manifest `type` value; empty string when it is not a string. */
  readonly type: string;
  /** Manifest `size` as a number; null when it is neither a number nor an all-digit string. */
  readonly sizeBytes: number | null;
  /** Manifest `last-modified` value, passed through unparsed; empty string when not a string. */
  readonly lastModified: string;
  /** The filing directory URL joined with `name`. */
  readonly url: string;
}

/** Result of secFilingIndex: the file listing of a single filing. */
export interface SecFilingIndex {
  /** The CIK as normalised by parseCik. */
  readonly cik: string;
  /** Accession number in dashed form (10-2-6 digits). */
  readonly accessionNumber: string;
  /** `directory.name` from the manifest; empty string when it is not a string. */
  readonly directoryName: string;
  /** URL of the `index.json` manifest that was fetched. */
  readonly indexUrl: string;
  /** One record per named manifest entry, in manifest order. */
  readonly files: readonly SecFilingFile[];
  /** ISO-8601 timestamp taken when the result object was built. */
  readonly capturedAt: string;
}

/**
 * List every file inside one SEC filing by fetching the filing
 * directory's `index.json` manifest.
 *
 * @param cik - Filer CIK; validated and normalised through parseCik.
 * @param accessionNumber - Accession number, dashed (10-2-6 digits) or
 *   as 18 bare digits.
 * @returns The normalised identifiers, the manifest URL, and one
 *   SecFilingFile per manifest entry that has a string name.
 * @throws RetrievalError `invalid-request` when the CIK or accession
 *   number is malformed; whatever translateHttpError produces when the
 *   HTTP request fails; `internal` when the body is not valid JSON or
 *   is not a JSON object.
 */
export async function secFilingIndex(
  cik: string,
  accessionNumber: string,
): Promise<SecFilingIndex> {
  const padded = parseCik(cik);
  if (!padded) {
    throw new RetrievalError('invalid-request', `secFilingIndex: not a valid CIK: "${cik}"`);
  }
  if (typeof accessionNumber !== 'string' || !accessionNumber.trim()) {
    throw new RetrievalError(
      'invalid-request',
      'secFilingIndex: accessionNumber is required (e.g. "0000019617-25-000132").',
    );
  }
  const dashed = normaliseAccession(accessionNumber);
  if (!dashed) {
    throw new RetrievalError(
      'invalid-request',
      `secFilingIndex: accessionNumber "${accessionNumber}" does not look like a valid SEC accession.`,
    );
  }

  /* The archive path uses the CIK as a plain integer (no zero padding)
   * and the accession number without dashes. */
  const noDash = dashed.replace(/-/g, '');
  const cikInt = parseInt(padded, 10);
  const dirBase = `https://www.sec.gov/Archives/edgar/data/${cikInt}/${noDash}`;
  const indexUrl = `${dirBase}/index.json`;

  await secLimiter.acquire();
  let res: HttpResponse;
  try {
    res = await httpGet(indexUrl, { headers: SEC_HEADERS_JSON, maxBodyBytes: FILING_INDEX_MAX_BYTES });
  } catch (err) {
    throw translateHttpError(err, indexUrl);
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(res.body);
  } catch (err) {
    throw new RetrievalError(
      'internal',
      `filing index JSON parse failed: ${err instanceof Error ? err.message : String(err)}`,
      { url: indexUrl },
    );
  }
  /* `null`, numbers and strings are valid JSON but not a manifest.
   * Reject them here so a property read on `null` can never escape as
   * a raw TypeError. */
  if (!parsed || typeof parsed !== 'object') {
    throw new RetrievalError('internal', 'filing index JSON was not an object.', { url: indexUrl });
  }
  const manifest = parsed as { directory?: { name?: unknown; item?: unknown } };

  const directoryName = typeof manifest.directory?.name === 'string' ? manifest.directory.name : '';
  const itemField: unknown = manifest.directory?.item;
  const rawItems: unknown[] = Array.isArray(itemField) ? itemField : [];

  const files: SecFilingFile[] = [];
  for (const raw of rawItems) {
    if (!raw || typeof raw !== 'object') continue;
    const e = raw as { name?: unknown; type?: unknown; size?: unknown; 'last-modified'?: unknown };
    const name = typeof e.name === 'string' ? e.name : '';
    if (!name) continue; // an entry without a name cannot be turned into a URL
    const type = typeof e.type === 'string' ? e.type : '';
    /* `size` is accepted as a number or as an all-digit string;
     * anything else is reported as null rather than guessed. */
    const sizeBytes =
      typeof e.size === 'number'
        ? e.size
        : typeof e.size === 'string' && /^\d+$/.test(e.size)
          ? parseInt(e.size, 10)
          : null;
    const lastModified = typeof e['last-modified'] === 'string' ? e['last-modified'] : '';
    files.push({
      name,
      type,
      sizeBytes,
      lastModified,
      url: `${dirBase}/${name}`,
    });
  }
  return {
    cik: padded,
    accessionNumber: dashed,
    directoryName,
    indexUrl,
    files,
    capturedAt: new Date().toISOString(),
  };
}

/**
 * Bring an accession number into its dashed 10-2-6 form.
 *
 * Accepts the dashed form as-is, or 18 bare digits which are regrouped.
 * Surrounding whitespace is ignored. Returns null for any other input.
 */
function normaliseAccession(raw: string): string | null {
  const candidate = raw.trim();
  if (/^\d{10}-\d{2}-\d{6}$/.test(candidate)) {
    return candidate;
  }
  const compact = /^(\d{10})(\d{2})(\d{6})$/.exec(candidate);
  return compact ? `${compact[1]}-${compact[2]}-${compact[3]}` : null;
}

/* ------------------------------------------------------------------ *
 * Tool 6 — secFilingSections(documentUrl, items?)
 *
 * Downloads a filing body, strips HTML, and splits the plain text into
 * the standard SEC Items (e.g. Item 1, 1A, 7, 7A). The split is
 * deterministic: regex-driven on the canonical "Item N." / "ITEM NA."
 * headings. No LLM is involved.
 *
 * Returns one entry per detected item with the full slice between
 * that item's heading and the next. The first occurrence of each
 * item is typically the TOC entry and is filtered out by a minimum-
 * length heuristic (sections shorter than `minSectionChars` are
 * dropped; default 500). Pass `items="7,7A"` to narrow to a subset.
 * ------------------------------------------------------------------ */

/** One detected SEC Item section of a filing body. */
export interface SecFilingSection {
  /** Upper-cased item identifier: one or two digits plus an optional letter, e.g. "7" or "1A". */
  readonly item: string;
  /** "Item <item>", followed by " — " and up to 120 characters of same-line title when present. */
  readonly heading: string;
  /** Character offset into the stripped document text at which this section was located. */
  readonly startOffset: number;
  /** Trimmed text from this heading up to the next detected heading (or end of document). */
  readonly text: string;
  /** Length of `text` in characters. */
  readonly charCount: number;
}

/** Result of secFilingSections: a filing body split into Item sections. */
export interface SecFilingSections {
  /** URL reported by the HTTP response (`res.url`). */
  readonly url: string;
  /** Content type reported by the HTTP response. */
  readonly contentType: string;
  /** Length of the whole document after HTML stripping. */
  readonly fullCharCount: number;
  /** The parsed, upper-cased item filter; undefined when no valid filter token was supplied. */
  readonly itemsFilter?: readonly string[];
  /** Detected sections (after filtering), ordered by position in the document. */
  readonly sections: readonly SecFilingSection[];
  /** ISO-8601 timestamp taken when the result object was built. */
  readonly capturedAt: string;
}

/** Optional knobs for secFilingSections. */
export interface SecFilingSectionsOptions {
  /**
   * Comma-separated item identifiers to keep (e.g. "7,7A,1A"). Tokens
   * that are not digits plus an optional single letter are ignored; if
   * no valid token remains, all detected sections are returned.
   */
  readonly items?: string;
  /**
   * Sections shorter than this many characters are dropped (this is how
   * table-of-contents entries are filtered). Defaults to 500; values
   * below 1 are treated as 1.
   */
  readonly minSectionChars?: number;
}

/**
 * Download a filing body from sec.gov, strip its HTML, and split the
 * plain text into SEC Item sections.
 *
 * @param documentUrl - Absolute http(s) URL on sec.gov or one of its
 *   subdomains. Any other scheme or host is refused before a request is
 *   made.
 * @param opts - Optional item filter and minimum section length; see
 *   SecFilingSectionsOptions.
 * @returns The detected sections (restricted to `opts.items` when a
 *   valid filter was given) plus response metadata.
 * @throws RetrievalError `invalid-request` for a missing, unparseable,
 *   non-http(s) or non-sec.gov URL; whatever translateHttpError
 *   produces when the HTTP request fails.
 */
export async function secFilingSections(
  documentUrl: string,
  opts: SecFilingSectionsOptions = {},
): Promise<SecFilingSections> {
  if (typeof documentUrl !== 'string' || !documentUrl) {
    throw new RetrievalError('invalid-request', 'secFilingSections: documentUrl is required.');
  }
  let parsedUrl: URL;
  try {
    parsedUrl = new URL(documentUrl);
  } catch {
    throw new RetrievalError('invalid-request', `secFilingSections: invalid URL "${documentUrl}".`);
  }
  /* The hostname check alone would let through e.g. file://sec.gov/…
   * or ftp://www.sec.gov/…, so the scheme is pinned as well. */
  if (parsedUrl.protocol !== 'https:' && parsedUrl.protocol !== 'http:') {
    throw new RetrievalError(
      'invalid-request',
      `secFilingSections: refusing URL scheme "${parsedUrl.protocol}". Only http(s) URLs are allowed.`,
    );
  }
  /* Accept sec.gov itself and any subdomain; the (^|\.) anchor keeps
   * look-alikes such as "notsec.gov" out. */
  if (!/(^|\.)sec\.gov$/i.test(parsedUrl.hostname)) {
    throw new RetrievalError(
      'invalid-request',
      `secFilingSections: refusing non-SEC host "${parsedUrl.hostname}". Only sec.gov hosts are allowed.`,
    );
  }

  await secLimiter.acquire();
  let res: HttpResponse;
  try {
    res = await httpGet(documentUrl, {
      headers: { 'User-Agent': SEC_USER_AGENT, Accept: '*/*' },
      maxBodyBytes: FILING_BODY_MAX_BYTES,
    });
  } catch (err) {
    throw translateHttpError(err, documentUrl);
  }

  const text = stripHtml(res.body);

  /* Math.max(1, NaN) is NaN, and every `length < NaN` comparison is
   * false, which would silently disable the TOC filter. Treat NaN like
   * "not provided". */
  const requestedMin = opts.minSectionChars;
  const min = Math.max(
    1,
    requestedMin == null || Number.isNaN(requestedMin) ? DEFAULT_SECTION_MIN_CHARS : requestedMin,
  );

  const wantItems = parseItemFilter(opts.items);
  const sections = splitIntoItems(text, min);
  const wanted = new Set(wantItems);
  const filtered =
    wanted.size === 0 ? sections : sections.filter(s => wanted.has(s.item.toUpperCase()));

  return {
    url: res.url,
    contentType: res.contentType,
    fullCharCount: text.length,
    itemsFilter: wantItems.length > 0 ? wantItems : undefined,
    sections: filtered,
    capturedAt: new Date().toISOString(),
  };
}

/**
 * Turn a comma-separated item list such as "7, 7a, 1A" into upper-cased
 * identifiers. Tokens that are not digits followed by at most one letter
 * are dropped; a missing or blank list yields an empty array.
 */
function parseItemFilter(items?: string): string[] {
  if (typeof items !== 'string' || !items.trim()) {
    return [];
  }
  const accepted: string[] = [];
  for (const piece of items.split(',')) {
    const token = piece.trim().toUpperCase();
    if (/^\d+[A-Z]?$/.test(token)) {
      accepted.push(token);
    }
  }
  return accepted;
}

/**
 * Find every "Item N[A]" heading in `text` and slice the text between
 * consecutive headings.
 *
 * - Slices shorter than `minChars` (after trimming) are discarded;
 *   those are almost always table-of-contents entries.
 * - When an item still occurs more than once, only its longest slice
 *   is kept.
 * - `startOffset` is the index of the heading's "Item" token in `text`,
 *   so `text.slice(startOffset, startOffset + charCount)` reproduces
 *   the section's `text`.
 *
 * Pure function, no I/O, no model.
 *
 * @param text - Plain (HTML-stripped) document text.
 * @param minChars - Minimum trimmed slice length for a section to be kept.
 * @returns Kept sections ordered by `startOffset`; empty when no
 *   heading is found.
 */
function splitIntoItems(text: string, minChars: number): SecFilingSection[] {
  /* Match "Item 1.", "ITEM 1A.", "Item 7 -" at the start of the text or
   * after a newline. Group 1 is the leading newline/indent, captured
   * only so the heading's true offset can be computed; group 2 is the
   * item identifier (the /i flag lets the letter suffix match lower
   * case too); group 3 is the section title that often follows on the
   * same line. */
  const headingRe = /((?:^|\n)\s*)item\s+(\d{1,2}[A-Z]?)\s*[\.\:\-–—]?\s*([^\n]{0,200})/gi;

  const boundaries: { item: string; heading: string; offset: number }[] = [];
  for (let m = headingRe.exec(text); m !== null; m = headingRe.exec(text)) {
    const item = (m[2] ?? '').toUpperCase();
    const tail = (m[3] ?? '').trim();
    const heading = `Item ${item}${tail ? ` — ${tail.replace(/\s+/g, ' ').slice(0, 120)}` : ''}`;
    /* m.index points at the leading newline/indent, not at "Item";
     * skip past that prefix. */
    boundaries.push({ item, heading, offset: m.index + (m[1] ?? '').length });
  }
  if (boundaries.length === 0) return [];

  /* exec() with the g flag yields non-overlapping matches left to
   * right, so `boundaries` is already ordered by offset. */
  const longestByItem = new Map<string, SecFilingSection>();
  boundaries.forEach((cur, i) => {
    const next = boundaries[i + 1];
    const end = next ? next.offset : text.length;
    const slice = text.slice(cur.offset, end).trim();
    if (slice.length < minChars) return;
    /* Same item seen again (e.g. TOC + body): keep the longer slice;
     * on a tie the earlier occurrence wins. */
    const prev = longestByItem.get(cur.item);
    if (!prev || slice.length > prev.charCount) {
      longestByItem.set(cur.item, {
        item: cur.item,
        heading: cur.heading,
        startOffset: cur.offset,
        text: slice,
        charCount: slice.length,
      });
    }
  });
  return [...longestByItem.values()].sort((a, b) => a.startOffset - b.startOffset);
}

/* ------------------------------------------------------------------ *
 * Tool 7 — secFullTextSearch(query, opts?)
 *
 * Calls https://efts.sec.gov/LATEST/search-index — the public endpoint
 * behind the EDGAR full-text search UI (www.sec.gov/edgar/search) — to
 * find filings matching a phrase. Useful for "find the 10-K that
 * mentions X" or "8-Ks announcing Y from any issuer". Returns hits with
 * accession, CIK, entity name, form, filed date, snippet, and document
 * URL.
 * ------------------------------------------------------------------ */

/**
 * One hit from the EDGAR full-text search. Every string field is an
 * empty string when the corresponding response field is missing or not
 * of the expected type.
 */
export interface SecSearchHit {
  /** `_source.adsh` from the response. */
  readonly accessionNumber: string;
  /** First string entry of `_source.ciks`, normalised through parseCik when it parses. */
  readonly cik: string;
  /** First string entry of `_source.display_names`. */
  readonly entityName: string;
  /** `_source.form` from the response. */
  readonly form: string;
  /** `_source.file_date` from the response, passed through unparsed. */
  readonly filedAt: string;
  /**
   * Populated from `_source.file_type`.
   * NOTE(review): the property name suggests a file name, but the value
   * comes from the `file_type` field — confirm which one is intended.
   */
  readonly fileName: string;
  /** `_score` from the response; 0 when it is not a number. */
  readonly score: number;
  /** `_source.teaser` from the response. */
  readonly snippet: string;
  /**
   * Document URL built from the CIK, the dash-less accession number and
   * the part of `_id` after its first colon; empty string when any of
   * the three is unavailable.
   */
  readonly url: string;
}

/** Result of secFullTextSearch: the echoed request plus the parsed hits. */
export interface SecSearchResult {
  /** The search phrase, trimmed. */
  readonly query: string;
  /** The parsed form filter; undefined when no form was given. */
  readonly forms?: readonly string[];
  /** The CIK filter echoed back (normalised through parseCik when it parses); undefined when none was supplied. */
  readonly cik?: string;
  /** `dateFrom` option, echoed back as supplied. */
  readonly dateFrom?: string;
  /** `dateTo` option, echoed back as supplied. */
  readonly dateTo?: string;
  /** `hits.total.value` from the response when numeric; otherwise the number of returned hits. */
  readonly totalHits: number;
  /** Parsed hits, in response order, truncated to the requested maximum. */
  readonly hits: readonly SecSearchHit[];
  /** The full request URL, including the query string. */
  readonly sourceUrl: string;
  /** ISO-8601 timestamp taken when the result object was built. */
  readonly capturedAt: string;
}

/** Optional filters for secFullTextSearch. */
export interface SecFullTextSearchOptions {
  /** Comma-separated form types (e.g. "10-K,10-Q"); blank tokens are ignored. Sent as `forms`. */
  readonly forms?: string;
  /** CIK of a single filer to restrict the search to. Sent as `ciks`. */
  readonly cik?: string;
  /** Start of the date range; sent verbatim as `startdt` together with `dateRange=custom`. */
  readonly dateFrom?: string;
  /** End of the date range; sent verbatim as `enddt` together with `dateRange=custom`. */
  readonly dateTo?: string;
  /**
   * Upper bound on the number of hits taken from the response
   * (default 50, minimum 1). This only truncates the returned page; it
   * is not sent to the server.
   */
  readonly maxHits?: number;
}

/**
 * Query the EDGAR full-text search endpoint for filings matching a
 * phrase, optionally restricted by form types, a single CIK, and a
 * date range.
 *
 * @param query - Search phrase; must be non-blank.
 * @param opts - Optional filters; see SecFullTextSearchOptions.
 * @returns The echoed request parameters, the total hit count reported
 *   by the server (or the number of parsed hits when absent), and up to
 *   `opts.maxHits` parsed hits.
 * @throws RetrievalError `invalid-request` when the query is blank or
 *   `opts.cik` is not a valid CIK; whatever translateHttpError produces
 *   when the HTTP request fails; `internal` when the body is not valid
 *   JSON or is not a JSON object.
 */
export async function secFullTextSearch(
  query: string,
  opts: SecFullTextSearchOptions = {},
): Promise<SecSearchResult> {
  if (typeof query !== 'string' || !query.trim()) {
    throw new RetrievalError('invalid-request', 'secFullTextSearch: query is required.');
  }
  const trimmedQuery = query.trim();
  const formList = (opts.forms ?? '')
    .split(',')
    .map(s => s.trim())
    .filter(s => s.length > 0);

  /* An unparseable CIK must fail loudly. Dropping it silently would run
   * an unfiltered search while the result still claims to be restricted
   * to that filer. */
  let cikFilter: string | undefined;
  if (opts.cik) {
    const padded = parseCik(opts.cik);
    if (!padded) {
      throw new RetrievalError('invalid-request', `secFullTextSearch: not a valid CIK: "${opts.cik}"`);
    }
    cikFilter = padded;
  }

  const params = new URLSearchParams();
  params.set('q', trimmedQuery);
  if (formList.length > 0) params.set('forms', formList.join(','));
  if (cikFilter) params.set('ciks', cikFilter);
  if (opts.dateFrom || opts.dateTo) {
    params.set('dateRange', 'custom');
    if (opts.dateFrom) params.set('startdt', opts.dateFrom);
    if (opts.dateTo) params.set('enddt', opts.dateTo);
  }
  const url = `${FULL_TEXT_SEARCH_URL}?${params.toString()}`;

  await secLimiter.acquire();
  let res: HttpResponse;
  try {
    res = await httpGet(url, {
      headers: { 'User-Agent': SEC_USER_AGENT, Accept: 'application/json' },
      maxBodyBytes: FULL_TEXT_SEARCH_MAX_BYTES,
    });
  } catch (err) {
    throw translateHttpError(err, url);
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(res.body);
  } catch (err) {
    throw new RetrievalError(
      'internal',
      `EDGAR full-text JSON parse failed: ${err instanceof Error ? err.message : String(err)}`,
      { url },
    );
  }
  /* `null` and primitives are valid JSON; reject them so a property
   * read on `null` cannot escape as a raw TypeError. */
  if (!parsed || typeof parsed !== 'object') {
    throw new RetrievalError('internal', 'EDGAR full-text response was not a JSON object.', { url });
  }
  const hitsNode = (parsed as { hits?: { total?: { value?: unknown }; hits?: unknown } }).hits;
  const hitList: unknown = hitsNode?.hits;
  const rawHits: unknown[] = Array.isArray(hitList) ? hitList : [];

  /* Math.max(1, NaN) is NaN and slice(0, NaN) is empty, so a NaN
   * maxHits is treated like "not provided". */
  const requestedMax = opts.maxHits;
  const maxHits =
    requestedMax == null || Number.isNaN(requestedMax) ? 50 : Math.max(1, requestedMax);

  /* Truncate first, then parse: malformed entries inside the window
   * are skipped and not back-filled from beyond it. */
  const hits: SecSearchHit[] = [];
  for (const raw of rawHits.slice(0, maxHits)) {
    const hit = parseSearchHit(raw);
    if (hit) hits.push(hit);
  }

  const totalRaw = hitsNode?.total?.value;
  const totalHits = typeof totalRaw === 'number' ? totalRaw : hits.length;

  return {
    query: trimmedQuery,
    forms: formList.length > 0 ? formList : undefined,
    cik: cikFilter,
    dateFrom: opts.dateFrom,
    dateTo: opts.dateTo,
    totalHits,
    hits,
    sourceUrl: url,
    capturedAt: new Date().toISOString(),
  };
}

/**
 * Convert one raw entry of the response's `hits.hits` array into a
 * SecSearchHit. Returns null when the entry is not an object; any
 * missing or mistyped field falls back to an empty string (0 for the
 * score) instead of failing the whole search.
 */
function parseSearchHit(raw: unknown): SecSearchHit | null {
  if (!raw || typeof raw !== 'object') return null;
  const h = raw as { _id?: unknown; _score?: unknown; _source?: unknown };
  const id = typeof h._id === 'string' ? h._id : '';
  const score = typeof h._score === 'number' ? h._score : 0;
  const src = (h._source && typeof h._source === 'object' ? h._source : {}) as {
    adsh?: unknown;
    ciks?: unknown;
    display_names?: unknown;
    form?: unknown;
    file_date?: unknown;
    file_type?: unknown;
    teaser?: unknown;
  };
  const accession = typeof src.adsh === 'string' ? src.adsh : '';
  const ciks = Array.isArray(src.ciks) ? src.ciks.filter((x): x is string => typeof x === 'string') : [];
  const cik = ciks[0] ? (parseCik(ciks[0]) ?? ciks[0]) : '';
  const names = Array.isArray(src.display_names)
    ? src.display_names.filter((x): x is string => typeof x === 'string')
    : [];
  const entityName = names[0] ?? '';
  const form = typeof src.form === 'string' ? src.form : '';
  const filedAt = typeof src.file_date === 'string' ? src.file_date : '';
  /* NOTE(review): `fileName` is filled from `file_type`; confirm this
   * is intended rather than the file part of `_id`. */
  const fileName = typeof src.file_type === 'string' ? src.file_type : '';
  const snippet = typeof src.teaser === 'string' ? src.teaser : '';

  /* The part of `_id` after its first colon is taken as the document's
   * file name. The URL is only built when CIK, accession and file name
   * are all present. */
  const accNoDash = accession.replace(/-/g, '');
  const cikInt = cik ? parseInt(cik, 10) : 0;
  const docFile = id.includes(':') ? id.split(':').slice(1).join(':') : '';
  const url =
    cikInt > 0 && accNoDash && docFile
      ? `https://www.sec.gov/Archives/edgar/data/${cikInt}/${accNoDash}/${docFile}`
      : '';

  return {
    accessionNumber: accession,
    cik,
    entityName,
    form,
    filedAt,
    fileName,
    score,
    snippet,
    url,
  };
}

/* ------------------------------------------------------------------ *
 * Anthropic tool descriptors. Combine with SEC_TOOLS from sec-edgar.ts
 * so agents can register all seven tools in one array.
 * ------------------------------------------------------------------ */

/** Shape of one tool descriptor in SEC_FILING_TOOLS. */
export interface SecFilingToolDescriptor {
  /** Tool name; the same string executeSecFilingTool dispatches on. */
  readonly name: string;
  /** Human-readable description of what the tool does. */
  readonly description: string;
  /** Schema describing the tool's input object. */
  readonly input_schema: {
    readonly type: 'object';
    /** One entry per accepted input property, keyed by property name. */
    readonly properties: Record<string, { type: string; description: string }>;
    /** Names of the properties that must be supplied. */
    readonly required: readonly string[];
  };
}

/** Build a string-typed schema property carrying the given description. */
const stringParam = (description: string): { type: string; description: string } => ({
  type: 'string',
  description,
});

/** Assemble one descriptor from its name, description, properties and required list. */
const defineSecFilingTool = (
  name: string,
  description: string,
  properties: Record<string, { type: string; description: string }>,
  required: readonly string[],
): SecFilingToolDescriptor => ({
  name,
  description,
  input_schema: { type: 'object', properties, required },
});

/**
 * Descriptors for the three tools implemented in this file, in the
 * order: filing index, filing sections, full-text search.
 */
export const SEC_FILING_TOOLS: readonly SecFilingToolDescriptor[] = [
  defineSecFilingTool(
    'sec_filing_index',
    'List every file inside a single SEC filing (10-K, 10-Q, 8-K, S-1, …). Returns ' +
      '{name, type, sizeBytes, lastModified, url} for each document and exhibit. Use the ' +
      'accession number from sec_submissions to ground the call.',
    {
      cik: stringParam('CIK of the filer (any numeric form; will be padded).'),
      accessionNumber: stringParam(
        'Filing accession number with or without dashes (e.g. "0000019617-25-000132").',
      ),
    },
    ['cik', 'accessionNumber'],
  ),
  defineSecFilingTool(
    'sec_filing_sections',
    'Download an SEC filing body (10-K / 10-Q HTML) and split it into the standard Items ' +
      '(Item 1 Business, Item 1A Risk Factors, Item 7 MD&A, Item 7A QQDMR, Item 8 Financial ' +
      'Statements, etc.). Optionally filter to a subset of items via a comma-separated list ' +
      '(e.g. "7,7A,1A"). The split is regex-based — no LLM. Returns one entry per item with ' +
      'the full text slice; only sec.gov URLs are accepted.',
    {
      documentUrl: stringParam('A primaryDocumentUrl from sec_submissions (must be a sec.gov URL).'),
      items: stringParam(
        'Optional comma-separated Items to narrow to (e.g. "7,7A"). Omit for all detected items.',
      ),
    },
    ['documentUrl'],
  ),
  defineSecFilingTool(
    'sec_full_text_search',
    "Search EDGAR's full-text index for filings whose body contains a phrase. Optionally " +
      'filter by forms (comma-separated), a specific CIK, and a date range (YYYY-MM-DD). ' +
      'Returns hits with accession, CIK, form, filed date, snippet, and document URL.',
    {
      query: stringParam('Search phrase (quoted phrases supported via SEC syntax).'),
      forms: stringParam('Optional comma-separated forms (e.g. "10-K,10-Q,8-K").'),
      cik: stringParam('Optional CIK to restrict to one filer.'),
      dateFrom: stringParam('Optional start date (YYYY-MM-DD).'),
      dateTo: stringParam('Optional end date (YYYY-MM-DD).'),
    },
    ['query'],
  ),
];

/** Envelope returned by executeSecFilingTool; the function never throws. */
export interface SecFilingToolResult {
  /** True when the tool ran to completion. */
  readonly ok: boolean;
  /** The tool's return value; set only when `ok` is true. */
  readonly result?: unknown;
  /**
   * Failure details; set only when `ok` is false. `category` is the
   * RetrievalError category, "unknown-tool" for an unrecognised tool
   * name, or "internal" for any other thrown value.
   */
  readonly error?: { readonly category: string; readonly message: string };
}

/**
 * Dispatch a tool call by name to the matching function in this file
 * and wrap the outcome in a SecFilingToolResult.
 *
 * Input properties that are not strings are treated as absent: required
 * ones become the empty string (which the callee then rejects), optional
 * ones become undefined. Nothing is thrown; every failure is reported
 * through the `error` field.
 *
 * @param name - One of the names listed in SEC_FILING_TOOLS.
 * @param rawInput - The tool input; anything that is not an object is
 *   treated as an empty object.
 */
export async function executeSecFilingTool(name: string, rawInput: unknown): Promise<SecFilingToolResult> {
  const args: Record<string, unknown> =
    rawInput && typeof rawInput === 'object' ? (rawInput as Record<string, unknown>) : {};

  /* Read a string argument, substituting '' when it is missing or mistyped. */
  const requiredArg = (key: string): string => {
    const value = args[key];
    return typeof value === 'string' ? value : '';
  };
  /* Read a string argument, substituting undefined when it is missing or mistyped. */
  const optionalArg = (key: string): string | undefined => {
    const value = args[key];
    return typeof value === 'string' ? value : undefined;
  };

  try {
    if (name === 'sec_filing_index') {
      const cik = requiredArg('cik');
      const accession = requiredArg('accessionNumber');
      const listing = await secFilingIndex(cik, accession);
      return { ok: true, result: listing };
    }
    if (name === 'sec_filing_sections') {
      const documentUrl = requiredArg('documentUrl');
      const items = optionalArg('items');
      const split = await secFilingSections(documentUrl, { items });
      return { ok: true, result: split };
    }
    if (name === 'sec_full_text_search') {
      const query = requiredArg('query');
      const forms = optionalArg('forms');
      const cik = optionalArg('cik');
      const dateFrom = optionalArg('dateFrom');
      const dateTo = optionalArg('dateTo');
      const found = await secFullTextSearch(query, { forms, cik, dateFrom, dateTo });
      return { ok: true, result: found };
    }
    return { ok: false, error: { category: 'unknown-tool', message: `unknown SEC filing tool "${name}"` } };
  } catch (failure) {
    /* RetrievalError keeps its own category; anything else is 'internal'. */
    let category: string = 'internal';
    let message: string;
    if (failure instanceof RetrievalError) {
      category = failure.category;
      message = failure.message;
    } else if (failure instanceof Error) {
      message = failure.message;
    } else {
      message = String(failure);
    }
    return { ok: false, error: { category, message } };
  }
}

/* Re-export the User-Agent and the two header constants imported from
 * sec-edgar.ts, so callers that build their own requests against
 * sec.gov stay aligned with the project's declared identity. */
export { SEC_USER_AGENT, SEC_HEADERS_JSON, SEC_HEADERS_ANY };