/**
* SEC EDGAR filings connector — unstructured retrieval surface.
*
* Three pure-retrieval functions for working with the full text of
* SEC filings (10-K, 10-Q, 8-K, S-1, …). None of them needs an LLM:
*
* 1. secFilingIndex(cik, accessionNumber)
* Returns every file inside a filing (primary document plus
* all exhibits / data files), with type, size, and URL.
*
* 2. secFilingSections(documentUrl, items?)
* Downloads a 10-K / 10-Q (or any HTML filing body), strips
* HTML to plain text, and splits the text into the standard
* SEC Items (Item 1, 1A, 7, 7A, 8, …) using a deterministic
* regex — no model in the loop. Pass `items` to narrow the
* response to a specific subset (e.g. "7,7A,1A").
*
* 3. secFullTextSearch(query, opts?)
* Hits the EDGAR full-text search API (efts.sec.gov) to find
* filings whose body contains a phrase. Optionally filter by
* forms, CIK, or date range.
*
* All three share the User-Agent and 10 req/s rate limiter exported
* by sec-edgar.ts so there is no risk of SEC seeing a doubled
* outbound rate. Every failure is translated to RetrievalError so the
* dispatcher / agent never receives a raw exception (Std 12).
*
* The Anthropic tool descriptors at the bottom (SEC_FILING_TOOLS,
* executeSecFilingTool) match the shape used by the existing SEC_TOOLS
* in sec-edgar.ts so agents can expose all seven SEC tools in one
* combined array.
*/
import {
RetrievalError,
type FetchParams as _FetchParams,
} from '../interface.js';
import { httpGet, type HttpResponse } from '../http-client.js';
import {
SEC_USER_AGENT,
SEC_HEADERS_JSON,
SEC_HEADERS_ANY,
secLimiter,
parseCik,
stripHtml,
translateHttpError,
} from './sec-edgar.js';
/* Reference the otherwise-unused `FetchParams` import so tsc with
* verbatimModuleSyntax stays happy. Remove this alias once
* `FetchParams` is actually used in this file. */
type _Unused = _FetchParams;
/* Response-size ceilings (passed to httpGet as `maxBodyBytes`) and other
 * module-level defaults. */
const BYTES_PER_MIB = 1024 * 1024;
/** Ceiling for a filing directory's index.json manifest (4 MiB). */
const FILING_INDEX_MAX_BYTES = 4 * BYTES_PER_MIB;
/** Ceiling for a full-text search response (8 MiB). */
const FULL_TEXT_SEARCH_MAX_BYTES = 8 * BYTES_PER_MIB;
/** Ceiling for a downloaded filing body (64 MiB). */
const FILING_BODY_MAX_BYTES = 64 * BYTES_PER_MIB;
/** Sections shorter than this many characters are dropped unless the caller overrides it. */
const DEFAULT_SECTION_MIN_CHARS = 500;
/** EDGAR full-text search endpoint. */
const FULL_TEXT_SEARCH_URL = 'https://efts.sec.gov/LATEST/search-index';
/* ------------------------------------------------------------------ *
* Tool 5 — secFilingIndex(cik, accessionNumber)
*
* Returns every file inside a filing. SEC publishes a JSON manifest
* at /Archives/edgar/data/{cikInt}/{accessionNoDash}/index.json that
* lists each document (name, type, size, last-modified). The function
* normalises the accession number (with or without dashes), builds the
* canonical URL, and returns a SecFilingIndex whose `files` array holds
* one SecFilingFile record per document, ready for downstream retrieval
* via secFilingDocument or secFilingSections.
* ------------------------------------------------------------------ */
/** One document or exhibit listed in a filing's `index.json` manifest. */
export interface SecFilingFile {
/** File name as given by the manifest entry (secFilingIndex skips entries without one). */
readonly name: string;
/** The manifest entry's `type` string, or '' when it is not a string. */
readonly type: string;
/** Size taken from the manifest (a number or a digit-only string); null otherwise. */
readonly sizeBytes: number | null;
/** The manifest entry's `last-modified` string, or '' when it is not a string. */
readonly lastModified: string;
/** Filing directory URL joined with `name`. */
readonly url: string;
}
/** Result of secFilingIndex: the normalised identifiers plus the file listing. */
export interface SecFilingIndex {
/** CIK as returned by `parseCik` (presumably zero-padded — confirm in sec-edgar.ts). */
readonly cik: string;
/** Accession number in dashed form, e.g. "0000019617-25-000132". */
readonly accessionNumber: string;
/** `directory.name` from the manifest, or '' when it is not a string. */
readonly directoryName: string;
/** URL of the `index.json` manifest that was fetched. */
readonly indexUrl: string;
/** One record per manifest entry that carried a non-empty name. */
readonly files: readonly SecFilingFile[];
/** ISO-8601 timestamp taken when the result object was built. */
readonly capturedAt: string;
}
/**
 * List every file contained in a single SEC filing.
 *
 * Normalises the accession number, takes a slot from the shared `secLimiter`,
 * downloads the filing directory's `index.json` manifest and maps each named
 * manifest entry to a `SecFilingFile`.
 *
 * @param cik - Filer CIK in any form `parseCik` accepts.
 * @param accessionNumber - Accession number, dashed ("0000019617-25-000132")
 *   or as 18 bare digits.
 * @returns The normalised identifiers, the manifest URL and the file listing.
 * @throws RetrievalError `invalid-request` when the CIK or accession number is
 *   unusable; `internal` when the manifest body is not valid JSON or is not a
 *   JSON object. Transport/HTTP failures are rethrown as whatever
 *   `translateHttpError` produces.
 */
export async function secFilingIndex(
  cik: string,
  accessionNumber: string,
): Promise<SecFilingIndex> {
  const padded = parseCik(cik);
  if (!padded) {
    throw new RetrievalError('invalid-request', `secFilingIndex: not a valid CIK: "${cik}"`);
  }
  if (typeof accessionNumber !== 'string' || !accessionNumber.trim()) {
    throw new RetrievalError(
      'invalid-request',
      'secFilingIndex: accessionNumber is required (e.g. "0000019617-25-000132").',
    );
  }
  const dashed = normaliseAccession(accessionNumber);
  if (!dashed) {
    throw new RetrievalError(
      'invalid-request',
      `secFilingIndex: accessionNumber "${accessionNumber}" does not look like a valid SEC accession.`,
    );
  }

  /* Archive paths use the CIK without leading zeros and the accession
   * number without dashes. */
  const noDash = dashed.replace(/-/g, '');
  const cikInt = parseInt(padded, 10);
  const dirBase = `https://www.sec.gov/Archives/edgar/data/${cikInt}/${noDash}`;
  const indexUrl = `${dirBase}/index.json`;

  await secLimiter.acquire();
  let res;
  try {
    res = await httpGet(indexUrl, { headers: SEC_HEADERS_JSON, maxBodyBytes: FILING_INDEX_MAX_BYTES });
  } catch (err) {
    throw translateHttpError(err, indexUrl);
  }

  let payload: unknown;
  try {
    payload = JSON.parse(res.body);
  } catch (err) {
    throw new RetrievalError(
      'internal',
      `filing index JSON parse failed: ${err instanceof Error ? err.message : String(err)}`,
      { url: indexUrl },
    );
  }
  /* JSON.parse succeeds on bodies such as `null` or `42`; reading
   * `.directory` off those would raise a raw TypeError, so reject them
   * here with a RetrievalError instead. */
  if (payload === null || typeof payload !== 'object') {
    throw new RetrievalError('internal', 'filing index JSON was not an object', { url: indexUrl });
  }
  const parsed = payload as { directory?: { name?: unknown; item?: unknown } };

  const directoryName = typeof parsed.directory?.name === 'string' ? parsed.directory.name : '';
  const rawItems = Array.isArray(parsed.directory?.item) ? parsed.directory.item : [];
  const files: SecFilingFile[] = [];
  for (const raw of rawItems) {
    if (!raw || typeof raw !== 'object') continue;
    const e = raw as { name?: unknown; type?: unknown; size?: unknown; 'last-modified'?: unknown };
    const name = typeof e.name === 'string' ? e.name : '';
    /* An entry without a name cannot be turned into a URL; skip it. */
    if (!name) continue;
    const type = typeof e.type === 'string' ? e.type : '';
    /* The size may arrive as a number or as a digit-only string. */
    const sizeBytes =
      typeof e.size === 'number'
        ? e.size
        : typeof e.size === 'string' && /^\d+$/.test(e.size)
          ? parseInt(e.size, 10)
          : null;
    const lastModified = typeof e['last-modified'] === 'string' ? e['last-modified'] : '';
    files.push({
      name,
      type,
      sizeBytes,
      lastModified,
      url: `${dirBase}/${name}`,
    });
  }

  return {
    cik: padded,
    accessionNumber: dashed,
    directoryName,
    indexUrl,
    files,
    capturedAt: new Date().toISOString(),
  };
}
/**
 * Bring an accession number into its dashed 10-2-6 digit form.
 *
 * Surrounding whitespace is ignored. Input that is already dashed is
 * returned as is (trimmed); 18 bare digits get the dashes inserted.
 *
 * @returns The dashed accession number, or null when `raw` has neither shape.
 */
function normaliseAccession(raw: string): string | null {
  const candidate = raw.trim();
  const bare = /^(\d{10})(\d{2})(\d{6})$/.exec(candidate);
  if (bare) {
    return `${bare[1]}-${bare[2]}-${bare[3]}`;
  }
  return /^\d{10}-\d{2}-\d{6}$/.test(candidate) ? candidate : null;
}
/* ------------------------------------------------------------------ *
* Tool 6 — secFilingSections(documentUrl, items?)
*
* Downloads a filing body, strips HTML, and splits the plain text into
* the standard SEC Items (e.g. Item 1, 1A, 7, 7A). The split is
* deterministic: regex-driven on the canonical "Item N." / "ITEM NA."
* headings. No LLM is involved.
*
* Returns one entry per detected item with the full slice between
* that item's heading and the next. The first occurrence of each
* item is typically the TOC entry and is filtered out by a minimum-
* length heuristic (sections shorter than `minSectionChars` are
* dropped; default 500). Pass `items="7,7A"` to narrow to a subset.
* ------------------------------------------------------------------ */
/** One detected "Item N" section of a filing body. */
export interface SecFilingSection {
/** Upper-cased item identifier, e.g. "1", "1A", "7A". */
readonly item: string;
/** "Item <id>", optionally followed by " — " and up to 120 characters of title text. */
readonly heading: string;
/** Character offset in the HTML-stripped text at which the section was located. */
readonly startOffset: number;
/** Trimmed text from this item's heading up to the next detected heading (or end of document). */
readonly text: string;
/** Length of `text` in characters. */
readonly charCount: number;
}
/** Result of secFilingSections. */
export interface SecFilingSections {
/** `url` reported by the HTTP response (presumably the final URL after redirects — confirm in http-client). */
readonly url: string;
/** `contentType` reported by the HTTP response. */
readonly contentType: string;
/** Length of the whole document after HTML stripping. */
readonly fullCharCount: number;
/** The item filter that was applied; omitted when no usable filter was given. */
readonly itemsFilter?: readonly string[];
/** Detected sections, narrowed to `itemsFilter` when one was applied. */
readonly sections: readonly SecFilingSection[];
/** ISO-8601 timestamp taken when the result object was built. */
readonly capturedAt: string;
}
/** Optional settings for secFilingSections. */
export interface SecFilingSectionsOptions {
/** Comma-separated item identifiers to keep, e.g. "7,7A,1A". Omit for all detected items. */
readonly items?: string;
/** Sections shorter than this many characters are dropped (default 500, floor 1). */
readonly minSectionChars?: number;
}
/**
 * Download a filing body from sec.gov, strip its HTML and split the plain
 * text into "Item N" sections.
 *
 * The URL is validated before any request is made: it must parse, its host
 * must be `sec.gov` or a subdomain of it, and its scheme must be http or
 * https (a host check alone would let `file://sec.gov/...` or
 * `ftp://www.sec.gov/...` through).
 *
 * @param documentUrl - Absolute URL of the filing document.
 * @param opts - `items`: comma-separated item filter; entries the filter
 *   parser rejects are ignored, and if none remain every detected section is
 *   returned. `minSectionChars`: minimum section length (default 500,
 *   floor 1; NaN falls back to the default).
 * @returns The detected sections together with response metadata.
 * @throws RetrievalError `invalid-request` for a missing, malformed, non-SEC
 *   or non-http(s) URL. Transport/HTTP failures are rethrown as whatever
 *   `translateHttpError` produces.
 */
export async function secFilingSections(
  documentUrl: string,
  opts: SecFilingSectionsOptions = {},
): Promise<SecFilingSections> {
  if (typeof documentUrl !== 'string' || !documentUrl) {
    throw new RetrievalError('invalid-request', 'secFilingSections: documentUrl is required.');
  }
  let parsedUrl: URL;
  try {
    parsedUrl = new URL(documentUrl);
  } catch {
    throw new RetrievalError('invalid-request', `secFilingSections: invalid URL "${documentUrl}".`);
  }
  if (!/(^|\.)sec\.gov$/i.test(parsedUrl.hostname)) {
    throw new RetrievalError(
      'invalid-request',
      `secFilingSections: refusing non-SEC host "${parsedUrl.hostname}". Only sec.gov hosts are allowed.`,
    );
  }
  /* The URL comes from tool input; only web schemes may reach httpGet. */
  if (parsedUrl.protocol !== 'https:' && parsedUrl.protocol !== 'http:') {
    throw new RetrievalError(
      'invalid-request',
      `secFilingSections: refusing URL scheme "${parsedUrl.protocol}". Only http(s) sec.gov URLs are allowed.`,
    );
  }

  await secLimiter.acquire();
  let res: HttpResponse;
  try {
    res = await httpGet(documentUrl, {
      headers: { 'User-Agent': SEC_USER_AGENT, Accept: '*/*' },
      maxBodyBytes: FILING_BODY_MAX_BYTES,
    });
  } catch (err) {
    throw translateHttpError(err, documentUrl);
  }

  const text = stripHtml(res.body);
  /* Math.max(1, NaN) is NaN, and `length < NaN` is always false, which
   * would silently switch the short-section filter off. */
  const requestedMin = opts.minSectionChars ?? DEFAULT_SECTION_MIN_CHARS;
  const min = Number.isNaN(requestedMin) ? DEFAULT_SECTION_MIN_CHARS : Math.max(1, requestedMin);
  const wantItems = parseItemFilter(opts.items);
  const sections = splitIntoItems(text, min);
  const filtered =
    wantItems.length === 0
      ? sections
      : sections.filter(s => wantItems.includes(s.item.toUpperCase()));

  return {
    url: res.url,
    contentType: res.contentType,
    fullCharCount: text.length,
    itemsFilter: wantItems.length > 0 ? wantItems : undefined,
    sections: filtered,
    capturedAt: new Date().toISOString(),
  };
}
/**
 * Parse a comma-separated item filter into upper-cased item identifiers.
 *
 * Each entry may be a bare identifier ("7", "7a") and may also carry an
 * "Item" prefix and/or a trailing period ("Item 7A."), since that is how
 * the headings are written in filings. Entries that still do not look like
 * an item identifier are dropped.
 *
 * @param items - Raw filter string, e.g. "7,7A,1A"; may be undefined.
 * @returns Identifiers in input order (e.g. ["7", "7A"]); empty when `items`
 *   is missing, blank, or contains no usable entry.
 */
function parseItemFilter(items?: string): string[] {
  if (typeof items !== 'string' || !items.trim()) return [];
  const wanted: string[] = [];
  for (const part of items.split(',')) {
    const m = /^(?:ITEM\s*)?(\d+[A-Z]?)\.?$/.exec(part.trim().toUpperCase());
    if (m) wanted.push(m[1]!);
  }
  return wanted;
}
/**
 * Find every "Item N[A]" heading that starts a line of `text` and slice the
 * text between consecutive headings.
 *
 * - Slices shorter than `minChars` (after trimming) are dropped; those are
 *   almost always table-of-contents entries.
 * - If an item identifier still occurs more than once, only its longest
 *   slice is kept.
 * - `startOffset` is the index in `text` of the first character of the
 *   returned (trimmed) slice, i.e. of the heading itself.
 *
 * Known limitation: identifiers are not qualified by "Part", so a document
 * that reuses an item number in several parts yields only the longest one.
 *
 * Pure function, no I/O, no model.
 *
 * @param text - Plain (HTML-stripped) document text.
 * @param minChars - Minimum trimmed slice length for a section to be kept.
 * @returns Kept sections ordered by `startOffset`; empty when no heading matches.
 */
function splitIntoItems(text: string, minChars: number): SecFilingSection[] {
  /* Heading anchor: "Item 1", "ITEM 1A", … at the start of the text or of
   * a line. It deliberately stops after the identifier so that scanning
   * resumes right there and no following heading can be skipped. */
  const headRe = /(?:^|\n)\s*item\s+(\d{1,2}[A-Z]?)/gi;
  /* Read (sticky) from the end of the anchor: optional whitespace, one
   * optional separator, optional whitespace, then up to 200 characters of
   * title on a single line. */
  const tailRe = /(\s*[\.\:\-–—]?\s*)([^\n]{0,200})/y;

  const matches: { item: string; heading: string; index: number }[] = [];
  let m: RegExpExecArray | null;
  while ((m = headRe.exec(text)) !== null) {
    const item = m[1]!.toUpperCase();
    tailRe.lastIndex = headRe.lastIndex;
    const t = tailRe.exec(text);
    const gap = t?.[1] ?? '';
    let tail = (t?.[2] ?? '').trim();
    /* The title may sit on the line after the heading, but if that line is
     * itself a heading ("Item 4\nItem 5. …") it is not this item's title. */
    if (/\n\s*$/.test(gap) && /^item\s+\d/i.test(tail)) tail = '';
    const heading = `Item ${item}${tail ? ` — ${tail.replace(/\s+/g, ' ').slice(0, 120)}` : ''}`;
    matches.push({ item, heading, index: m.index });
  }
  if (matches.length === 0) return [];

  /* exec() with the g flag reports matches in ascending index order, so
   * `matches` is already sorted by position. */
  const sections: SecFilingSection[] = [];
  for (let i = 0; i < matches.length; i++) {
    const cur = matches[i]!;
    const end = matches[i + 1]?.index ?? text.length;
    const raw = text.slice(cur.index, end);
    const body = raw.trim();
    if (body.length < minChars) continue;
    /* The match starts at the preceding newline/indent; report where the
     * trimmed text really begins. */
    const startOffset = cur.index + (raw.length - raw.trimStart().length);
    sections.push({
      item: cur.item,
      heading: cur.heading,
      startOffset,
      text: body,
      charCount: body.length,
    });
  }

  /* If the same item appears multiple times (TOC + body), keep only the
   * longest occurrence — the body section. */
  const byItem = new Map<string, SecFilingSection>();
  for (const s of sections) {
    const prev = byItem.get(s.item);
    if (!prev || s.charCount > prev.charCount) byItem.set(s.item, s);
  }
  return [...byItem.values()].sort((a, b) => a.startOffset - b.startOffset);
}
/* ------------------------------------------------------------------ *
* Tool 7 — secFullTextSearch(query, opts?)
*
* Calls https://efts.sec.gov/LATEST/search-index — the public EDGAR
* full-text search backing efts.sec.gov/search — to find filings
* matching a phrase. Useful for "find the 10-K that mentions X" or
* "8-Ks announcing Y from any issuer". Returns hits with accession,
* filed date, form, and CIK.
* ------------------------------------------------------------------ */
/** One hit from the EDGAR full-text search. Missing source fields become '' (or 0 for `score`). */
export interface SecSearchHit {
/** The hit's `adsh` source field. */
readonly accessionNumber: string;
/** First entry of the hit's `ciks`, normalised via `parseCik` when possible. */
readonly cik: string;
/** First entry of the hit's `display_names`. */
readonly entityName: string;
/** The hit's `form` source field. */
readonly form: string;
/** The hit's `file_date` source field. */
readonly filedAt: string;
/** Populated from the hit's `file_type` source field. */
readonly fileName: string;
/** The hit's `_score`, or 0 when it is not a number. */
readonly score: number;
/** The hit's `teaser` source field. */
readonly snippet: string;
/** Document URL derived from CIK, accession number and `_id`; '' when it cannot be built. */
readonly url: string;
}
/** Result of secFullTextSearch: the echoed request plus the hits. */
export interface SecSearchResult {
/** The trimmed search phrase. */
readonly query: string;
/** Form filter that was sent; omitted when none was given. */
readonly forms?: readonly string[];
/** CIK filter echoed back, when one was supplied. */
readonly cik?: string;
/** `dateFrom` option echoed back. */
readonly dateFrom?: string;
/** `dateTo` option echoed back. */
readonly dateTo?: string;
/** Total reported by the search response (`hits.total.value`), else the number of hits returned. */
readonly totalHits: number;
/** Hits, truncated to the requested maximum. */
readonly hits: readonly SecSearchHit[];
/** The full search URL that was requested. */
readonly sourceUrl: string;
/** ISO-8601 timestamp taken when the result object was built. */
readonly capturedAt: string;
}
/** Optional filters for secFullTextSearch. */
export interface SecFullTextSearchOptions {
/** Comma-separated form types, e.g. "10-K,10-Q,8-K"; sent as the `forms` query parameter. */
readonly forms?: string;
/** CIK of a single filer to restrict the search to; sent as `ciks`. */
readonly cik?: string;
/** Start date; sent as `startdt` together with `dateRange=custom`. */
readonly dateFrom?: string;
/** End date; sent as `enddt` together with `dateRange=custom`. */
readonly dateTo?: string;
/** Maximum number of hits to return (default 50, floor 1); applied to the response after it is received. */
readonly maxHits?: number;
}
/**
 * Query the EDGAR full-text search endpoint for filings containing `query`.
 *
 * Builds the query string from `query` and the optional filters, takes a
 * slot from the shared `secLimiter`, fetches the JSON response and maps
 * each raw hit to a `SecSearchHit`.
 *
 * @param query - Search phrase; must be non-blank.
 * @param opts - Optional `forms` (comma-separated), `cik`, `dateFrom`,
 *   `dateTo` and `maxHits` (default 50, floor 1; NaN falls back to 50).
 * @returns The echoed request parameters, the total hit count and the
 *   (possibly truncated) hits.
 * @throws RetrievalError `invalid-request` when `query` is blank or when
 *   `opts.cik` is given but is not a valid CIK; `internal` when the response
 *   is not valid JSON or is not a JSON object. Transport/HTTP failures are
 *   rethrown as whatever `translateHttpError` produces.
 */
export async function secFullTextSearch(
  query: string,
  opts: SecFullTextSearchOptions = {},
): Promise<SecSearchResult> {
  if (typeof query !== 'string' || !query.trim()) {
    throw new RetrievalError('invalid-request', 'secFullTextSearch: query is required.');
  }
  const trimmedQuery = query.trim();
  const formList = (opts.forms ?? '')
    .split(',')
    .map(s => s.trim())
    .filter(s => s.length > 0);

  /* An unusable CIK must not be dropped silently: the search would run
   * unrestricted while the result still claimed a CIK filter. */
  let paddedCik: string | undefined;
  if (opts.cik) {
    const parsedCik = parseCik(opts.cik);
    if (!parsedCik) {
      throw new RetrievalError('invalid-request', `secFullTextSearch: not a valid CIK: "${opts.cik}"`);
    }
    paddedCik = parsedCik;
  }

  const params = new URLSearchParams();
  params.set('q', trimmedQuery);
  if (formList.length > 0) params.set('forms', formList.join(','));
  if (paddedCik) params.set('ciks', paddedCik);
  if (opts.dateFrom || opts.dateTo) {
    params.set('dateRange', 'custom');
    if (opts.dateFrom) params.set('startdt', opts.dateFrom);
    if (opts.dateTo) params.set('enddt', opts.dateTo);
  }
  const url = `${FULL_TEXT_SEARCH_URL}?${params.toString()}`;

  await secLimiter.acquire();
  let res;
  try {
    res = await httpGet(url, {
      headers: { 'User-Agent': SEC_USER_AGENT, Accept: 'application/json' },
      maxBodyBytes: FULL_TEXT_SEARCH_MAX_BYTES,
    });
  } catch (err) {
    throw translateHttpError(err, url);
  }

  let payload: unknown;
  try {
    payload = JSON.parse(res.body);
  } catch (err) {
    throw new RetrievalError(
      'internal',
      `EDGAR full-text JSON parse failed: ${err instanceof Error ? err.message : String(err)}`,
      { url },
    );
  }
  /* JSON.parse succeeds on bodies such as `null`; reading `.hits` off that
   * would raise a raw TypeError, so reject it with a RetrievalError. */
  if (payload === null || typeof payload !== 'object') {
    throw new RetrievalError('internal', 'EDGAR full-text JSON was not an object', { url });
  }
  const parsed = payload as { hits?: { total?: { value?: unknown }; hits?: unknown[] } };

  const rawHits = Array.isArray(parsed.hits?.hits) ? parsed.hits.hits : [];
  /* Math.max(1, NaN) is NaN and slice(0, NaN) returns nothing. */
  const requestedMax = opts.maxHits ?? 50;
  const maxHits = Number.isNaN(requestedMax) ? 50 : Math.max(1, requestedMax);

  const hits: SecSearchHit[] = [];
  for (const rh of rawHits.slice(0, maxHits)) {
    if (!rh || typeof rh !== 'object') continue;
    const h = rh as { _id?: unknown; _score?: unknown; _source?: unknown };
    const id = typeof h._id === 'string' ? h._id : '';
    const score = typeof h._score === 'number' ? h._score : 0;
    const src = (h._source && typeof h._source === 'object' ? h._source : {}) as {
      adsh?: unknown;
      ciks?: unknown;
      display_names?: unknown;
      form?: unknown;
      file_date?: unknown;
      file_type?: unknown;
      teaser?: unknown;
    };
    const accession = typeof src.adsh === 'string' ? src.adsh : '';
    /* Only the first CIK / display name of a hit is surfaced. */
    const ciks = Array.isArray(src.ciks) ? src.ciks.filter((x): x is string => typeof x === 'string') : [];
    const cik = ciks[0] ? (parseCik(ciks[0]) ?? ciks[0]) : '';
    const names = Array.isArray(src.display_names)
      ? src.display_names.filter((x): x is string => typeof x === 'string')
      : [];
    const entityName = names[0] ?? '';
    const form = typeof src.form === 'string' ? src.form : '';
    const filedAt = typeof src.file_date === 'string' ? src.file_date : '';
    /* NOTE(review): `fileName` is filled from `file_type`, while the
     * document's file name is taken from `_id` below — confirm this
     * mapping is intended. */
    const fileName = typeof src.file_type === 'string' ? src.file_type : '';
    const snippet = typeof src.teaser === 'string' ? src.teaser : '';

    /* `_id` is expected to look like "<accession>:<fileName>"; everything
     * after the first colon is used as the document file name. */
    const accNoDash = accession.replace(/-/g, '');
    const cikInt = cik ? parseInt(cik, 10) : 0;
    const docFile = id.includes(':') ? id.split(':').slice(1).join(':') : '';
    /* `cikInt > 0` is also false for NaN, so an unparseable CIK yields ''. */
    const docUrl =
      cikInt > 0 && accNoDash && docFile
        ? `https://www.sec.gov/Archives/edgar/data/${cikInt}/${accNoDash}/${docFile}`
        : '';

    hits.push({
      accessionNumber: accession,
      cik,
      entityName,
      form,
      filedAt,
      fileName,
      score,
      snippet,
      url: docUrl,
    });
  }

  const totalRaw = parsed.hits?.total?.value;
  const totalHits = typeof totalRaw === 'number' ? totalRaw : hits.length;

  return {
    query: trimmedQuery,
    forms: formList.length > 0 ? formList : undefined,
    cik: paddedCik,
    dateFrom: opts.dateFrom,
    dateTo: opts.dateTo,
    totalHits,
    hits,
    sourceUrl: url,
    capturedAt: new Date().toISOString(),
  };
}
/* ------------------------------------------------------------------ *
* Anthropic tool descriptors. Combine with SEC_TOOLS from sec-edgar.ts
* so agents can register all seven tools in one array.
* ------------------------------------------------------------------ */
/** Shape of one tool descriptor in SEC_FILING_TOOLS. */
export interface SecFilingToolDescriptor {
/** Tool name; the value executeSecFilingTool dispatches on. */
readonly name: string;
/** Natural-language description of the tool. */
readonly description: string;
/** Object schema describing the tool's input. */
readonly input_schema: {
readonly type: 'object';
/** Per-property type and description, keyed by property name. */
readonly properties: Record<string, { type: string; description: string }>;
/** Names of the properties that must be supplied. */
readonly required: readonly string[];
};
}
/** Build a string-typed schema property carrying the given description. */
const secToolStringProp = (description: string): { type: string; description: string } => ({
  type: 'string',
  description,
});

/**
 * Descriptors for the three filing tools, in the order index, sections,
 * full-text search. Names match the cases handled by executeSecFilingTool.
 */
export const SEC_FILING_TOOLS: readonly SecFilingToolDescriptor[] = [
  {
    name: 'sec_filing_index',
    description: [
      'List every file inside a single SEC filing (10-K, 10-Q, 8-K, S-1, …). Returns ',
      '{name, type, sizeBytes, lastModified, url} for each document and exhibit. Use the ',
      'accession number from sec_submissions to ground the call.',
    ].join(''),
    input_schema: {
      type: 'object',
      properties: {
        cik: secToolStringProp('CIK of the filer (any numeric form; will be padded).'),
        accessionNumber: secToolStringProp(
          'Filing accession number with or without dashes (e.g. "0000019617-25-000132").',
        ),
      },
      required: ['cik', 'accessionNumber'],
    },
  },
  {
    name: 'sec_filing_sections',
    description: [
      'Download an SEC filing body (10-K / 10-Q HTML) and split it into the standard Items ',
      '(Item 1 Business, Item 1A Risk Factors, Item 7 MD&A, Item 7A QQDMR, Item 8 Financial ',
      'Statements, etc.). Optionally filter to a subset of items via a comma-separated list ',
      '(e.g. "7,7A,1A"). The split is regex-based — no LLM. Returns one entry per item with ',
      'the full text slice; only sec.gov URLs are accepted.',
    ].join(''),
    input_schema: {
      type: 'object',
      properties: {
        documentUrl: secToolStringProp('A primaryDocumentUrl from sec_submissions (must be a sec.gov URL).'),
        items: secToolStringProp(
          'Optional comma-separated Items to narrow to (e.g. "7,7A"). Omit for all detected items.',
        ),
      },
      required: ['documentUrl'],
    },
  },
  {
    name: 'sec_full_text_search',
    description: [
      "Search EDGAR's full-text index for filings whose body contains a phrase. Optionally ",
      'filter by forms (comma-separated), a specific CIK, and a date range (YYYY-MM-DD). ',
      'Returns hits with accession, CIK, form, filed date, snippet, and document URL.',
    ].join(''),
    input_schema: {
      type: 'object',
      properties: {
        query: secToolStringProp('Search phrase (quoted phrases supported via SEC syntax).'),
        forms: secToolStringProp('Optional comma-separated forms (e.g. "10-K,10-Q,8-K").'),
        cik: secToolStringProp('Optional CIK to restrict to one filer.'),
        dateFrom: secToolStringProp('Optional start date (YYYY-MM-DD).'),
        dateTo: secToolStringProp('Optional end date (YYYY-MM-DD).'),
      },
      required: ['query'],
    },
  },
];
/** Envelope returned by executeSecFilingTool; it reports failures here instead of throwing. */
export interface SecFilingToolResult {
/** True when the tool ran and `result` is set; false when `error` is set. */
readonly ok: boolean;
/** The tool function's return value on success. */
readonly result?: unknown;
/** Failure category (a RetrievalError category, 'unknown-tool' or 'internal') and message. */
readonly error?: { readonly category: string; readonly message: string };
}
/**
 * Run one of the SEC filing tools by name and wrap the outcome in a
 * SecFilingToolResult instead of letting an exception escape.
 *
 * Only string-valued input properties are honoured; a required property
 * that is missing or not a string is passed on as '' and an optional one
 * as undefined.
 *
 * @param name - Tool name as listed in SEC_FILING_TOOLS.
 * @param rawInput - Untrusted tool input; anything that is not an object is
 *   treated as an empty input.
 * @returns `{ ok: true, result }` on success, otherwise `{ ok: false, error }`
 *   with category 'unknown-tool', the RetrievalError's category, or 'internal'.
 */
export async function executeSecFilingTool(name: string, rawInput: unknown): Promise<SecFilingToolResult> {
  const args: Record<string, unknown> =
    rawInput && typeof rawInput === 'object' ? (rawInput as Record<string, unknown>) : {};
  /* Read one input property, keeping it only when it is a string. */
  const str = (key: string): string | undefined => {
    const value = args[key];
    return typeof value === 'string' ? value : undefined;
  };

  try {
    if (name === 'sec_filing_index') {
      const cik = str('cik') ?? '';
      const accession = str('accessionNumber') ?? '';
      const result = await secFilingIndex(cik, accession);
      return { ok: true, result };
    }
    if (name === 'sec_filing_sections') {
      const documentUrl = str('documentUrl') ?? '';
      const items = str('items');
      const result = await secFilingSections(documentUrl, { items });
      return { ok: true, result };
    }
    if (name === 'sec_full_text_search') {
      const query = str('query') ?? '';
      const forms = str('forms');
      const cik = str('cik');
      const dateFrom = str('dateFrom');
      const dateTo = str('dateTo');
      const result = await secFullTextSearch(query, { forms, cik, dateFrom, dateTo });
      return { ok: true, result };
    }
    return { ok: false, error: { category: 'unknown-tool', message: `unknown SEC filing tool "${name}"` } };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    const category = err instanceof RetrievalError ? err.category : 'internal';
    return { ok: false, error: { category, message } };
  }
}
/* Re-export the User-Agent and the shared header sets so callers that
* build their own requests against sec.gov stay aligned with the
* project's declared identity. */
export { SEC_USER_AGENT, SEC_HEADERS_JSON, SEC_HEADERS_ANY };