/**
* IRS Business Master File (Exempt Organizations) connector.
*
* Returns rows from the IRS Exempt Organizations Business Master File
* (EO-BMF) — a public CSV dump of all 501(c) organizations, published
* per-state at:
* https://www.irs.gov/pub/irs-soi/eo_{state}.csv
*
* (Plus eo_xx.csv for organizations with international, i.e. non-U.S.,
* addresses and eo_pr.csv for Puerto Rico.) No authentication required.
*
* Inputs:
* - params.entity.id either a 2-letter state code (e.g. "NY",
* "CA") or an EIN. If an EIN is given,
* params.query.state MUST also be set to
* the state the entity files in — there
* is no national EIN index.
* - params.query.state 2-letter state code; required when
* entity.id is an EIN.
* - params.query.ein optional EIN to filter to.
* - params.query.nameContains optional case-insensitive substring
* match on the organization name.
* - params.query.maxRows default 200, hard cap 5000.
*
* The connector downloads the state CSV once per call (the files are
* not paginated upstream), parses rows, and applies the filters in
* memory. Returned payload is JSON, not CSV — already structured.
*/
import {
RetrievalError,
type FetchParams,
type RawPayload,
type RetrievalConnector,
} from '../interface.js';
import { httpGet, HttpError } from '../http-client.js';
import { RateLimiter } from '../rate-limiter.js';
/** Value sent as the `User-Agent` header on every CSV download. */
const USER_AGENT = 'MR mitchell.roy@sia-partners.com';
/** URL prefix the per-state file name (`eo_{state}.csv`) is appended to. */
const BMF_BASE = 'https://www.irs.gov/pub/irs-soi';
/** Passed to `httpGet` as `maxBodyBytes` for the CSV download (128 MiB). */
const BMF_MAX_BYTES = 128 * 1024 * 1024;
/** Row limit used when the caller gives no usable `maxRows`. */
const DEFAULT_MAX_ROWS = 200;
/** Largest row limit a caller can obtain, whatever `maxRows` they request. */
const HARD_MAX_ROWS = 5000;
/**
* Module-level limiter shared by every call in this module; it is awaited
* before each download. Configured with requestsPerSecond 1 and burstSize 2.
*/
const limiter = new RateLimiter({ requestsPerSecond: 1, burstSize: 2 });
/**
* Upper-case two-letter codes accepted as the `{state}` part of the file
* name. Besides the 50 states the set contains DC, PR, VI, GU, MP, AS and
* the placeholder XX (which selects eo_xx.csv).
* NOTE(review): whether the IRS publishes a file for every code listed here
* is not checked by this module — a missing file surfaces as an HTTP error.
*/
const VALID_STATES = new Set([
'AL','AK','AZ','AR','CA','CO','CT','DE','FL','GA','HI','ID','IL','IN','IA',
'KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ',
'NM','NY','NC','ND','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VT',
'VA','WA','WV','WI','WY','DC','PR','VI','GU','MP','AS','XX',
]);
/**
* One organization record parsed from a state EO-BMF CSV.
*
* `ein` and `name` are always strings (empty when the column or cell is
* missing). Every other text field holds the trimmed cell and is `undefined`
* when the column is absent, the row is too short, or the cell is blank.
* The three `*Amt` fields are numbers; they are `null` when the cell is
* blank or does not parse to a finite number, and `undefined` when the
* column is absent or the row is too short.
*
* NOTE(review): the meanings hinted at below are inferred from the IRS
* column names — confirm against the IRS EO-BMF documentation.
*/
export interface IrsBmfRow {
/** EIN column, trimmed but otherwise as it appears in the file. */
readonly ein: string;
/** NAME column, trimmed. */
readonly name: string;
/** ICO column (presumably the "in care of" name — confirm). */
readonly ico?: string;
/** STREET column. */
readonly street?: string;
/** CITY column. */
readonly city?: string;
/** STATE column of the row itself (not the requested file's state code). */
readonly state?: string;
/** ZIP column. */
readonly zip?: string;
/** SUBSECTION column. */
readonly subsection?: string;
/** CLASSIFICATION column. */
readonly classification?: string;
/** RULING column (presumably a year+month value — confirm format). */
readonly rulingYearMonth?: string;
/** FOUNDATION column. */
readonly foundationCode?: string;
/** ACTIVITY column. */
readonly activity?: string;
/** STATUS column. */
readonly status?: string;
/** TAX_PERIOD column. */
readonly taxPeriod?: string;
/** ASSET_CD column. */
readonly assetCode?: string;
/** INCOME_CD column. */
readonly incomeCode?: string;
/** FILING_REQ_CD column. */
readonly filingReqCode?: string;
/** ASSET_AMT column parsed as a number (thousands separators removed). */
readonly assetAmt?: number | null;
/** INCOME_AMT column parsed as a number (thousands separators removed). */
readonly incomeAmt?: number | null;
/** REVENUE_AMT column parsed as a number (thousands separators removed). */
readonly revenueAmt?: number | null;
/** NTEE_CD column. */
readonly nteeCode?: string;
/** SORT_NAME column. */
readonly sortName?: string;
}
/**
* Structured outcome of one state-file download: the filtered rows plus
* enough provenance to tell what was fetched and how it was filtered.
*/
export interface IrsBmfResult {
/** Upper-cased state code whose file was downloaded. */
readonly state: string;
/** EIN filter that was applied, with dashes removed; absent when none. */
readonly einFilter?: string;
/** Name filter exactly as the caller supplied it; absent when none. */
readonly nameContains?: string;
/** Non-empty data lines counted by the CSV parser (header excluded). */
readonly totalRowsInFile: number;
/** Length of `rows`. */
readonly rowsReturned: number;
/** Rows that passed the filters, limited by the row cap. */
readonly rows: readonly IrsBmfRow[];
/** URL of the CSV file that was downloaded. */
readonly sourceUrl: string;
/** ISO-8601 timestamp taken when the result was assembled. */
readonly capturedAt: string;
}
/**
 * Retrieval connector for the per-state IRS EO-BMF CSV extracts.
 *
 * `fetch` works out which state file to download and which EIN (if any) to
 * filter on, delegates the download and parsing to `fetchIrsBmf`, and wraps
 * the structured result as a JSON payload.
 */
export class IrsBmfConnector implements RetrievalConnector {
  readonly name = 'irs-bmf';
  readonly authRequired = false;
  readonly rateLimit = { requestsPerSecond: 1, burstSize: 2 };

  /** Always resolves to `true`; no probe request is made. */
  async isAvailable(): Promise<boolean> {
    return true;
  }

  /**
   * Resolve the target state and filters from `params`, then fetch.
   *
   * State resolution: `query.state` wins when present; otherwise `entity.id`
   * must itself be a 2-letter code. An EIN in `entity.id` becomes the EIN
   * filter (only usable together with `query.state`); otherwise `query.ein`
   * is used.
   *
   * @throws RetrievalError `invalid-request` when no state can be determined
   *   or the state code is not in `VALID_STATES`.
   */
  async fetch(params: FetchParams): Promise<RawPayload> {
    const rawId = params.entity?.id?.trim() ?? '';
    const explicitState = strParam(params.query?.state);
    const idIsEin = looksLikeEin(rawId);

    // Without query.state the id has to be the state code itself.
    if (!explicitState && !looksLikeState(rawId)) {
      throw new RetrievalError(
        'invalid-request',
        idIsEin
          ? 'irs-bmf: when entity.id is an EIN, query.state (2-letter) is required (no national index).'
          : 'irs-bmf: entity.id must be a 2-letter U.S. state code, or pass query.state.',
      );
    }

    const state = (explicitState ?? rawId).toUpperCase();
    // An EIN-shaped id can only reach this point together with query.state.
    const einFilter = idIsEin ? normaliseEin(rawId) : strParam(params.query?.ein);

    if (!VALID_STATES.has(state)) {
      throw new RetrievalError('invalid-request', `irs-bmf: unknown state code "${state}".`);
    }

    const result = await fetchIrsBmf({
      state,
      ein: einFilter,
      nameContains: strParam(params.query?.nameContains),
      maxRows: clampRows(params.query?.maxRows),
    });

    const metadata = {
      state,
      einFilter: result.einFilter ?? null,
      nameContains: result.nameContains ?? null,
      totalRowsInFile: result.totalRowsInFile,
      rowsReturned: result.rowsReturned,
    };
    return {
      source: this.name,
      sourceUrl: result.sourceUrl,
      capturedAt: result.capturedAt,
      contentType: 'application/json',
      rawContent: JSON.stringify(result),
      metadata,
    };
  }
}
/** Inputs accepted by `fetchIrsBmf`. */
export interface IrsBmfFetchOptions {
/** Two-letter code of the state file to download; upper-cased and validated by `fetchIrsBmf`. */
readonly state: string;
/** Optional EIN to keep; dashes are stripped before rows are compared. */
readonly ein?: string;
/** Optional substring matched case-insensitively against the organization name. */
readonly nameContains?: string;
/** Requested row limit; run through `clampRows` (default 200, at most 5000). */
readonly maxRows?: number;
}
/**
 * Download one state's EO-BMF CSV and return its filtered rows.
 *
 * The state code is validated, the shared rate limiter is awaited, the file
 * is fetched in a single request, and filtering happens in memory while the
 * CSV is parsed.
 *
 * @param opts State to fetch plus optional EIN / name filters and row limit.
 * @returns The matching rows together with provenance (URL, timestamp).
 * @throws RetrievalError `invalid-request` for an unknown state; download
 *   failures are mapped through `translateHttpError`.
 */
export async function fetchIrsBmf(opts: IrsBmfFetchOptions): Promise<IrsBmfResult> {
  const stateCode = opts.state.toUpperCase();
  if (!VALID_STATES.has(stateCode)) {
    throw new RetrievalError('invalid-request', `fetchIrsBmf: unknown state "${stateCode}".`);
  }

  const rowCap = clampRows(opts.maxRows);
  const wantedEin = opts.ein ? normaliseEin(opts.ein) : undefined;
  const wantedName = opts.nameContains ? opts.nameContains.toUpperCase() : undefined;
  const sourceUrl = `${BMF_BASE}/eo_${stateCode.toLowerCase()}.csv`;

  // Throttle before touching the network; the limiter is shared module-wide.
  await limiter.acquire();

  let response;
  try {
    const requestOptions = {
      headers: { 'User-Agent': USER_AGENT, Accept: 'text/csv,text/plain' },
      maxBodyBytes: BMF_MAX_BYTES,
    };
    response = await httpGet(sourceUrl, requestOptions);
  } catch (cause) {
    throw translateHttpError(cause, sourceUrl);
  }

  const parsed = parseBmfCsv(response.body, {
    einFilter: wantedEin,
    nameUpper: wantedName,
    cap: rowCap,
  });

  return {
    state: stateCode,
    einFilter: wantedEin,
    nameContains: opts.nameContains,
    totalRowsInFile: parsed.totalRows,
    rowsReturned: parsed.rows.length,
    rows: parsed.rows,
    sourceUrl,
    capturedAt: new Date().toISOString(),
  };
}
/** Filters and limit consumed by `parseBmfCsv`. */
interface ParseOpts {
/** EIN to keep; compared against each row's dash-stripped EIN, so it should itself be dash-free. */
readonly einFilter?: string;
/** Substring to look for in the upper-cased organization name; expected to be upper-case already. */
readonly nameUpper?: string;
/** Row limit: no further rows are collected once this many have been gathered. */
readonly cap: number;
}
/**
 * Parse a state EO-BMF CSV body into structured rows, applying the optional
 * EIN / name filters and the row cap.
 *
 * The first line is the header. Columns are located by name (case-insensitive,
 * surrounding whitespace ignored), so column order does not matter. A missing
 * EIN or NAME column yields an empty string for that field; any other missing
 * column yields `undefined`.
 *
 * The body is split on line breaks before CSV parsing, so a quoted field that
 * itself contains a line break is not supported.
 *
 * @param body Full CSV text as downloaded.
 * @param opts Filters and row cap.
 * @returns `rows` — the rows that passed the filters, limited by `opts.cap`;
 *   `totalRows` — the number of non-empty data lines in the whole body
 *   (header excluded), counted even after the cap has been reached.
 */
function parseBmfCsv(body: string, opts: ParseOpts): { totalRows: number; rows: IrsBmfRow[] } {
  // A leading byte-order mark would otherwise stick to the first header name
  // and make that column (normally EIN) impossible to find.
  const text = body.charCodeAt(0) === 0xfeff ? body.slice(1) : body;
  const lines = text.split(/\r?\n/);

  // Normalise the header once instead of on every lookup.
  const header = parseCsvLine(lines[0] ?? '').map(h => h.trim().toUpperCase());
  const idx = (col: string): number => header.indexOf(col.toUpperCase());

  const iEin = idx('EIN');
  const iName = idx('NAME');
  const iIco = idx('ICO');
  const iStreet = idx('STREET');
  const iCity = idx('CITY');
  const iState = idx('STATE');
  const iZip = idx('ZIP');
  const iSubsection = idx('SUBSECTION');
  const iClassification = idx('CLASSIFICATION');
  const iRuling = idx('RULING');
  const iFoundation = idx('FOUNDATION');
  const iActivity = idx('ACTIVITY');
  const iStatus = idx('STATUS');
  const iTaxPeriod = idx('TAX_PERIOD');
  const iAssetCd = idx('ASSET_CD');
  const iIncomeCd = idx('INCOME_CD');
  const iFilingReq = idx('FILING_REQ_CD');
  const iAssetAmt = idx('ASSET_AMT');
  const iIncomeAmt = idx('INCOME_AMT');
  const iRevenueAmt = idx('REVENUE_AMT');
  const iNtee = idx('NTEE_CD');
  const iSortName = idx('SORT_NAME');

  const out: IrsBmfRow[] = [];
  let totalRows = 0;
  let capReached = false;

  for (let n = 1; n < lines.length; n++) {
    const line = lines[n];
    if (!line) continue;
    totalRows++;

    // Once the cap is hit we only keep counting, so totalRows reflects the
    // whole file rather than just the portion scanned before the cap.
    if (capReached) continue;

    const cells = parseCsvLine(line);
    const ein = iEin >= 0 ? (cells[iEin] ?? '').trim() : '';
    const name = iName >= 0 ? (cells[iName] ?? '').trim() : '';
    if (opts.einFilter && normaliseEin(ein) !== opts.einFilter) continue;
    if (opts.nameUpper && !name.toUpperCase().includes(opts.nameUpper)) continue;

    out.push({
      ein,
      name,
      ico: pick(cells, iIco),
      street: pick(cells, iStreet),
      city: pick(cells, iCity),
      state: pick(cells, iState),
      zip: pick(cells, iZip),
      subsection: pick(cells, iSubsection),
      classification: pick(cells, iClassification),
      rulingYearMonth: pick(cells, iRuling),
      foundationCode: pick(cells, iFoundation),
      activity: pick(cells, iActivity),
      status: pick(cells, iStatus),
      taxPeriod: pick(cells, iTaxPeriod),
      assetCode: pick(cells, iAssetCd),
      incomeCode: pick(cells, iIncomeCd),
      filingReqCode: pick(cells, iFilingReq),
      assetAmt: pickNum(cells, iAssetAmt),
      incomeAmt: pickNum(cells, iIncomeAmt),
      revenueAmt: pickNum(cells, iRevenueAmt),
      nteeCode: pick(cells, iNtee),
      sortName: pick(cells, iSortName),
    });
    if (out.length >= opts.cap) capReached = true;
  }
  return { totalRows, rows: out };
}
/**
 * Read cell `i` as trimmed text.
 *
 * @param cells Parsed cells of one CSV row.
 * @param i Column index; a negative value means the column does not exist.
 * @returns The trimmed cell, or `undefined` when the column is missing, the
 *   row has no such cell, or the cell is blank.
 */
function pick(cells: readonly string[], i: number): string | undefined {
  const raw = i < 0 ? undefined : cells[i];
  if (typeof raw !== 'string') {
    return undefined;
  }
  const trimmed = raw.trim();
  return trimmed === '' ? undefined : trimmed;
}
/**
 * Read cell `i` as a number, ignoring thousands separators.
 *
 * @param cells Parsed cells of one CSV row.
 * @param i Column index; a negative value means the column does not exist.
 * @returns `undefined` when the column is missing or the row has no such
 *   cell; `null` when the cell is blank or not a finite number; otherwise
 *   the parsed number.
 */
function pickNum(cells: readonly string[], i: number): number | null | undefined {
  const raw = i < 0 ? undefined : cells[i];
  if (typeof raw !== 'string') {
    return undefined;
  }
  const text = raw.trim();
  if (text.length === 0) {
    return null;
  }
  // Drop every comma before converting.
  const parsed = Number(text.split(',').join(''));
  if (!Number.isFinite(parsed)) {
    return null;
  }
  return parsed;
}
/**
 * Split a single CSV line into its fields.
 *
 * Commas inside double-quoted sections do not split, and a doubled quote
 * (`""`) inside a quoted section produces one literal quote. Quote characters
 * that open or close a section are not part of the output. An unterminated
 * quote simply runs to the end of the line.
 *
 * @param line One line of CSV text, without its line terminator.
 * @returns The fields in order; always at least one element.
 */
function parseCsvLine(line: string): string[] {
  const fields: string[] = [];
  let field = '';
  let quoted = false;
  let pos = 0;

  while (pos < line.length) {
    const c = line.charAt(pos);
    pos += 1;

    if (c === '"') {
      if (!quoted) {
        quoted = true;
      } else if (line.charAt(pos) === '"') {
        // Escaped quote: emit one and skip its twin.
        field += '"';
        pos += 1;
      } else {
        quoted = false;
      }
      continue;
    }

    if (c === ',' && !quoted) {
      fields.push(field);
      field = '';
      continue;
    }

    field += c;
  }

  fields.push(field);
  return fields;
}
/** True when `s` is exactly two ASCII letters (either case); membership in the state list is not checked here. */
function looksLikeState(s: string): boolean {
  const twoLetters = /^[A-Za-z]{2}$/;
  return twoLetters.test(s);
}
/** True when `s` is nine digits, optionally written with a dash after the second digit (`NN-NNNNNNN`). */
function looksLikeEin(s: string): boolean {
  const einShape = /^\d{2}-?\d{7}$/;
  return einShape.test(s);
}
/** Remove every dash from `s`; nothing else is changed or validated. */
function normaliseEin(s: string): string {
  const parts = s.split('-');
  return parts.join('');
}
/**
 * Coerce a loosely-typed parameter to a trimmed, non-empty string.
 *
 * @returns The trimmed text, or `undefined` when `v` is not a string or is
 *   blank after trimming.
 */
function strParam(v: unknown): string | undefined {
  if (typeof v !== 'string') {
    return undefined;
  }
  const trimmed = v.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
/**
 * Turn a caller-supplied row limit into a usable whole number.
 *
 * The value is floored BEFORE it is validated, so a fraction between 0 and 1
 * (which floors to 0) is treated like any other non-positive request and
 * falls back to the default instead of producing a limit of zero. This also
 * makes the function idempotent: clamping an already-clamped value returns
 * it unchanged.
 *
 * @param raw Requested limit; anything that is not a finite number is ignored.
 * @returns `DEFAULT_MAX_ROWS` when `raw` is unusable or floors below 1,
 *   otherwise the floored value capped at `HARD_MAX_ROWS`.
 */
function clampRows(raw: unknown): number {
  if (typeof raw !== 'number' || !Number.isFinite(raw)) return DEFAULT_MAX_ROWS;
  const whole = Math.floor(raw);
  if (whole < 1) return DEFAULT_MAX_ROWS;
  return Math.min(whole, HARD_MAX_ROWS);
}
/**
 * Map a failure thrown by the HTTP client onto a `RetrievalError`.
 *
 * Transport problems (timeout, network, aborted) become `unavailable`; an
 * oversized body becomes `internal`; HTTP status failures are mapped by code
 * (404 → `no-content`, 401/403 → `auth-failed`, 429 → `rate-limited`,
 * 400 → `invalid-request`, anything else → `internal`). Errors that are not
 * `HttpError`, or carry an unrecognised category, become `internal`.
 *
 * @param err The caught value.
 * @param url URL that was being fetched; attached to the resulting error.
 */
function translateHttpError(err: unknown, url: string): RetrievalError {
  if (err instanceof HttpError) {
    const { category } = err;

    if (category === 'timeout' || category === 'network' || category === 'aborted') {
      return new RetrievalError('unavailable', err.message, { url });
    }
    if (category === 'body-too-large') {
      return new RetrievalError('internal', err.message, { url });
    }
    if (category === 'status') {
      switch (err.status) {
        case 404:
          return new RetrievalError('no-content', err.message, { url });
        case 401:
        case 403:
          return new RetrievalError('auth-failed', err.message, { url });
        case 429:
          return new RetrievalError('rate-limited', err.message, { url });
        case 400:
          return new RetrievalError('invalid-request', err.message, { url });
        default:
          return new RetrievalError('internal', err.message, { url });
      }
    }
  }

  // Not an HttpError, or a category this function does not know about.
  const detail = err instanceof Error ? err.message : String(err);
  return new RetrievalError('internal', detail, { url });
}