// BID · Console
// Baseline · Intelligence · Decision
// src/tools/retrieval/connectors/irs-bmf.ts 11,981 bytes · typescript
/**
 * IRS Business Master File (Exempt Organizations) connector.
 *
 * Returns rows from the IRS Exempt Organizations Business Master File
 * (EO-BMF) — a public CSV dump of all 501(c) organizations, published
 * per-state at:
 *   https://www.irs.gov/pub/irs-soi/eo_{state}.csv
 *
 * (Plus eo_xx.csv for non-U.S. territories and eo_pr.csv for Puerto
 * Rico.) No authentication required.
 *
 * Inputs:
 *   - params.entity.id            either a 2-letter state code (e.g. "NY",
 *                                 "CA") or an EIN. If an EIN is given,
 *                                 params.query.state MUST also be set to
 *                                 the state the entity files in — there
 *                                 is no national EIN index.
 *   - params.query.state          2-letter state code; required when
 *                                 entity.id is an EIN.
 *   - params.query.ein            optional EIN to filter to.
 *   - params.query.nameContains   optional case-insensitive substring
 *                                 match on the organization name.
 *   - params.query.maxRows        default 200, hard cap 5000.
 *
 * The connector downloads the state CSV once per call (the files are
 * not paginated upstream), parses rows, and applies the filters in
 * memory. Returned payload is JSON, not CSV — already structured.
 */

import {
  RetrievalError,
  type FetchParams,
  type RawPayload,
  type RetrievalConnector,
} from '../interface.js';
import { httpGet, HttpError } from '../http-client.js';
import { RateLimiter } from '../rate-limiter.js';

/** Value sent as the `User-Agent` header on every BMF download. */
const USER_AGENT = 'MR mitchell.roy@sia-partners.com';
/** Base URL; `fetchIrsBmf` appends `/eo_{state}.csv` (state lower-cased). */
const BMF_BASE = 'https://www.irs.gov/pub/irs-soi';
/** Response-size ceiling (128 MiB) handed to `httpGet` as `maxBodyBytes`. */
const BMF_MAX_BYTES = 128 * 1024 * 1024;

/** Row cap `clampRows` falls back to when no usable `maxRows` is given. */
const DEFAULT_MAX_ROWS = 200;
/** Upper bound `clampRows` applies to any caller-supplied `maxRows`. */
const HARD_MAX_ROWS = 5000;

/**
 * Module-level limiter; `fetchIrsBmf` awaits `limiter.acquire()` before each
 * download, so every caller in this module shares it.
 * NOTE(review): presumably 1 request/second with bursts of 2 — confirm
 * against the RateLimiter implementation.
 */
const limiter = new RateLimiter({ requestsPerSecond: 1, burstSize: 2 });

/**
 * Upper-case codes accepted as the `{state}` part of the file name: the 50
 * states, DC, PR, VI, GU, MP, AS, and the pseudo-code XX (eo_xx.csv).
 * Both the connector and `fetchIrsBmf` reject anything not listed here
 * before any network request is made.
 * NOTE(review): membership here does not guarantee the IRS publishes a file
 * for the code — a missing file comes back as a 404, which
 * `translateHttpError` maps to 'no-content'.
 */
const VALID_STATES = new Set([
  'AL','AK','AZ','AR','CA','CO','CT','DE','FL','GA','HI','ID','IL','IN','IA',
  'KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ',
  'NM','NY','NC','ND','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VT',
  'VA','WA','WV','WI','WY','DC','PR','VI','GU','MP','AS','XX',
]);

/**
 * One organization record parsed from an EO-BMF CSV line.
 *
 * `ein` and `name` are always present (trimmed; empty string when the
 * column is missing from the header). Every other text field is the
 * trimmed cell value, or `undefined` when the column is absent or the cell
 * is blank. The three `*Amt` fields are numbers, `null` when the cell is
 * blank or not numeric, and `undefined` when the column is absent.
 */
export interface IrsBmfRow {
  /** `EIN` column, as it appears in the file (not hyphen-normalised). */
  readonly ein: string;
  /** `NAME` column. */
  readonly name: string;
  /** `ICO` column. */
  readonly ico?: string;
  /** `STREET` column. */
  readonly street?: string;
  /** `CITY` column. */
  readonly city?: string;
  /** `STATE` column. */
  readonly state?: string;
  /** `ZIP` column. */
  readonly zip?: string;
  /** `SUBSECTION` column. */
  readonly subsection?: string;
  /** `CLASSIFICATION` column. */
  readonly classification?: string;
  /** `RULING` column, kept as the raw string. */
  readonly rulingYearMonth?: string;
  /** `FOUNDATION` column. */
  readonly foundationCode?: string;
  /** `ACTIVITY` column. */
  readonly activity?: string;
  /** `STATUS` column. */
  readonly status?: string;
  /** `TAX_PERIOD` column, kept as the raw string. */
  readonly taxPeriod?: string;
  /** `ASSET_CD` column. */
  readonly assetCode?: string;
  /** `INCOME_CD` column. */
  readonly incomeCode?: string;
  /** `FILING_REQ_CD` column. */
  readonly filingReqCode?: string;
  /** `ASSET_AMT` column parsed as a number. */
  readonly assetAmt?: number | null;
  /** `INCOME_AMT` column parsed as a number. */
  readonly incomeAmt?: number | null;
  /** `REVENUE_AMT` column parsed as a number. */
  readonly revenueAmt?: number | null;
  /** `NTEE_CD` column. */
  readonly nteeCode?: string;
  /** `SORT_NAME` column. */
  readonly sortName?: string;
}

/** Structured outcome of one `fetchIrsBmf` call. */
export interface IrsBmfResult {
  /** Upper-cased state code whose file was downloaded. */
  readonly state: string;
  /** EIN filter that was applied, with hyphens removed; absent when none. */
  readonly einFilter?: string;
  /** Name substring filter exactly as the caller supplied it. */
  readonly nameContains?: string;
  /** Count of non-empty data lines reported by the CSV parser (`totalRows`). */
  readonly totalRowsInFile: number;
  /** Length of `rows`. */
  readonly rowsReturned: number;
  /** Rows that passed the filters, in file order, limited by the row cap. */
  readonly rows: readonly IrsBmfRow[];
  /** URL the CSV was downloaded from. */
  readonly sourceUrl: string;
  /** ISO-8601 timestamp (`Date#toISOString`) taken once parsing finished. */
  readonly capturedAt: string;
}

/**
 * Retrieval connector exposing the IRS EO-BMF per-state files.
 *
 * `fetch` works out which state file to read and which EIN (if any) to
 * filter on, delegates the download and parsing to `fetchIrsBmf`, and wraps
 * the structured result as a JSON `RawPayload`.
 */
export class IrsBmfConnector implements RetrievalConnector {
  readonly name = 'irs-bmf';
  readonly authRequired = false;
  readonly rateLimit = { requestsPerSecond: 1, burstSize: 2 };

  /** Always resolves to `true`; no probe request is made. */
  async isAvailable(): Promise<boolean> {
    return true;
  }

  /**
   * Resolve the request to a state file plus optional filters and fetch it.
   *
   * State resolution: `query.state` wins when present; otherwise `entity.id`
   * must itself be a 2-letter code. EIN resolution: an EIN-shaped
   * `entity.id` is used only together with `query.state`; in every other
   * case `query.ein` is used.
   *
   * @throws RetrievalError 'invalid-request' when no state can be derived or
   *         the derived code is not in `VALID_STATES`; other categories may
   *         propagate from `fetchIrsBmf`.
   */
  async fetch(params: FetchParams): Promise<RawPayload> {
    const entityId = params.entity?.id?.trim() ?? '';
    const stateFromQuery = strParam(params.query?.state);

    // Without query.state, entity.id is the only possible source of a state.
    if (stateFromQuery === undefined && !looksLikeState(entityId)) {
      const reason = looksLikeEin(entityId)
        ? 'irs-bmf: when entity.id is an EIN, query.state (2-letter) is required (no national index).'
        : 'irs-bmf: entity.id must be a 2-letter U.S. state code, or pass query.state.';
      throw new RetrievalError('invalid-request', reason);
    }

    const state = (stateFromQuery ?? entityId).toUpperCase();
    const einFilter =
      stateFromQuery !== undefined && looksLikeEin(entityId)
        ? normaliseEin(entityId)
        : strParam(params.query?.ein);

    if (!VALID_STATES.has(state)) {
      throw new RetrievalError('invalid-request', `irs-bmf: unknown state code "${state}".`);
    }

    const result = await fetchIrsBmf({
      state,
      ein: einFilter,
      nameContains: strParam(params.query?.nameContains),
      maxRows: clampRows(params.query?.maxRows),
    });

    const metadata = {
      state,
      einFilter: result.einFilter ?? null,
      nameContains: result.nameContains ?? null,
      totalRowsInFile: result.totalRowsInFile,
      rowsReturned: result.rowsReturned,
    };
    return {
      source: this.name,
      sourceUrl: result.sourceUrl,
      capturedAt: result.capturedAt,
      contentType: 'application/json',
      rawContent: JSON.stringify(result),
      metadata,
    };
  }
}

/** Arguments accepted by `fetchIrsBmf`. */
export interface IrsBmfFetchOptions {
  /** State code (any case); must be a member of `VALID_STATES` once upper-cased. */
  readonly state: string;
  /** Optional EIN to filter on; hyphens are stripped before comparison. */
  readonly ein?: string;
  /** Optional substring matched case-insensitively against the organization name. */
  readonly nameContains?: string;
  /** Optional row cap; passed through `clampRows` (default and hard limit apply). */
  readonly maxRows?: number;
}

/**
 * Download one state's EO-BMF CSV and return the filtered, structured rows.
 *
 * The whole file is fetched in a single request (after waiting on the
 * module-level rate limiter) and then filtered in memory.
 *
 * @param opts  State code plus optional EIN / name filters and row cap.
 * @returns The parsed rows together with the source URL and capture time.
 * @throws RetrievalError 'invalid-request' for an unknown state code, or
 *         whatever `translateHttpError` produces for a failed download.
 */
export async function fetchIrsBmf(opts: IrsBmfFetchOptions): Promise<IrsBmfResult> {
  const stateCode = opts.state.toUpperCase();
  if (!VALID_STATES.has(stateCode)) {
    throw new RetrievalError('invalid-request', `fetchIrsBmf: unknown state "${stateCode}".`);
  }

  const rowCap = clampRows(opts.maxRows);
  const einFilter = opts.ein ? normaliseEin(opts.ein) : undefined;
  const nameUpper = opts.nameContains ? opts.nameContains.toUpperCase() : undefined;
  const sourceUrl = [BMF_BASE, `eo_${stateCode.toLowerCase()}.csv`].join('/');

  // Kept as a try/catch inside an async helper so both synchronous throws
  // and rejections from httpGet are translated.
  const download = async () => {
    try {
      return await httpGet(sourceUrl, {
        headers: { 'User-Agent': USER_AGENT, Accept: 'text/csv,text/plain' },
        maxBodyBytes: BMF_MAX_BYTES,
      });
    } catch (err) {
      throw translateHttpError(err, sourceUrl);
    }
  };

  await limiter.acquire();
  const response = await download();
  const parsed = parseBmfCsv(response.body, { einFilter, nameUpper, cap: rowCap });

  return {
    state: stateCode,
    einFilter,
    nameContains: opts.nameContains,
    totalRowsInFile: parsed.totalRows,
    rowsReturned: parsed.rows.length,
    rows: parsed.rows,
    sourceUrl,
    capturedAt: new Date().toISOString(),
  };
}

/** Filters and limit consumed by `parseBmfCsv`. */
interface ParseOpts {
  /** Hyphen-free EIN; a row is kept only when its hyphen-stripped EIN equals this. */
  readonly einFilter?: string;
  /** Already upper-cased substring; compared against the upper-cased row name. */
  readonly nameUpper?: string;
  /** Maximum number of rows to collect. */
  readonly cap: number;
}

/**
 * Parse an EO-BMF CSV body into structured rows.
 *
 * The first line is treated as the header. Columns are located by name
 * (case-insensitive, surrounding whitespace ignored), so column order does
 * not matter and a missing column simply leaves the matching field unset.
 * Data lines are filtered by the optional EIN and name criteria and at
 * most `opts.cap` matching rows are materialised.
 *
 * Parsing is line-based: it assumes no quoted field contains a line break.
 *
 * @param body  Raw CSV text as downloaded.
 * @param opts  Filters and row cap; `nameUpper` must already be upper-cased.
 * @returns `totalRows` — the number of non-empty data lines in the whole
 *          body, independent of the filters and of the cap — and the
 *          matching `rows` in file order.
 */
function parseBmfCsv(body: string, opts: ParseOpts): { totalRows: number; rows: IrsBmfRow[] } {
  // A leading UTF-8 BOM would otherwise glue itself onto the first header
  // cell and make the EIN column lookup fail silently.
  const text = body.charCodeAt(0) === 0xfeff ? body.slice(1) : body;
  // split() always yields at least one element, so an empty body simply
  // produces an empty header and zero data lines.
  const lines = text.split(/\r?\n/);
  const header = parseCsvLine(lines[0] ?? '').map(h => h.trim().toUpperCase());
  const idx = (col: string): number => header.indexOf(col.toUpperCase());
  const iEin = idx('EIN');
  const iName = idx('NAME');
  const iIco = idx('ICO');
  const iStreet = idx('STREET');
  const iCity = idx('CITY');
  const iState = idx('STATE');
  const iZip = idx('ZIP');
  const iSubsection = idx('SUBSECTION');
  const iClassification = idx('CLASSIFICATION');
  const iRuling = idx('RULING');
  const iFoundation = idx('FOUNDATION');
  const iActivity = idx('ACTIVITY');
  const iStatus = idx('STATUS');
  const iTaxPeriod = idx('TAX_PERIOD');
  const iAssetCd = idx('ASSET_CD');
  const iIncomeCd = idx('INCOME_CD');
  const iFilingReq = idx('FILING_REQ_CD');
  const iAssetAmt = idx('ASSET_AMT');
  const iIncomeAmt = idx('INCOME_AMT');
  const iRevenueAmt = idx('REVENUE_AMT');
  const iNtee = idx('NTEE_CD');
  const iSortName = idx('SORT_NAME');

  const out: IrsBmfRow[] = [];
  let totalRows = 0;
  for (let n = 1; n < lines.length; n++) {
    const line = lines[n];
    if (!line) continue;
    totalRows++;
    // Once the cap is reached, keep counting lines (cheap — they are already
    // split) so totalRows describes the whole file, but skip the per-line
    // CSV parse. Checking before the push also means a cap of 0 yields no rows.
    if (out.length >= opts.cap) continue;
    const cells = parseCsvLine(line);
    const ein = iEin >= 0 ? (cells[iEin] ?? '').trim() : '';
    const name = iName >= 0 ? (cells[iName] ?? '').trim() : '';
    if (opts.einFilter && normaliseEin(ein) !== opts.einFilter) continue;
    if (opts.nameUpper && !name.toUpperCase().includes(opts.nameUpper)) continue;
    out.push({
      ein,
      name,
      ico: pick(cells, iIco),
      street: pick(cells, iStreet),
      city: pick(cells, iCity),
      state: pick(cells, iState),
      zip: pick(cells, iZip),
      subsection: pick(cells, iSubsection),
      classification: pick(cells, iClassification),
      rulingYearMonth: pick(cells, iRuling),
      foundationCode: pick(cells, iFoundation),
      activity: pick(cells, iActivity),
      status: pick(cells, iStatus),
      taxPeriod: pick(cells, iTaxPeriod),
      assetCode: pick(cells, iAssetCd),
      incomeCode: pick(cells, iIncomeCd),
      filingReqCode: pick(cells, iFilingReq),
      assetAmt: pickNum(cells, iAssetAmt),
      incomeAmt: pickNum(cells, iIncomeAmt),
      revenueAmt: pickNum(cells, iRevenueAmt),
      nteeCode: pick(cells, iNtee),
      sortName: pick(cells, iSortName),
    });
  }
  return { totalRows, rows: out };
}

/**
 * Read a text cell by column index.
 *
 * @param cells  Parsed cells of one CSV line.
 * @param i      Column index; a negative value means "column not present".
 * @returns The trimmed cell text, or `undefined` when the column is absent,
 *          the line has no such cell, or the cell is blank.
 */
function pick(cells: readonly string[], i: number): string | undefined {
  const raw = i < 0 ? undefined : cells[i];
  if (typeof raw !== 'string') return undefined;
  const trimmed = raw.trim();
  return trimmed === '' ? undefined : trimmed;
}

/**
 * Read a numeric cell by column index, tolerating thousands separators.
 *
 * @param cells  Parsed cells of one CSV line.
 * @param i      Column index; a negative value means "column not present".
 * @returns The parsed number; `null` when the cell is blank or does not
 *          hold a finite number; `undefined` when the column is absent or
 *          the line has no such cell.
 */
function pickNum(cells: readonly string[], i: number): number | null | undefined {
  if (i < 0) return undefined;
  const v = cells[i];
  if (typeof v !== 'string') return undefined;
  // Strip separators first, then test for emptiness: Number('') and
  // Number(' ') are both 0, so a cell holding only commas/whitespace would
  // otherwise be reported as a genuine zero amount.
  const digits = v.replace(/,/g, '').trim();
  if (!digits) return null;
  const n = Number(digits);
  return Number.isFinite(n) ? n : null;
}

/**
 * Split a single CSV line into its cells.
 *
 * A double quote toggles quoted mode wherever it appears; inside quoted
 * mode a comma is literal and a doubled quote (`""`) yields one literal
 * quote character. An unterminated quote simply runs to the end of the
 * line. The result always contains at least one cell.
 */
function parseCsvLine(line: string): string[] {
  const cells: string[] = [];
  let cell = '';
  let quoted = false;
  let pos = 0;
  while (pos < line.length) {
    const c = line.charAt(pos);
    pos += 1;
    if (c === '"') {
      if (quoted && line.charAt(pos) === '"') {
        // Escaped quote inside a quoted field: emit one quote, skip its twin.
        cell += '"';
        pos += 1;
      } else {
        quoted = !quoted;
      }
      continue;
    }
    if (c === ',' && !quoted) {
      cells.push(cell);
      cell = '';
      continue;
    }
    cell += c;
  }
  cells.push(cell);
  return cells;
}

/** True when `s` is exactly two ASCII letters (either case). */
function looksLikeState(s: string): boolean {
  const onlyAsciiLetters = /^[A-Za-z]+$/;
  return s.length === 2 && onlyAsciiLetters.test(s);
}

/** True when `s` is nine digits, optionally hyphenated as `NN-NNNNNNN`. */
function looksLikeEin(s: string): boolean {
  const einShape = /^(?:\d{9}|\d{2}-\d{7})$/;
  return einShape.test(s);
}

/** Remove every hyphen from `s`; nothing else is changed or validated. */
function normaliseEin(s: string): string {
  return s.split('-').join('');
}

/**
 * Coerce a loosely-typed parameter to a usable string.
 *
 * @returns The trimmed string, or `undefined` when `v` is not a string or
 *          is blank after trimming.
 */
function strParam(v: unknown): string | undefined {
  if (typeof v !== 'string') return undefined;
  const trimmed = v.trim();
  return trimmed === '' ? undefined : trimmed;
}

/**
 * Turn a caller-supplied row limit into a usable whole-number cap.
 *
 * @param raw  Requested limit; may be any value.
 * @returns `DEFAULT_MAX_ROWS` when `raw` is not a finite number or is below
 *          one whole row; otherwise `raw` rounded down and limited to
 *          `HARD_MAX_ROWS`. The result is therefore always at least 1.
 */
function clampRows(raw: unknown): number {
  if (typeof raw !== 'number' || !Number.isFinite(raw)) return DEFAULT_MAX_ROWS;
  // Round down before the positivity check: a fraction such as 0.5 is
  // greater than zero but floors to 0, which would otherwise escape as a
  // zero cap.
  const whole = Math.floor(raw);
  if (whole < 1) return DEFAULT_MAX_ROWS;
  return Math.min(whole, HARD_MAX_ROWS);
}

/**
 * Convert a failure thrown by `httpGet` into a `RetrievalError`.
 *
 * Mapping for `HttpError`:
 *   - timeout / network / aborted      → 'unavailable'
 *   - body-too-large                   → 'internal'
 *   - status 404                       → 'no-content'
 *   - status 401 or 403                → 'auth-failed'
 *   - status 429                       → 'rate-limited'
 *   - status 400                       → 'invalid-request'
 *   - any other status                 → 'internal'
 * Anything else (including an `HttpError` with an unlisted category)
 * becomes 'internal', using the error's message or its string form.
 *
 * @param err  The caught value.
 * @param url  Request URL, attached to the resulting error as `{ url }`.
 */
function translateHttpError(err: unknown, url: string): RetrievalError {
  const context = { url };

  if (err instanceof HttpError) {
    const category = err.category;
    if (category === 'timeout' || category === 'network' || category === 'aborted') {
      return new RetrievalError('unavailable', err.message, context);
    }
    if (category === 'body-too-large') {
      return new RetrievalError('internal', err.message, context);
    }
    if (category === 'status') {
      switch (err.status) {
        case 404:
          return new RetrievalError('no-content', err.message, context);
        case 401:
        case 403:
          return new RetrievalError('auth-failed', err.message, context);
        case 429:
          return new RetrievalError('rate-limited', err.message, context);
        case 400:
          return new RetrievalError('invalid-request', err.message, context);
        default:
          return new RetrievalError('internal', err.message, context);
      }
    }
  }

  const fallbackMessage = err instanceof Error ? err.message : String(err);
  return new RetrievalError('internal', fallbackMessage, context);
}