node-re2 icon indicating copy to clipboard operation
node-re2 copied to clipboard

(feat) add RE2::Set bindings

Open wrmedford opened this issue 1 month ago • 5 comments

  • Added native RE2.Set binding (iterable patterns, flags/anchor parsing, match/test/toString plus flags/sources/source/size/anchor props) and exported it from the addon.
  • Hoisted regex translation/escaping into shared helpers and reused across constructors.
  • Documented the new API in README, expanded type definitions, and added functional + perf-oriented tests (including TS coverage).

Closes #43

wrmedford avatar Nov 23 '25 22:11 wrmedford

scenario patterns inputs avgLen matches RE2.Set (ms) RE2 list (ms) RegExp list (ms) notes
multi-small-50-hits 50 4000 19 4000 3.711 8.526 1.045 4k small strings, many hits
multi-small-50-nohits 50 4000 17 0 2.116 80.921 6.809 4k small strings, no hits
multi-small-200-hits 200 4000 20 4000 2.500 7.008 0.356 4k small strings, many hits
multi-small-200-nohits 200 4000 18 0 1.632 324.565 19.840 4k small strings, no hits
single-long-50-hits 50 1 1677 50 0.095 0.294 0.046 ~3 KB string, all tokens present
single-long-50-nohits 50 1 1674 0 0.012 0.246 0.038 ~3 KB string, no tokens
single-long-200-hits 200 1 6977 200 0.134 2.647 0.362 ~7 KB string, all tokens present
single-long-200-nohits 200 1 6974 0 0.033 3.906 0.469 ~8 KB string, no tokens

Ran a small bench across different implementations. This tends to perform best in situations where your patterns do not match.

wrmedford avatar Nov 23 '25 23:11 wrmedford

@wrmedford: Ran a small bench across different implementations.

Could you share code of your benchmark? I plan to add a benchmarking utility.

uhop avatar Nov 24 '25 00:11 uhop

@wrmedford: Ran a small bench across different implementations.

Could you share code of your benchmark? I plan to add a benchmarking utility.

'use strict';

const RE2 = require('./re2');

function makePatterns(n) {
  const arr = [];
  for (let i = 0; i < n; ++i) arr.push('token' + i + '(?:[a-z]+)?');
  return arr;
}

function makeInputs(patternCount, count, withHits) {
  const arr = [];
  for (let j = 0; j < count; ++j) {
    if (withHits) {
      arr.push('xx' + (j % patternCount) + ' ' + (j & 7) + ' token' + (j % patternCount) + ' tail');
    } else {
      arr.push('xx' + (j % patternCount) + ' ' + (j & 7) + ' tok' + (j % patternCount) + ' tail');
    }
  }
  return arr;
}

function makeLongAllHits(patternCount) {
  const parts = [];
  for (let i = 0; i < patternCount; ++i) parts.push('some prefix ' + i + ' token' + i + ' suffix ' + (i & 7));
  return parts.join(' | ');
}

function makeLongNoHits(patternCount) {
  const parts = [];
  for (let i = 0; i < patternCount; ++i) parts.push('item' + i + ' nohit ' + (i & 7));
  return parts.join(' | ').repeat(2);
}

function measure(fn) {
  const start = process.hrtime.bigint();
  const result = fn();
  const ms = Number(process.hrtime.bigint() - start) / 1e6;
  return { timeMs: ms, result };
}

const configs = [
  { name: 'multi-small-50-hits', patterns: 50, inputs: makeInputs(50, 4000, true), mode: 'multi', desc: '4k small strings, many hits' },
  { name: 'multi-small-50-nohits', patterns: 50, inputs: makeInputs(50, 4000, false), mode: 'multi', desc: '4k small strings, no hits' },
  { name: 'multi-small-200-hits', patterns: 200, inputs: makeInputs(200, 4000, true), mode: 'multi', desc: '4k small strings, many hits' },
  { name: 'multi-small-200-nohits', patterns: 200, inputs: makeInputs(200, 4000, false), mode: 'multi', desc: '4k small strings, no hits' },
  { name: 'single-long-50-hits', patterns: 50, inputs: [makeLongAllHits(50)], mode: 'single', desc: '~3 KB string, all tokens present' },
  { name: 'single-long-50-nohits', patterns: 50, inputs: [makeLongNoHits(50)], mode: 'single', desc: '~3 KB string, no tokens' },
  { name: 'single-long-200-hits', patterns: 200, inputs: [makeLongAllHits(200)], mode: 'single', desc: '~7 KB string, all tokens present' },
  { name: 'single-long-200-nohits', patterns: 200, inputs: [makeLongNoHits(200)], mode: 'single', desc: '~8 KB string, no tokens' },
];

const results = [];

for (const cfg of configs) {
  const patterns = makePatterns(cfg.patterns);
  const set = new RE2.Set(patterns);
  const re2List = patterns.map((p) => new RE2(p));
  const jsList = patterns.map((p) => new RegExp(p));

  const avgLen = cfg.inputs.reduce((n, s) => n + s.length, 0) / cfg.inputs.length;

  if (cfg.mode === 'multi') {
    const setRes = measure(() => {
      let m = 0;
      for (const s of cfg.inputs) m += set.test(s) ? 1 : 0;
      return m;
    });
    const re2Res = measure(() => {
      let m = 0;
      for (const s of cfg.inputs) {
        for (const re of re2List) { if (re.test(s)) { ++m; break; } }
      }
      return m;
    });
    const jsRes = measure(() => {
      let m = 0;
      for (const s of cfg.inputs) {
        for (const re of jsList) { if (re.test(s)) { ++m; break; } }
      }
      return m;
    });
    results.push({ ...cfg, avgLen, inputsCount: cfg.inputs.length, set: setRes.timeMs, re2: re2Res.timeMs, js: jsRes.timeMs, matches: setRes.result });
  } else {
    const setRes = measure(() => set.match(cfg.inputs[0]).length);
    const re2Res = measure(() => re2List.reduce((n, re) => n + (re.test(cfg.inputs[0]) ? 1 : 0), 0));
    const jsRes = measure(() => jsList.reduce((n, re) => n + (re.test(cfg.inputs[0]) ? 1 : 0), 0));
    results.push({ ...cfg, avgLen, inputsCount: cfg.inputs.length, set: setRes.timeMs, re2: re2Res.timeMs, js: jsRes.timeMs, matches: setRes.result });
  }
}

console.table(results.map(r => ({
  scenario: r.name,
  patterns: r.patterns,
  inputs: r.inputsCount,
  avgInputLen: Math.round(r.avgLen),
  matches: r.matches,
  setMs: r.set.toFixed(3),
  re2Ms: r.re2.toFixed(3),
  jsMs: r.js.toFixed(3),
  desc: r.desc,
})));

Let me know if you hit any issues.

wrmedford avatar Nov 24 '25 00:11 wrmedford

Hey @uhop, just following up on this to check if there's any changes needed or anything I can help with to get this merged :)

wrmedford avatar Dec 05 '25 20:12 wrmedford

Looks good, just swamped at the moment to do the proper due diligence.

uhop avatar Dec 05 '25 22:12 uhop

Not bad. Not bad at all:

image

This is your perf test from the test file.

uhop avatar Dec 19 '25 23:12 uhop