(feat) add RE2::Set bindings
- Added native
RE2.Setbinding (iterable patterns, flags/anchor parsing, match/test/toString plus flags/sources/source/size/anchor props) and exported it from the addon. - Hoisted regex translation/escaping into shared helpers and reused across constructors.
- Documented the new API in README, expanded type definitions, and added functional + perf-oriented tests (including TS coverage).
Closes #43
| scenario | patterns | inputs | avgLen | matches | RE2.Set (ms) | RE2 list (ms) | RegExp list (ms) | notes |
|---|---|---|---|---|---|---|---|---|
| multi-small-50-hits | 50 | 4000 | 19 | 4000 | 3.711 | 8.526 | 1.045 | 4k small strings, many hits |
| multi-small-50-nohits | 50 | 4000 | 17 | 0 | 2.116 | 80.921 | 6.809 | 4k small strings, no hits |
| multi-small-200-hits | 200 | 4000 | 20 | 4000 | 2.500 | 7.008 | 0.356 | 4k small strings, many hits |
| multi-small-200-nohits | 200 | 4000 | 18 | 0 | 1.632 | 324.565 | 19.840 | 4k small strings, no hits |
| single-long-50-hits | 50 | 1 | 1677 | 50 | 0.095 | 0.294 | 0.046 | ~3 KB string, all tokens present |
| single-long-50-nohits | 50 | 1 | 1674 | 0 | 0.012 | 0.246 | 0.038 | ~3 KB string, no tokens |
| single-long-200-hits | 200 | 1 | 6977 | 200 | 0.134 | 2.647 | 0.362 | ~7 KB string, all tokens present |
| single-long-200-nohits | 200 | 1 | 6974 | 0 | 0.033 | 3.906 | 0.469 | ~8 KB string, no tokens |
Ran a small bench across different implementations. This tends to perform best in situations where your patterns do not match.
@wrmedford: Ran a small bench across different implementations.
Could you share code of your benchmark? I plan to add a benchmarking utility.
@wrmedford: Ran a small bench across different implementations.
Could you share code of your benchmark? I plan to add a benchmarking utility.
'use strict';
const RE2 = require('./re2');
function makePatterns(n) {
const arr = [];
for (let i = 0; i < n; ++i) arr.push('token' + i + '(?:[a-z]+)?');
return arr;
}
function makeInputs(patternCount, count, withHits) {
const arr = [];
for (let j = 0; j < count; ++j) {
if (withHits) {
arr.push('xx' + (j % patternCount) + ' ' + (j & 7) + ' token' + (j % patternCount) + ' tail');
} else {
arr.push('xx' + (j % patternCount) + ' ' + (j & 7) + ' tok' + (j % patternCount) + ' tail');
}
}
return arr;
}
function makeLongAllHits(patternCount) {
const parts = [];
for (let i = 0; i < patternCount; ++i) parts.push('some prefix ' + i + ' token' + i + ' suffix ' + (i & 7));
return parts.join(' | ');
}
function makeLongNoHits(patternCount) {
const parts = [];
for (let i = 0; i < patternCount; ++i) parts.push('item' + i + ' nohit ' + (i & 7));
return parts.join(' | ').repeat(2);
}
function measure(fn) {
const start = process.hrtime.bigint();
const result = fn();
const ms = Number(process.hrtime.bigint() - start) / 1e6;
return { timeMs: ms, result };
}
const configs = [
{ name: 'multi-small-50-hits', patterns: 50, inputs: makeInputs(50, 4000, true), mode: 'multi', desc: '4k small strings, many hits' },
{ name: 'multi-small-50-nohits', patterns: 50, inputs: makeInputs(50, 4000, false), mode: 'multi', desc: '4k small strings, no hits' },
{ name: 'multi-small-200-hits', patterns: 200, inputs: makeInputs(200, 4000, true), mode: 'multi', desc: '4k small strings, many hits' },
{ name: 'multi-small-200-nohits', patterns: 200, inputs: makeInputs(200, 4000, false), mode: 'multi', desc: '4k small strings, no hits' },
{ name: 'single-long-50-hits', patterns: 50, inputs: [makeLongAllHits(50)], mode: 'single', desc: '~3 KB string, all tokens present' },
{ name: 'single-long-50-nohits', patterns: 50, inputs: [makeLongNoHits(50)], mode: 'single', desc: '~3 KB string, no tokens' },
{ name: 'single-long-200-hits', patterns: 200, inputs: [makeLongAllHits(200)], mode: 'single', desc: '~7 KB string, all tokens present' },
{ name: 'single-long-200-nohits', patterns: 200, inputs: [makeLongNoHits(200)], mode: 'single', desc: '~8 KB string, no tokens' },
];
const results = [];
for (const cfg of configs) {
const patterns = makePatterns(cfg.patterns);
const set = new RE2.Set(patterns);
const re2List = patterns.map((p) => new RE2(p));
const jsList = patterns.map((p) => new RegExp(p));
const avgLen = cfg.inputs.reduce((n, s) => n + s.length, 0) / cfg.inputs.length;
if (cfg.mode === 'multi') {
const setRes = measure(() => {
let m = 0;
for (const s of cfg.inputs) m += set.test(s) ? 1 : 0;
return m;
});
const re2Res = measure(() => {
let m = 0;
for (const s of cfg.inputs) {
for (const re of re2List) { if (re.test(s)) { ++m; break; } }
}
return m;
});
const jsRes = measure(() => {
let m = 0;
for (const s of cfg.inputs) {
for (const re of jsList) { if (re.test(s)) { ++m; break; } }
}
return m;
});
results.push({ ...cfg, avgLen, inputsCount: cfg.inputs.length, set: setRes.timeMs, re2: re2Res.timeMs, js: jsRes.timeMs, matches: setRes.result });
} else {
const setRes = measure(() => set.match(cfg.inputs[0]).length);
const re2Res = measure(() => re2List.reduce((n, re) => n + (re.test(cfg.inputs[0]) ? 1 : 0), 0));
const jsRes = measure(() => jsList.reduce((n, re) => n + (re.test(cfg.inputs[0]) ? 1 : 0), 0));
results.push({ ...cfg, avgLen, inputsCount: cfg.inputs.length, set: setRes.timeMs, re2: re2Res.timeMs, js: jsRes.timeMs, matches: setRes.result });
}
}
console.table(results.map(r => ({
scenario: r.name,
patterns: r.patterns,
inputs: r.inputsCount,
avgInputLen: Math.round(r.avgLen),
matches: r.matches,
setMs: r.set.toFixed(3),
re2Ms: r.re2.toFixed(3),
jsMs: r.js.toFixed(3),
desc: r.desc,
})));
Let me know if you hit any issues.
Hey @uhop, just following up on this to check if there's any changes needed or anything I can help with to get this merged :)
Looks good, just swamped at the moment to do the proper due diligence.
Not bad. Not bad at all:
This is your perf test from the test file.