api unicode: ensure 140 char text limit is unicode aware

unicode: ensure 140 char text limit is unicode aware

Open missinglink opened this issue 2 years ago • 1 comments

Pelias enforces a 140 character limit for the ?text= param for securty/performance reasons.

Prior to this PR the two methods used to implement this behaviour (namely String.prototype.length and String.prototype.substring) were not unicode aware.

The linked issue noted that this can cause problems when the substring function truncates the string in a place where a 'unicode code point' is represented by multiple 'unicode code units'.

In this case we end up slicing it such a way that the encoding becomes invalid (ie. cutting a two-byte utf8 char after the first byte).

I originally used the package stringz, which is a fairly thin wrapper around the char-regex package.

..but after having a look through the code I discovered that char-regex was ported from lodash and I managed to get a reference to the 'internal' function, so we don't need to bring in any new dependencies, plus lodash is well maintained.

note: I had a look at trying to detect the charset and handling utf16->utf8 conversion but it's very complicated, at some point I'd be happy with simply detecting and erroring on any non-utf8 charset but even that is not trivial.

Nov 02 '21 09:11 missinglink

I'm moving this to DRAFT for now pending further discussion.

The current pushed state only implements this functionality for the libpostal parser but not for pelias, the diff below applies it equally to both.

The issue I'm now having (after pulling in https://github.com/pelias/api/pull/1577) is writing a testcase for this, since it was easy enough to use emoji to make it fail, but now they are being correctly stripped. Other multi-byte characters such as Kanji have the correct length in JS and the Combining Marks are already being handled, so I can't think of any unicode range (other than using UTF-16 surrogate pairs) which would serve as a good unit test fixture. Maybe that means we don't need this 🤷‍♂️

diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js
index 89f547d4..7b419b52 100644
--- a/sanitizer/_text_pelias_parser.js
+++ b/sanitizer/_text_pelias_parser.js
@@ -1,10 +1,10 @@
+const _ = require('lodash');
 const logger = require('pelias-logger').get('api');
 const unicode = require('../helper/unicode');
 const Tokenizer = require('pelias-parser/tokenization/Tokenizer');
 const Solution = require('pelias-parser/solver/Solution');
 const AddressParser = require('pelias-parser/parser/AddressParser');
 const parser = new AddressParser();
-const _ = require('lodash');
 const MAX_TEXT_LENGTH = 140;
 
 /**
@@ -38,9 +38,9 @@ function _sanitize (raw, clean) {
   else {
 
     // truncate text to $MAX_TEXT_LENGTH chars
-    if (text.length > MAX_TEXT_LENGTH) {
+    if (unicode.length(text) > MAX_TEXT_LENGTH) {
       messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
-      text = text.substring(0, MAX_TEXT_LENGTH);
+      text = unicode.substring(text, 0, MAX_TEXT_LENGTH);
     }
 
     // parse text with pelias/parser
@@ -53,7 +53,7 @@ function _sanitize (raw, clean) {
 }
 
 function parse (clean) {
-  
+
   // parse text
   let start = new Date();
   const t = new Tokenizer(clean.text);
@@ -225,7 +225,7 @@ function parse (clean) {
       }
     }
   }
-  
+
   // unknown query type
   else {
     parsed_text.subject = t.span.body;
diff --git a/test/unit/sanitizer/_text.js b/test/unit/sanitizer/_text.js
index cb55e120..e3492722 100644
--- a/test/unit/sanitizer/_text.js
+++ b/test/unit/sanitizer/_text.js
@@ -143,9 +143,9 @@ module.exports.tests.text_parser = function(test, common) {
   test('should truncate very long text inputs', (t) => {
     const raw = { text: `
 Sometimes we make the process more complicated than we need to.
-We will never make a journey of a thousand miles by fretting about 
+We will never make a journey of a thousand miles by fretting about
 how long it will take or how hard it will be.
-We make the journey by taking each day step by step and then repeating 
+We make the journey by taking each day step by step and then repeating
 it again and again until we reach our destination.` };
     const clean = {};
     const messages = sanitizer.sanitize(raw, clean);
@@ -158,12 +158,12 @@ it again and again until we reach our destination.` };
 
   // https://github.com/pelias/api/issues/1574
   test('truncate should be unicode aware', (t) => {
-    const raw = { text: 'a' + '👩‍❤️‍👩'.repeat(200) };
+    const raw = { text: 'a' + '慶'.repeat(200) };
     const clean = {};
     const messages = sanitizer.sanitize(raw, clean);
 
     t.equals(unicode.length(clean.text), 140);
-    t.equals(clean.text, 'a' + '👩‍❤️‍👩'.repeat(139));
+    t.equals(clean.text, 'a' + '慶'.repeat(139));
     t.deepEquals(messages.errors, [], 'no errors');
     t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
     t.end();
diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js
index a169d5d9..848aded7 100644
--- a/test/unit/sanitizer/_text_pelias_parser.js
+++ b/test/unit/sanitizer/_text_pelias_parser.js
@@ -1,5 +1,5 @@
-var sanitizer = require('../../../sanitizer/_text_pelias_parser')();
-var type_mapping = require('../../../helper/type_mapping');
+const sanitizer = require('../../../sanitizer/_text_pelias_parser')();
+const unicode = require('../../../helper/unicode');
 
 module.exports.tests = {};
 
@@ -20,7 +20,7 @@ module.exports.tests.text_parser = function (test, common) {
   });
 
   let cases = [];
-  
+
   // USA queries
   cases.push(['soho, new york, NY', {
     subject: 'soho',
@@ -322,7 +322,7 @@ module.exports.tests.text_parser = function (test, common) {
   cases.push(['New York', { subject: 'New York' }, true]);
   cases.push(['New York N', { subject: 'New York' }, true]);
   cases.push(['New York NY', { subject: 'New York' }, true]);
-  
+
   cases.push(['B', { subject: 'B' }, true]);
   cases.push(['Be', { subject: 'Be' }, true]);
   cases.push(['Ber', { subject: 'Ber' }, true]);
@@ -421,9 +421,9 @@ module.exports.tests.text_parser = function (test, common) {
     const raw = {
       text: `
 Sometimes we make the process more complicated than we need to.
-We will never make a journey of a thousand miles by fretting about 
+We will never make a journey of a thousand miles by fretting about
 how long it will take or how hard it will be.
-We make the journey by taking each day step by step and then repeating 
+We make the journey by taking each day step by step and then repeating
 it again and again until we reach our destination.` };
     const clean = {};
     const messages = sanitizer.sanitize(raw, clean);
@@ -433,6 +433,19 @@ it again and again until we reach our destination.` };
     t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
     t.end();
   });
+
+  // https://github.com/pelias/api/issues/1574
+  test('truncate should be unicode aware', (t) => {
+    const raw = { text: 'a' + '慶'.repeat(200) };
+    const clean = {};
+    const messages = sanitizer.sanitize(raw, clean);
+
+    t.equals(unicode.length(clean.text), 140);
+    t.equals(clean.text, 'a' + '慶'.repeat(139));
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
+    t.end();
+  });
 };
 
 module.exports.all = function (tape, common) {

Nov 02 '21 09:11 missinglink

api api copied to clipboard

unicode: ensure 140 char text limit is unicode aware

api
api copied to clipboard