custom-metrics icon indicating copy to clipboard operation
custom-metrics copied to clipboard

Avoid storing empty entries to optimise the column size in BigQuery

Open max-ostapenko opened this issue 5 months ago • 0 comments

On the custom metrics objects just storing the whole schema itself is a big volume of data. We could prune empty values that don't hold any meaningful value.

For example, the entries that equal to

  • undefined
  • null

and potentially

  • []
  • {}

Example: https://www.diffchecker.com/YcalWAbO/

This can be set up as a test on the custom_metrics repo PRs, to point to required optimisations in custom metrics code. Or alternatively trimmed automatically on the crawler side.

Estimated saving - 3%.

CREATE TEMP FUNCTION TRIM_EMPTY_ENTRIES(json_str STRING, remove_empty BOOL)
RETURNS STRING
LANGUAGE js AS """
function isEmpty(value) {
    if (value === null || value === undefined) {
        return true;
    }
    
    if (typeof value === 'string' && value.trim() === '') {
        return true;
    }
    
    if (Array.isArray(value) && value.length === 0) {
        return true;
    }
    
    if (typeof value === 'object' && !Array.isArray(value)) {
        return Object.keys(value).length === 0;
    }
    
    return false;
}

function findEmptyEntries(obj, currentPath = '', removeEmpty = false) {
    let emptyPaths = [];

    if (typeof obj === 'object' && obj !== null) {
        for (let key in obj) {
            if (obj.hasOwnProperty(key)) {
                const newPath = currentPath ? `${currentPath}.${key}` : key;

                if (isEmpty(obj[key])) {
                    emptyPaths.push(newPath);
                    if (removeEmpty) {
                        delete obj[key];
                    }
                } else if (typeof obj[key] === 'object') {
                    emptyPaths = emptyPaths.concat(findEmptyEntries(obj[key], newPath, removeEmpty));
                    
                    // Remove empty objects after recursion if needed
                    if (removeEmpty && isEmpty(obj[key])) {
                        delete obj[key];
                    }
                }
            }
        }
    }

    return emptyPaths;
}

try {
  let obj = JSON.parse(json_str);
  findEmptyEntries(obj, '', remove_empty);
  return JSON.stringify(obj);
} catch {
  return json_str;
}
""";

SELECT
  page,
  custom_metrics,
  TRIM_EMPTY_ENTRIES(custom_metrics, true) AS cleaned_custom_metrics
FROM `all.pages` TABLESAMPLE SYSTEM (0.1 PERCENT)
WHERE date = '2024-08-01'

max-ostapenko avatar Sep 08 '24 21:09 max-ostapenko