custom-metrics
custom-metrics copied to clipboard
Avoid storing empty entries to optimise the column size in BigQuery
On the custom metrics objects just storing the whole schema itself is a big volume of data. We could prune empty values that don't hold any meaningful value.
For example, the entries that equal to
-
undefined
-
null
and potentially
-
[]
-
{}
Example: https://www.diffchecker.com/YcalWAbO/
This can be set up as a test on the custom_metrics repo PRs, to point to required optimisations in custom metrics code. Or alternatively trimmed automatically on the crawler side.
Estimated saving - 3%.
CREATE TEMP FUNCTION TRIM_EMPTY_ENTRIES(json_str STRING, remove_empty BOOL)
RETURNS STRING
LANGUAGE js AS """
function isEmpty(value) {
if (value === null || value === undefined) {
return true;
}
if (typeof value === 'string' && value.trim() === '') {
return true;
}
if (Array.isArray(value) && value.length === 0) {
return true;
}
if (typeof value === 'object' && !Array.isArray(value)) {
return Object.keys(value).length === 0;
}
return false;
}
function findEmptyEntries(obj, currentPath = '', removeEmpty = false) {
let emptyPaths = [];
if (typeof obj === 'object' && obj !== null) {
for (let key in obj) {
if (obj.hasOwnProperty(key)) {
const newPath = currentPath ? `${currentPath}.${key}` : key;
if (isEmpty(obj[key])) {
emptyPaths.push(newPath);
if (removeEmpty) {
delete obj[key];
}
} else if (typeof obj[key] === 'object') {
emptyPaths = emptyPaths.concat(findEmptyEntries(obj[key], newPath, removeEmpty));
// Remove empty objects after recursion if needed
if (removeEmpty && isEmpty(obj[key])) {
delete obj[key];
}
}
}
}
}
return emptyPaths;
}
try {
let obj = JSON.parse(json_str);
findEmptyEntries(obj, '', remove_empty);
return JSON.stringify(obj);
} catch {
return json_str;
}
""";
SELECT
page,
custom_metrics,
TRIM_EMPTY_ENTRIES(custom_metrics, true) AS cleaned_custom_metrics
FROM `all.pages` TABLESAMPLE SYSTEM (0.1 PERCENT)
WHERE date = '2024-08-01'