XHTML content is parsed out of order?
After parsing when I attempt to access the content using
feedItem.contentSnippet
the content is mixed up like this
'Start: 2024-08-06T08:45:00+00:00 Estimated end: 2024-08-08T13:00:00+00:00 We\n' +
'will be performing routine maintenance work on cloud load balancers in Helsinki.\n' +
'During this maintenance work, there may be a short connection loss from the\n' +
'active connections (TCP and HTTP) to the load balancers, or from the load\n' +
'balancers to their targets. Unfortunately, the maintenance work is taking longer\n' +
'than we planned. Thank you for your understanding. We have now started the\n' +
'maintenance work.\n' +
'In_progressIn_progress2024-08-07T15:33:29+00:002024-08-06T08:45:27+00:00',
Notice that the last two "In_progress" labels are squashed together at the end, and so are their two timestamps, instead of each appearing next to its own paragraph.
The actual content looks like this (see attached file)
<strong>In_progress</strong>
<small>2024-08-07T15:33:29+00:00</small>
<p>Unfortunately, the maintenance work is taking longer than we planned. Thank you for your understanding.</p>
<strong>In_progress</strong>
<small>2024-08-06T08:45:27+00:00</small>
<p>We have now started the maintenance work. </p>
My parser is defined with a custom field as
// Shape of the extra per-item field requested below: a `category` object exposing a `term` string.
type HetznerItem = { category: { term: string } };
// Builds the feed parser used for this feed, wrapping a Parser configured with one custom item field.
// NOTE(review): RSSParser and Parser are not defined in this snippet — presumably Parser comes from rss-parser; confirm.
const hetznerParser: RSSParser = new RSSParser(new Parser<{}, HetznerItem>(
{
customFields: {
// Request the `category` element on every item (presumably in addition to the parser's default fields — confirm).
item: ['category'],
}
}
));
Hey @talonx! I'm here to help you with bug fixes, questions, and contributions. Let's tackle this issue together while we wait for a human maintainer.
To resolve the issue of your custom RSS parser outputting jumbled and out-of-order XHTML content, you should ensure that the parser processes the text nodes correctly and maintains the order of elements. Here are a few steps you can take:
- Ensure Text Nodes are Closed Properly: The `closeText` function should be called appropriately to handle text nodes correctly.
- Maintain Order of Elements: Ensure that the parser maintains the order of elements as they appear in the RSS feed.
- Handle Asynchronous Processing: If the parser processes chunks of data asynchronously, ensure that the chunks are processed in the correct order.
Here is a code snippet that highlights how to handle text nodes and maintain the order of elements:
/**
 * Flushes the text buffered on `parser.textNode`.
 *
 * The buffered text is first passed through `textopts` together with the
 * parser options; if anything is left afterwards it is emitted as an
 * `ontext` event. The buffer is always cleared at the end.
 *
 * @param {object} parser - Parser state carrying `opt` and `textNode`.
 * @returns {void}
 */
function closeText(parser) {
  const pendingText = parser.textNode;
  parser.textNode = textopts(parser.opt, pendingText);

  // Only a non-empty (truthy) result is worth reporting to listeners.
  if (parser.textNode) {
    emit(parser, 'ontext', parser.textNode);
  }

  // Reset the buffer whether or not an event was emitted.
  parser.textNode = '';
}
/**
 * Emits a node event, flushing any buffered text first.
 *
 * If `parser.textNode` holds pending text, `closeText` is called before the
 * node event is emitted, so the text is handled ahead of the node.
 *
 * @param {object} parser - Parser state carrying `textNode`.
 * @param {string} nodeType - Name of the event to emit.
 * @param {*} data - Payload forwarded to `emit`.
 * @returns {void}
 */
function emitNode(parser, nodeType, data) {
  const hasPendingText = Boolean(parser.textNode);
  if (hasPendingText) {
    closeText(parser);
  }
  emit(parser, nodeType, data);
}
/**
 * Feeds the next slice of `this.remaining` into the SAX parser.
 *
 * When the remaining input fits in one chunk (`this.options.chunkSize`), it
 * is written in full and the SAX parser is closed. Otherwise one chunk is
 * written and this method reschedules itself with `setImmediate`, so chunks
 * are always written sequentially in their original order.
 *
 * Must be invoked with `this` bound to an object exposing `remaining`,
 * `options.chunkSize`, `saxParser`, `emit` and `processAsync`.
 *
 * @returns {*} The result of `saxParser.close()` for the final chunk, the
 *   `setImmediate` handle for intermediate chunks, the result of `emit` when
 *   a failure is reported, or `undefined` when a failure was already reported.
 */
function processAsync() {
  try {
    const { chunkSize } = this.options;

    if (this.remaining.length <= chunkSize) {
      // Last (or only) chunk: write everything that is left and finish.
      const lastChunk = this.remaining;
      this.remaining = '';
      this.saxParser = this.saxParser.write(lastChunk);
      return this.saxParser.close();
    }

    // Intermediate chunk: consume the head of the buffer, then yield to the
    // event loop before handling the rest.
    const chunk = this.remaining.slice(0, chunkSize);
    this.remaining = this.remaining.slice(chunkSize);
    this.saxParser = this.saxParser.write(chunk);
    return setImmediate(this.processAsync.bind(this));
  } catch (err) {
    // Report only the first failure. The event name must be "error":
    // passing the Error object as the event name reaches no listener.
    if (!this.saxParser.errThrown) {
      this.saxParser.errThrown = true;
      return this.emit('error', err);
    }
    return undefined;
  }
}
/**
 * Resets the parser to a fresh state.
 *
 * Removes every listener registered on this emitter, creates a new SAX parser
 * from the current options, clears the result object and the element stack,
 * and installs the SAX callbacks (error, end, open tag, close tag, text,
 * CDATA) that build the result object.
 *
 * Returns the CDATA handler function, because the final statement returns the
 * value of the `oncdata` assignment.
 */
Parser.prototype.reset = function () {
var attrkey, charkey, ontext, stack;
this.removeAllListeners();
// Whitespace trimming/normalising is disabled at the SAX level here; the
// `trim` and `normalize` options are applied in the close-tag handler below.
this.saxParser = sax.parser(this.options.strict, {
trim: false,
normalize: false,
xmlns: this.options.xmlns
});
// errThrown makes sure that at most one "error" event is emitted per parse.
this.saxParser.errThrown = false;
this.saxParser.onerror = function (_this) {
return function (error) {
// Resume the SAX parser, then report the error only the first time.
_this.saxParser.resume();
if (!_this.saxParser.errThrown) {
_this.saxParser.errThrown = true;
return _this.emit("error", error);
}
};
}(this);
this.saxParser.onend = function (_this) {
return function () {
// `ended` prevents a second "end" event; the close-tag handler below also
// sets it when the root element is closed.
if (!_this.saxParser.ended) {
_this.saxParser.ended = true;
return _this.emit("end", _this.resultObject);
}
};
}(this);
this.saxParser.ended = false;
this.EXPLICIT_CHARKEY = this.options.explicitCharkey;
this.resultObject = null;
// Stack of the elements that are currently open; the last entry is the
// element whose content is being read.
stack = [];
attrkey = this.options.attrkey;
charkey = this.options.charkey;
// Open tag: create the object for the new element and push it on the stack.
this.saxParser.onopentag = function (_this) {
return function (node) {
var key, newValue, obj, processedKey, ref;
// Prototype-less object, so element/attribute names never collide with
// inherited Object properties.
obj = Object.create(null);
// Text content is accumulated under `charkey`, starting from an empty string.
obj[charkey] = "";
if (!_this.options.ignoreAttrs) {
ref = node.attributes;
for (key in ref) {
if (!hasProp.call(ref, key)) continue;
// Create the attribute container lazily, and only when attributes are not
// merged directly into the element object.
if (!(attrkey in obj) && !_this.options.mergeAttrs) {
obj[attrkey] = Object.create(null);
}
// Run the optional attribute value/name processors.
newValue = _this.options.attrValueProcessors ? processItem(_this.options.attrValueProcessors, node.attributes[key], key) : node.attributes[key];
processedKey = _this.options.attrNameProcessors ? processItem(_this.options.attrNameProcessors, key) : key;
if (_this.options.mergeAttrs) {
_this.assignOrPush(obj, processedKey, newValue);
} else {
obj[attrkey][processedKey] = newValue;
}
}
}
// Remember the (optionally processed) tag name; the close-tag handler reads it.
obj["#name"] = _this.options.tagNameProcessors ? processItem(_this.options.tagNameProcessors, node.name) : node.name;
if (_this.options.xmlns) {
obj[_this.options.xmlnskey] = {
uri: node.uri,
local: node.local
};
}
return stack.push(obj);
};
}(this);
// Close tag: finalise the element on top of the stack and attach it to its
// parent, or publish it as the result when it is the root element.
this.saxParser.onclosetag = function (_this) {
return function () {
var cdata, emptyStr, key, node, nodeName, obj, objClone, old, s, xpath;
obj = stack.pop();
nodeName = obj["#name"];
// "#name" is kept on the object only when both explicitChildren and
// preserveChildrenOrder are enabled.
if (!_this.options.explicitChildren || !_this.options.preserveChildrenOrder) {
delete obj["#name"];
}
// Move the CDATA marker (set by the CDATA handler) into a local flag.
if (obj.cdata === true) {
cdata = obj.cdata;
delete obj.cdata;
}
// `s` is the parent element, or undefined when `obj` is the root.
s = stack[stack.length - 1];
// Whitespace-only text that did not come from CDATA is dropped, but the raw
// string is remembered as a possible value for an empty tag.
if (obj[charkey].match(/^\s*$/) && !cdata) {
emptyStr = obj[charkey];
delete obj[charkey];
} else {
if (_this.options.trim) {
obj[charkey] = obj[charkey].trim();
}
if (_this.options.normalize) {
obj[charkey] = obj[charkey].replace(/\s{2,}/g, " ").trim();
}
obj[charkey] = _this.options.valueProcessors ? processItem(_this.options.valueProcessors, obj[charkey], nodeName) : obj[charkey];
// An element holding nothing but text collapses to the plain string unless
// explicitCharkey is set.
if (Object.keys(obj).length === 1 && charkey in obj && !_this.EXPLICIT_CHARKEY) {
obj = obj[charkey];
}
}
// Empty element: use the configured emptyTag (calling it when it is a
// function), or fall back to the remembered whitespace string when emptyTag
// is the empty string.
if (isEmpty(obj)) {
if (typeof _this.options.emptyTag === 'function') {
obj = _this.options.emptyTag();
} else {
obj = _this.options.emptyTag !== '' ? _this.options.emptyTag : emptyStr;
}
}
if (_this.options.validator != null) {
// The path is built from the "#name" of every element still on the stack,
// i.e. after the current element has already been popped.
xpath = "/" + function () {
var i, len, results;
results = [];
for (i = 0, len = stack.length; i < len; i++) {
results.push(stack[i]["#name"]);
}
return results.join("/");
}();
obj = _this.options.validator(xpath, s, obj);
}
if (s) {
// Children are stored on the parent under their tag name.
_this.assignOrPush(s, nodeName, obj);
} else {
// Root element closed: publish the result and mark the parse as ended.
_this.resultObject = obj;
_this.saxParser.ended = true;
return _this.emit("end", _this.resultObject);
}
};
}(this);
// Text handler, shared with the CDATA handler below. Returns the element the
// text was added to, or undefined when no element is open.
ontext = function (_this) {
return function (text) {
var charChild, s;
s = stack[stack.length - 1];
if (s) {
// Every text fragment of the open element is appended to one string, no
// matter which child elements sit between the fragments; the position of
// text relative to child elements is therefore not recorded here.
s[charkey] += text;
// Text is also recorded as an ordered "__text__" child only when
// explicitChildren, preserveChildrenOrder and charsAsChildren are all set
// (whitespace-only text additionally requires includeWhiteChars).
if (_this.options.explicitChildren && _this.options.preserveChildrenOrder && _this.options.charsAsChildren && (_this.options.includeWhiteChars || text.replace(/\n/g, '').trim() !== '')) {
s[_this.options.childkey] = s[_this.options.childkey] || [];
charChild = {
'#name': '__text__'
};
charChild[charkey] = text;
if (_this.options.normalize) {
charChild[charkey] = charChild[charkey].replace(/\s{2,}/g, " ").trim();
}
s[_this.options.childkey].push(charChild);
}
return s;
}
};
}(this);
this.saxParser.ontext = ontext;
// CDATA is handled like text, and additionally flags the element so that the
// close-tag handler keeps whitespace-only content.
return this.saxParser.oncdata = function (_this) {
return function (text) {
var s;
s = ontext(text);
if (s) {
return s.cdata = true;
}
};
}(this);
};
This code ensures that text nodes are closed properly and that the order of elements is maintained during parsing. Adjust the processAsync function to handle asynchronous processing correctly, ensuring that chunks are processed in the correct order [1].
To continue talking to Dosu, mention @dosu.