tools
tools copied to clipboard
get only text node
I need to get only the text nodes of an HTML document. I do it like this in PHP:
<?php
/**
 * Truncates an HTML fragment to a maximum number of text characters while
 * keeping the surrounding markup well-formed.
 *
 * The fragment is parsed into a DOM tree and walked in document order. Text
 * node lengths are accumulated; the text node that crosses the limit is cut
 * and suffixed with "...", and every node visited after it is removed.
 */
class Html
{
    /** @var bool True once the text limit has been exceeded. */
    protected $reachedLimit = false;

    /** @var int Number of text characters counted so far. */
    protected $totalLen = 0;

    /** @var DOMNode[] Nodes visited after the limit was reached. */
    protected $toRemove = [];

    /**
     * Returns $html cut down to at most $maxLen characters of text.
     *
     * @param string $html   HTML fragment, expected to be UTF-8.
     * @param int    $maxLen Maximum number of text characters to keep.
     *
     * @return string The serialized, truncated markup as produced by
     *                DOMDocument::saveHTML(); an empty string for empty input.
     */
    public static function trim($html, $maxLen = 25)
    {
        $html = (string) $html;
        if ($html === '') {
            // loadHTML() rejects an empty source, and there is nothing to trim.
            return '';
        }

        // loadHTML() does not treat undeclared input as UTF-8. Turning every
        // non-ASCII character into a numeric character reference makes the
        // parser input pure ASCII, so the decoded text nodes are correct
        // whatever default encoding the parser assumes.
        $html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');

        $dom = new DOMDocument();
        // The flags keep the parser from adding a doctype and html/body wrappers.
        $dom->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);

        $instance = new static();
        $toRemove = $instance->walk($dom, $maxLen);

        // Drop every node that was visited after the limit was reached.
        foreach ($toRemove as $child) {
            if ($child->parentNode !== null) {
                $child->parentNode->removeChild($child);
            }
        }

        // The former "PHP < 5.4" wrapper-stripping branch is gone: the
        // LIBXML_HTML_* flags used above need PHP 5.4+, so it could never run.
        return $dom->saveHTML();
    }

    /**
     * Walks $node and its descendants, counting text and truncating at the limit.
     *
     * @param DOMNode $node   Node to visit.
     * @param int     $maxLen Maximum number of text characters to keep.
     *
     * @return DOMNode[] Every node queued for removal so far.
     */
    protected function walk(DOMNode $node, $maxLen)
    {
        if ($this->reachedLimit) {
            // Past the cut: queue the node and skip its subtree, which
            // disappears together with it.
            $this->toRemove[] = $node;

            return $this->toRemove;
        }

        // Only text nodes carry text, so the splitting happens here.
        if ($node instanceof DOMText) {
            // DOM strings are UTF-8; count and cut by character, not by byte,
            // so a multibyte character is never split in half.
            $nodeLen = mb_strlen($node->nodeValue, 'UTF-8');
            $this->totalLen += $nodeLen;

            if ($this->totalLen > $maxLen) {
                // Clamp at zero so a negative $maxLen cannot turn into a
                // negative length, which would trim from the end instead.
                $keep = max(0, $nodeLen - ($this->totalLen - $maxLen));
                $node->nodeValue = mb_substr($node->nodeValue, 0, $keep, 'UTF-8') . '...';
                $this->reachedLimit = true;
            }
        }

        if ($node->hasChildNodes()) {
            foreach ($node->childNodes as $child) {
                $this->walk($child, $maxLen);
            }
        }

        return $this->toRemove;
    }
}
// Demo input: a single <div> wrapping four paragraphs of placeholder text.
$str = "<div><p><b>Lorem</b> ipsum dolor sit amet, consectetur adipisicing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. </p><p>Ut
enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip
ex ea commodo consequat. </p><p>Duis aute irure dolor in reprehenderit in
voluptate velit esse cillum dolore eu fugiat nulla pariatur. </p><p>Excepteur
sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit
anim id est laborum.</p></div>";

// Keep the first 20 characters of text and write the resulting markup out.
$str = Html::trim($str, 20);
echo $str;
//result <div><p><b>Lorem</b> ipsum dolor si...</p></div>
```
I implemented it in Dart like this, but the behavior is wrong:
// Dart port of the PHP trimmer: meant to shorten an HTML fragment to `limit`
// characters of text, append "..." at the cut, and drop the nodes after it.
// NOTE(review): the author reports that this version misbehaves; the comments
// below mark the places where it departs from a full node-by-node walk.
class HtmlTrim {
// Becomes true once the running text length has gone past the limit.
bool reachedLimit = false;
// Running total of the text lengths counted so far.
int totalLen = 0;
// Nodes collected by _walk after the limit was reached.
List<html.Node> toRemove = [];
// Parses htmlString as a fragment, walks it with a fresh HtmlTrim instance,
// applies the removals and returns the fragment's outerHtml.
static String trim(String htmlString, {int limit = 25}) {
final dom = html.parseFragment(htmlString);
var instance = HtmlTrim();
var toRemove = instance._walk(dom, limit);
// remove any nodes that exceed limit
for (var child in toRemove) {
// NOTE(review): this detaches the PARENT of each collected node, not the
// collected node itself.
child.parentNode?.remove();
}
return dom.outerHtml;
}
// Visits node and recurses into node.children; returns the shared toRemove list.
List<html.Node> _walk(html.Node node, int maxLen) {
if (reachedLimit) {
// Past the limit: only the node's first child (if it has one) is queued,
// and the node's subtree is not visited any further.
if (node.firstChild != null) {
toRemove.add(node.firstChild!);
}
} else {
// only text nodes should have text,
// so do the splitting here
// NOTE(review): text is looked for only in node.firstChild; any other text
// child of this node is not examined here.
if (node.firstChild?.nodeType == html.Node.TEXT_NODE) {
var nodeText = node.firstChild!;
if (nodeText.text != null) {
var nodeLen = nodeText.text!.length;
this.totalLen += nodeLen;
if (this.totalLen > maxLen) {
// Keep the part of this text that still fits, then mark the cut with "...".
nodeText.text = nodeText.text!
.substring(0, nodeLen - (this.totalLen - maxLen)) +
'...';
this.reachedLimit = true;
}
}
}
// if node has children, walk its child elements
// NOTE(review): the reply in this thread states that `children` holds element
// nodes only, so text nodes are never passed to _walk themselves.
if (node.children.isNotEmpty) {
for (var child in node.children) {
_walk(child, maxLen);
}
}
}
return this.toRemove;
}
}
// Sample fragment: a single <div> wrapping four paragraphs of placeholder text.
final html =
    '''<div><p><b>Lorem</b> ipsum dolor sit amet, consectetur adipisicing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. </p><p>Ut
enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip
ex ea commodo consequat. </p><p>Duis aute irure dolor in reprehenderit in
voluptate velit esse cillum dolore eu fugiat nulla pariatur. </p><p>Excepteur
sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit
anim id est laborum.</p></div>''';
// Trim to 5 characters of text and print the resulting markup.
final trim = HtmlTrim.trim(html, limit: 5);
print('main: $trim');
// dart .\bin\teste_html_trim.dart
//Result: <div><p><b>Lorem</b> ipsum dolor sit amet, consectetur adipisicing elit,
// sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. </p><p>...</p></div>
Hi @insinfo
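The issue arises from combining the children property with firstChild. The children property returns only element nodes and excludes text nodes, so the recursion never visits a text node directly; instead, the code inspects only the firstChild of each element it reaches. As a result, any text node that is not the first child of its element (e.g., the text that follows </b> inside the first <p>) is never counted or truncated, leading to skipped content. In addition, the removal loop calls child.parentNode?.remove(), which detaches the parent of each collected node rather than the node itself.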
To fix this, the nodes property should be used instead of children. The nodes property includes all child nodes (both text and element nodes), ensuring no content is missed. Here's the updated code:
/// Truncates an HTML fragment to a maximum number of text characters while
/// keeping the surrounding markup intact.
///
/// The fragment is walked in document order. Text node lengths are
/// accumulated; the text node that crosses the limit is cut and suffixed with
/// "...", and every node visited after it is removed from the tree.
class HtmlTrim {
  /// True once the accumulated text length has exceeded the limit.
  bool reachedLimit = false;

  /// Number of text characters (Unicode code points) counted so far.
  int totalLen = 0;

  /// Nodes visited after the limit was reached; they are detached by [trim].
  List<Node> toRemove = [];

  /// Parses [htmlString] as a fragment and returns its markup cut down to at
  /// most [limit] characters of text.
  ///
  /// A negative [limit] is treated as zero.
  static String trim(String htmlString, {int limit = 25}) {
    final fragment = parseFragment(htmlString);
    final walker = HtmlTrim();

    // A negative budget would otherwise produce a negative cut position.
    final budget = limit < 0 ? 0 : limit;

    // Remove every node that was visited after the limit was reached.
    for (final node in walker._walk(fragment, budget)) {
      node.remove();
    }
    return fragment.outerHtml;
  }

  /// Visits [node] and all of its child nodes (text and elements alike),
  /// truncating at [maxLen] characters. Returns the shared [toRemove] list.
  List<Node> _walk(Node node, int maxLen) {
    if (reachedLimit) {
      // Past the cut: queue the node and skip its subtree, which is detached
      // together with it.
      toRemove.add(node);
      return toRemove;
    }

    // Only text nodes carry text, so the splitting happens here.
    if (node.nodeType == Node.TEXT_NODE) {
      final text = node.text;
      if (text != null) {
        // Count and cut by code point rather than by UTF-16 code unit, so a
        // surrogate pair (e.g. an emoji) is never split in half.
        final runes = text.runes.toList();
        totalLen += runes.length;
        if (totalLen > maxLen) {
          final keep = runes.length - (totalLen - maxLen);
          node.text = '${String.fromCharCodes(runes.take(keep))}...';
          reachedLimit = true;
        }
      }
    }

    // `nodes` contains every child node, so nothing is skipped.
    for (final child in node.nodes) {
      _walk(child, maxLen);
    }
    return toRemove;
  }
}