html icon indicating copy to clipboard operation
html copied to clipboard

get only text node

Open — insinfo opened this issue 11 months ago • 1 comment

I need to get only the text nodes of an HTML document. This is how I do it in PHP:

```php
<?php

/**
 * Truncates an HTML fragment to a maximum number of text characters while
 * keeping the remaining markup well-formed.
 *
 * Only the contents of text nodes count towards the limit; tags and
 * attributes are free. The text node that crosses the limit is cut and
 * suffixed with '...', and every node that follows it is removed.
 */
class Html
{
    /** @var bool True once the text budget has been used up. */
    protected $reachedLimit = false;

    /** @var int Number of text characters seen so far. */
    protected $totalLen = 0;

    /** @var DOMNode[] Nodes encountered after the limit was reached. */
    protected $toRemove = array();

    /**
     * Returns $html cut down to at most $maxLen characters of text.
     *
     * @param string $html   HTML fragment to shorten.
     * @param int    $maxLen Maximum number of text characters to keep.
     *
     * @return string The shortened markup as serialised by DOMDocument::saveHTML(),
     *                or an empty string when $html is empty.
     */
    public static function trim($html, $maxLen = 25)
    {
        $html = (string) $html;

        // DOMDocument::loadHTML() rejects an empty document, so answer directly.
        if ($html === '') {
            return '';
        }

        // loadHTML() does not read undeclared input as UTF-8. Turning every
        // non-ASCII character into a numeric entity makes the parser decode
        // UTF-8 input correctly. Input that is not valid UTF-8 is passed
        // through untouched and handled by the parser as before.
        // NOTE(review): entities are not decoded inside <script>/<style>;
        // confirm such content is not expected here.
        if (mb_check_encoding($html, 'UTF-8')) {
            $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
        }

        $dom = new DOMDocument();
        // NOIMPLIED / NODEFDTD stop libxml from wrapping the fragment in
        // <html><body> and from adding a doctype. These flags require
        // PHP >= 5.4, which is why no fallback for older versions is kept.
        $dom->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);

        $instance = new static();

        // Drop every node that was found after the limit was reached.
        foreach ($instance->walk($dom, $maxLen) as $child) {
            if ($child->parentNode !== null) {
                $child->parentNode->removeChild($child);
            }
        }

        return $dom->saveHTML();
    }

    /**
     * Walks $node and its descendants in document order, counting text and
     * truncating the text node that crosses the limit.
     *
     * @param DOMNode $node   Node to inspect.
     * @param int     $maxLen Maximum number of text characters to keep.
     *
     * @return DOMNode[] Nodes collected so far that must be removed.
     */
    protected function walk(DOMNode $node, $maxLen)
    {
        // Past the limit: mark the node for removal. Its children go with it,
        // so there is no need to descend.
        if ($this->reachedLimit) {
            $this->toRemove[] = $node;

            return $this->toRemove;
        }

        // Only text nodes carry text, so the splitting happens here.
        if ($node instanceof DOMText) {
            // Characters still available before this node is counted.
            $remaining = max(0, $maxLen - $this->totalLen);

            // nodeValue is always UTF-8, so count characters rather than bytes.
            $this->totalLen += mb_strlen($node->nodeValue, 'UTF-8');

            if ($this->totalLen > $maxLen) {
                $node->nodeValue = mb_substr($node->nodeValue, 0, $remaining, 'UTF-8') . '...';
                $this->reachedLimit = true;
            }
        }

        // If the node has children, walk them too.
        if (isset($node->childNodes)) {
            foreach ($node->childNodes as $child) {
                $this->walk($child, $maxLen);
            }
        }

        return $this->toRemove;
    }
}


// Demo input: four paragraphs of filler text wrapped in a single <div>.
$source = "<div><p><b>Lorem</b> ipsum dolor sit amet, consectetur adipisicing elit, 
            sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. </p><p>Ut 
            enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip 
            ex ea commodo consequat. </p><p>Duis aute irure dolor in reprehenderit in 
            voluptate velit esse cillum dolore eu fugiat nulla pariatur. </p><p>Excepteur 
            sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit 
            anim id est laborum.</p></div>";

// Keep only the first 20 characters of text and print the shortened markup.
$str = Html::trim($source, 20);
echo $str;
// Expected output: <div><p><b>Lorem</b> ipsum dolor si...</p></div>

```

insinfo avatar Aug 11 '23 18:08 insinfo