abotx icon indicating copy to clipboard operation
abotx copied to clipboard

Javascript rendering - detecting window.location changes

Open replaysMike opened this issue 2 years ago • 3 comments

What would be your recommended way of dealing with window.location changes on the page? I'm crawling sites that have a method that looks something like the following, probably to break crawlers:

// Schedules a delayed JavaScript redirect: three seconds after being called,
// the window is pointed at the target page.
function iframeOnLoad() {
  var redirectDelayMs = 3000;
  var reqUrl = 'https://domain.com/page_i_want';

  function redirect() {
    window.location = reqUrl;
  }

  setTimeout(redirect, redirectDelayMs);
}

<iframe onload="iframeOnLoad()" />

Assuming PhantomJS is rendering this, is it possible to detect URL changes when window.location is set via JS? I could maybe write some custom addons, but I'm not sure if this is already handled somehow.

replaysMike avatar Mar 30 '22 20:03 replaysMike

I think I've got a starting point. If I modify the phantomJs RunScript in RunPhantomJsWithStaticWaitTime():

/**
 * Logs every argument it receives as an indexed, JSON-serialized line,
 * followed by one empty line.
 */
function printArgs() {
    var received = Array.prototype.slice.call(arguments);
    received.forEach(function (value, index) {
        console.log('    arguments[' + index + '] = ' + JSON.stringify(value));
    });
    console.log('');
}
// Trace each navigation request: print a marker line, then dump whatever
// arguments the handler was invoked with.
page.onNavigationRequested = function () {
    var navigationArgs = arguments;
    console.log('onNavigationRequested');
    printArgs.apply(this, navigationArgs);
};

the URL shows up in the rendered javascript output as long as I set the JavascriptRenderingWaitTimeInMilliseconds long enough. I should be able to wire in some detection somewhere in here by changing the console.log entries to output anchor href links.

replaysMike avatar Mar 30 '22 20:03 replaysMike

Here is what I settled on, not sure if you want to include it in AbotX2 default implementation:

// Load the 'system' module (presumably PhantomJS's built-in module of that name).
var system = require('system');
// NOTE(review): the remainder of this statement was omitted from the posted
// snippet; `page` and `input` are presumably defined in the excluded part.
var inputData = system/*......code snippet excluded......*/
// Apply the user agent supplied in the input to the page settings.
page.settings.userAgent = input.UserAgent;

// JS redirect detection: remember every navigation target that is neither the
// blank page nor the URL being rendered.
var detectedUrls = [];
page.onNavigationRequested = function () {
    if (arguments.length === 0) {
        return;
    }

    var requestedUrl = arguments[0];
    if (requestedUrl === 'about:blank' || requestedUrl === input.Url) {
        return;
    }

    detectedUrls.push(requestedUrl);
};

// The handler is assigned before the content is set on the page.
page.setContent(input.RawHtml, input.Url);
                        
/**
 * Escapes a value so it can be embedded inside a double-quoted HTML attribute.
 * @param {*} value - Value to escape; coerced to a string first.
 * @returns {string} The escaped text.
 */
function escapeHtmlAttribute(value) {
    return String(value)
        .replace(/&/g, '&amp;')
        .replace(/"/g, '&quot;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
}

/**
 * Builds one "Redirect" anchor per URL, with a quoted and escaped href so that
 * URLs containing spaces, quotes, '>' or '&' still yield well-formed markup.
 * @param {Array} urls - URLs to turn into anchors.
 * @returns {string} The concatenated anchor markup.
 */
function buildRedirectLinks(urls) {
    var links = [];
    for (var i = 0; i < urls.length; i++) {
        links.push('<a href="' + escapeHtmlAttribute(urls[i]) + '">Redirect</a>');
    }
    return links.join('');
}

/**
 * Inserts the link markup before each closing body tag, or wraps it in a body
 * before each closing html tag, or appends it when neither tag is present.
 * @param {string} content - Rendered page markup.
 * @param {string} linksHtml - Anchor markup to inject.
 * @returns {string} The markup with the links added.
 */
function injectRedirectLinks(content, linksHtml) {
    // Replacer functions are used instead of replacement strings so that '$'
    // sequences inside a URL (e.g. "$&", "$1") are inserted literally rather
    // than being interpreted as replacement patterns.
    if (/<\/body>/i.test(content)) {
        return content.replace(/<\/body>/gi, function () {
            return linksHtml + '</body>';
        });
    }
    if (/<\/html>/i.test(content)) {
        return content.replace(/<\/html>/gi, function () {
            return '<body>' + linksHtml + '</body></html>';
        });
    }
    return content + linksHtml;
}

// After the configured wait, print the page content (with an anchor added for
// every detected redirect URL) and exit.
window.setTimeout(function () {
    var content = page.content;
    if (detectedUrls && detectedUrls.length > 0) {
        content = injectRedirectLinks(content, buildRedirectLinks(detectedUrls));
    }

    console.log(content);
    phantom.exit();
}, input.WaitToRenderTimeInMilliseconds);

replaysMike avatar Mar 30 '22 21:03 replaysMike

Hi, I appreciate you circling back to share your solution. It does seem like a reasonable approach. I don't think it's something we should add back to the main solution, but let's leave this up for others to find in case they have similar needs.

sjdirect avatar Mar 30 '22 23:03 sjdirect