abotx
abotx copied to clipboard
Javascript rendering - detecting window.location changes
What would be your recommended way of dealing with window.location changes on the page? I'm crawling sites that have a method that looks something like the following probably to break crawlers:
// Example of the redirect pattern seen on the crawled sites: when invoked,
// wait three seconds and then send the browser to the real page via script.
function iframeOnLoad() {
    var redirectDelayMs = 3000;

    // Assigning window.location is what triggers the script-driven navigation.
    function redirectToRequestedPage() {
        window.location = 'https://domain.com/page_i_want';
    }

    setTimeout(redirectToRequestedPage, redirectDelayMs);
}
<iframe onload="iframeOnLoad()" />
Assuming PhantomJS is rendering this, is it possible to detect URL changes when window.location is set via JS? I could maybe write some custom add-ons, but I'm not sure if this is already handled somehow.
I think I've got a starting point. If I modify the PhantomJS RunScript in RunPhantomJsWithStaticWaitTime() like this:
/**
 * Debug helper: writes each argument it receives to the console on its own
 * line, labelled with its position and serialized with JSON.stringify, then
 * writes an empty line as a separator.
 */
function printArgs() {
    var received = Array.prototype.slice.call(arguments);

    received.forEach(function (value, position) {
        console.log(' arguments[' + position + '] = ' + JSON.stringify(value));
    });

    console.log('');
}
// Debug hook: announce each navigation request on the console, then dump
// every argument the callback was invoked with via printArgs.
page.onNavigationRequested = function () {
    var callbackArgs = Array.prototype.slice.call(arguments);

    console.log('onNavigationRequested');
    printArgs.apply(this, callbackArgs);
};
then the URL shows up in the rendered JavaScript output, as long as I set JavascriptRenderingWaitTimeInMilliseconds long enough. I should be able to wire in some detection somewhere in here by changing the console.log entries to output anchor href links.
Here is what I settled on. I'm not sure if you want to include it in the AbotX2 default implementation:
var system = require('system');
var inputData = system/*......code snippet excluded......*/
page.settings.userAgent = input.UserAgent;
// JS redirect detection: remember every URL the page asks to navigate to,
// skipping the blank placeholder document and the page being rendered.
var detectedUrls = [];
page.onNavigationRequested = function () {
    // Nothing to record when the callback is invoked without arguments.
    if (arguments.length === 0) {
        return;
    }

    // The first callback argument is treated as the requested URL.
    var requestedUrl = arguments[0];
    if (requestedUrl === 'about:blank' || requestedUrl === input.Url) {
        return;
    }

    detectedUrls.push(requestedUrl);
};
/**
 * Escapes a value so it can be placed inside a double-quoted HTML attribute.
 *
 * @param {*} value - Value to escape; it is converted to a string first.
 * @returns {string} The value with &, ", < and > replaced by HTML entities.
 */
function escapeHtmlAttribute(value) {
    return String(value)
        .replace(/&/g, '&amp;')
        .replace(/"/g, '&quot;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
}

/**
 * Builds one anchor element per URL so a link extractor reading the rendered
 * markup can discover the script-driven redirects.
 *
 * @param {Array} urls - URLs captured from navigation requests.
 * @returns {string} Concatenated '<a href="...">Redirect</a>' markup.
 */
function buildRedirectLinks(urls) {
    var links = [];
    for (var i = 0; i < urls.length; i++) {
        // Quote and escape the URL: an unquoted attribute value is cut short
        // by whitespace or '>' and corrupted by quotes.
        links.push('<a href="' + escapeHtmlAttribute(urls[i]) + '">Redirect</a>');
    }
    return links.join('');
}

/**
 * Inserts redirect links into the rendered markup: before every closing body
 * tag when one exists, otherwise in a new body before every closing html tag,
 * otherwise at the very end of the content.
 *
 * @param {string} content - Rendered page markup.
 * @param {Array} urls - URLs captured from navigation requests.
 * @returns {string} The markup with links added, or the content unchanged
 *     when there are no URLs.
 */
function appendRedirectLinks(content, urls) {
    if (!urls || urls.length === 0) {
        return content;
    }

    var links = buildRedirectLinks(urls);

    // Function replacers are used on purpose: with a string replacement,
    // sequences such as "$&" or "$1" inside a URL would be expanded as
    // replacement patterns and corrupt the output.
    if (/<\/body>/i.test(content)) {
        return content.replace(/<\/body>/gi, function () {
            return links + '</body>';
        });
    }
    if (/<\/html>/i.test(content)) {
        return content.replace(/<\/html>/gi, function () {
            return '<body>' + links + '</body></html>';
        });
    }
    return content + links;
}

// Load the raw HTML under its original URL, give the page's scripts the
// configured time to run, then print the markup (plus any detected redirect
// links) to stdout and exit.
page.setContent(input.RawHtml, input.Url);
window.setTimeout(function () {
    console.log(appendRedirectLinks(page.content, detectedUrls));
    phantom.exit();
}, input.WaitToRenderTimeInMilliseconds);
Hi, I appreciate you circling back to share your solution. It does seem like a reasonable approach. I don't think it's something we should add back to the main solution, but let's leave this up for others to find in case they have similar needs.