Puppeteer Crawler not following redirects

#1

Hi,

Does the Puppeteer crawler automatically follow redirects? I’m having trouble with the crawler crashing when it gets to a link that has multiple redirects. From everything I’ve read about the Puppeteer crawler, it should follow redirects automatically.

For my actor, I’m using an initial Puppeteer instance to log in and get the cookies, then having a new Puppeteer instance start the crawl. Here’s a part of my code:

const _ = require('underscore');
const Apify = require('apify');
const utils = require('apify-shared/utilities');

// This function normalizes the URL and removes the #fragment
const normalizeUrl = (url) => {
    const nurl = utils.normalizeUrl(url);
    if (nurl) return nurl;

    const index = url.indexOf('#');
    if (index > 0) return url.substring(0, index);

    return url;
};

Apify.main(async () => {
    // Fetch input
    const input = await Apify.getValue('INPUT');
    console.log('Input:');
    console.dir(input);

    const baseUrl = normalizeUrl(input.baseUrl);

    console.log('Launching Puppeteer...');
    const browser = await Apify.launchPuppeteer();
    
    console.log(`Opening URL: ${baseUrl}`);
    const page = await browser.newPage();
    await page.goto(baseUrl);
    await page.type('#userName', 'xxxxxxxx');
    await page.type('#password', 'xxxxxxxx');

    // Click the login button and wait for the navigation it triggers in parallel;
    // clicking first and only then calling waitForNavigation() can miss a fast navigation.
    await Promise.all([
        page.waitForNavigation(),
        page.click('.d2l-button'),
    ]);

    const loginURL = page.url();
    console.log(loginURL);

    // Grab the session cookies so the crawler's pages can reuse the login.
    const cookiesObject = await page.cookies();
    const content = JSON.stringify(cookiesObject);

    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest({ url: loginURL });

    const purlBase = new Apify.PseudoUrl(`${loginURL}[(|/.*)]`);

    console.log(`Starting crawl of ${loginURL}`);

    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,
        maxRequestsPerCrawl: input.maxPages,
        maxRequestRetries: 3,
        maxConcurrency: input.maxConcurrency,
        puppeteerPoolOptions: {
            recycleDiskCache: true
        },
        gotoFunction: async ({ page, request }) => {
            // Re-inject the login session cookies into each new page before navigating.
            const cookiesArr = JSON.parse(content);
            if (cookiesArr.length !== 0) {
                for (const cookie of cookiesArr) {
                    await page.setCookie(cookie);
                }
                console.log('Session has been loaded in the browser');
            }
            const response = await page.goto(request.url);
            /* Earlier debugging attempt: inspect the redirect chain of the
               response (kept commented out for reference):
            const redirects = response.request().redirectChain();
            for (const redirect of redirects) {
                console.log(redirect.response().status(), redirect.frame().url());
            }
            */
            return response;
        },
        handlePageFunction: async ({ request, page, response }) => {
            const url = normalizeUrl(request.url);
            console.log(`Analysing page: ${url}`);
            const record = {
                url,
                isBaseWebsite: false,
                httpStatus: response.status(),
                title: await page.title(),
                linkUrls: null,
                anchors: null,
            };
            if (response.status() !== 200) {
                console.log('ALERT');
                console.dir(request);
                console.dir(record);
                console.dir(response);
            }
            // If we're on the base website, find links to new pages and enqueue them
            if (purlBase.matches(url)) {
                record.isBaseWebsite = true;
                console.log(`[${url}] Enqueuing links`);
                const infos = await Apify.utils.enqueueLinks({
                    page,
                    requestQueue,
                    selector: 'a:not([href^="mailto"]):not([href^="javascript"])',
                });
                let links = _.map(infos, (info) => info.request.url).sort();
                record.linkUrls = _.uniq(links, true);
            }

            // Find all HTML element IDs and <a name="xxx"> anchors,
            // basically anything that can be addressed by a #fragment.
            record.anchors = await page.evaluate(() => {
                const anchors = [];
                document.querySelectorAll('body a[name]').forEach((elem) => {
                    const name = elem.getAttribute('name');
                    if (name) anchors.push(name);
                });
                document.querySelectorAll('body [id]').forEach((elem) => {
                    const id = elem.getAttribute('id');
                    if (id) anchors.push(id);
                });
                return anchors;
            });
            record.anchors.sort();
            record.anchors = _.uniq(record.anchors, true);

            // Save results
            await Apify.pushData(record);
        },

        // This function is called if the page processing failed more than maxRequestRetries+1 times.
        handleFailedRequestFunction: async ({ request }) => {
            const url = normalizeUrl(request.url);
            console.log(`Page failed ${request.retryCount + 1} times, giving up: ${url}`);

            await Apify.pushData({
                url,
                httpStatus: null,
                errorMessage: _.last(request.errorMessages) || 'Unknown error',
            });
        },
    });

    
    await crawler.run();
});
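
In case it helps with debugging, here's how I've been trying to see what the redirect chain actually contains, inside the gotoFunction above. A rough sketch against the Puppeteer API as I understand it: page.goto() should resolve with the final response after all redirects, and reading the location header is my assumption about what each redirect response carries.

const response = await page.goto(request.url);

// Each entry in the redirect chain is a request that got redirected
// on the way to the final response.
for (const hop of response.request().redirectChain()) {
    const redirectResponse = hop.response();
    console.log(`${redirectResponse.status()} ${hop.url()} -> ${redirectResponse.headers().location}`);
}
console.log(`Final: ${response.status()} ${response.url()}`);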

Any ideas?

Thanks,
Wilfred

#2

Hi Wilfred!

At which point does it crash? When you call page.goto()? Could you share the log from the actor (mainly the error)?

Cheers,
Marek

#3

Hi Marek,

I managed to get this fixed by adding:

    await page.setRequestInterception(true);
    page.on('request', (interceptedRequest) => {
        console.log('GOT NEW REQUEST', interceptedRequest.url());
        interceptedRequest.continue();
    });

to the handlePageFunction option.
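
For anyone who hits the same thing, here's a rough sketch of where the handler ended up in context. My assumption is that interception has to be enabled before page.goto() runs, which in PuppeteerCrawler means gotoFunction rather than handlePageFunction:

const crawler = new Apify.PuppeteerCrawler({
    // ...same options as above...
    gotoFunction: async ({ page, request }) => {
        // Enable interception before navigating so the redirect
        // requests themselves pass through the handler.
        await page.setRequestInterception(true);
        page.on('request', (interceptedRequest) => {
            console.log('GOT NEW REQUEST', interceptedRequest.url());
            interceptedRequest.continue();
        });
        return page.goto(request.url);
    },
});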

Thanks for the help.