Use sitemap actor to crawl... sitemaps

Hi!

With the Cheerio sitemap.xml crawler actor, it's possible to extract all the URLs in a sitemap.xml and then crawl them. In my use case it works fine, but I have approx. 600 different sitemaps (each with URLs) that I need to crawl.

I found out that the website has a sitemapindex.xml where all the sitemap.xml links reside.
Like this:

www.example.com/sitemapindex.xml
-> inside are all the sitemap.xml URLs, like this:
<sitemap>
<loc> https://www.example.com/page1/sitemap.xml</loc>
</sitemap>
<sitemap>
<loc> https://www.example.com/page2/sitemap.xml</loc>
</sitemap>
<sitemap>
<loc> https://www.example.com/another-page/sitemap.xml</loc>
</sitemap>
…etc

In the sitemap index there are links to sitemaps like “another-page” that I don’t need to crawl, but is there any way to change the source code of the actor to handle this?

Have a great day

Simon

Does anybody have an idea how to accomplish this?
I believe it would have something to do with the part below (it works for a single XML; what I would like is to grab all the XMLs listed under ‘https://singlesite.sitemap.xml’).

Alternatively, how could I change the source code below to accept multiple .xml URLs? (I’ve put a rough sketch of what I mean after the code.)

Thanks

const Apify = require('apify');
const cheerio = require('cheerio');
const requestPromised = require('request-promise-native');

Apify.main(async () => {
    // Download sitemap
    const xml = await requestPromised({
        url: 'https://singlesite.sitemap.xml',
        headers: {
            'User-Agent': 'curl/7.54.0'
        }
    });
    
    // Parse sitemap and create RequestList from it
    const $ = cheerio.load(xml);
    const sources = [];
    $('loc').each(function () {
        const url = $(this).text().trim();
        sources.push({
            url,
            headers: {
                // NOTE: Otherwise the target site doesn't allow downloading the page!
                'User-Agent': 'curl/7.54.0',
            }
        });
    });

    const requestList = new Apify.RequestList({
        sources,
    });
    await requestList.initialize();
    
    // Crawl each page from sitemap
    const crawler = new Apify.CheerioCrawler({
        requestList,
        handlePageFunction: async ({ $, request }) => {
            console.log(`Processing ${request.url}...`);
            
            await Apify.pushData({
                url: request.url
                //some scraping
            });
        },
    });

    await crawler.run();
    console.log('Done.');
});
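
For what it’s worth, this is roughly the change I have in mind (just an untested sketch with placeholder URLs): download each sitemap in a loop and collect all the <loc> entries into one sources array before building the RequestList.

const Apify = require('apify');
const cheerio = require('cheerio');
const requestPromised = require('request-promise-native');

Apify.main(async () => {
    // Sketch only: a hard-coded list of sitemap URLs (in reality ~600 of them)
    const sitemapUrls = [
        'https://www.example.com/page1/sitemap.xml',
        'https://www.example.com/page2/sitemap.xml',
        // ...
    ];

    const sources = [];
    for (const sitemapUrl of sitemapUrls) {
        // Download one sitemap
        const xml = await requestPromised({
            url: sitemapUrl,
            headers: { 'User-Agent': 'curl/7.54.0' },
        });

        // Collect every <loc> entry from it
        const $ = cheerio.load(xml, { xmlMode: true });
        $('loc').each(function () {
            sources.push({
                url: $(this).text().trim(),
                headers: { 'User-Agent': 'curl/7.54.0' },
            });
        });
    }

    // Build the RequestList from all collected URLs, then run the
    // CheerioCrawler exactly as in the code above.
    const requestList = new Apify.RequestList({ sources });
    await requestList.initialize();
    console.log(`Collected ${sources.length} URLs from ${sitemapUrls.length} sitemaps.`);
});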

Hi @simon,

I think the best way to get all the URLs from the sitemaps is to use a BasicCrawler.
You can extract all the URLs and enqueue newly found sitemaps into the request queue.
You can check my code:

const Apify = require('apify');
const cheerio = require('cheerio');
const requestPromised = require('request-promise-native');

Apify.main(async () => {
    const requestQueue = await Apify.openRequestQueue();

    await requestQueue.addRequest({ url: 'https://singlesite.sitemap.xml' });

    // Crawl each page from sitemap
    const crawler = new Apify.BasicCrawler({
        requestQueue,
        handleRequestFunction: async ({ request }) => {
            const xml = await requestPromised({
                url: request.url,
                headers: {
                    'User-Agent': 'curl/7.54.0',
                },
            });
            const $ = cheerio.load(xml, { xmlMode: true });
            const sitemapUrls = [];
            const siteUrls = [];

            // Pick all page URLs from the sitemap (<url> entries)
            $('url').each(function () {
                const url = $(this).find('loc').text().trim();
                siteUrls.push(url);
            });

            // Pick all nested sitemap URLs (<sitemap> entries in a sitemap index)
            $('sitemap').each(function () {
                const url = $(this).find('loc').text().trim();
                sitemapUrls.push(url);
            });

            for (const sitemapUrl of sitemapUrls) {
                // Enqueue each nested sitemap so it gets processed too
                await requestQueue.addRequest({ url: sitemapUrl });
            }

            await Apify.pushData({
                url: request.url,
                siteUrls,
            });
        },
    });

    await crawler.run();
    console.log('Done.');
});
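
One more note on the sitemaps like “another-page” that you don’t want to crawl: you can filter the nested sitemap URLs before enqueueing them. The regex below is only an example pattern (a guess based on your sample index), so adjust it to the sitemaps you actually need:

// Example filter: skip sitemaps whose URL matches an unwanted pattern.
// The regex is just an illustration; replace it with your own rules.
const shouldCrawlSitemap = (url) => !/\/another-page\//.test(url);

Then, inside handleRequestFunction, enqueue only the sitemaps that pass the filter:

for (const sitemapUrl of sitemapUrls) {
    if (!shouldCrawlSitemap(sitemapUrl)) continue; // skip unwanted sitemaps
    await requestQueue.addRequest({ url: sitemapUrl });
}

The collected siteUrls end up in the dataset, so you can feed them into a CheerioCrawler afterwards (or push them into another request queue) for the actual scraping.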