Associate a tag for each crawled url

#1

In a Google Sheet I have a list of urls, each with a tag that corresponds to one of my clients. In my current setup I use Integromat to start my crawler once per url, and I can then filter the results based on the tag when each crawl is done.

But I would like to simplify so that I just run the whole list of urls in one go, but I still need to get the tags associated with each url somehow.

Is this possible?

#2

Hi @simon,

I think it is possible using a label for each of the startUrls (the "key" field below).
You can start crawler with:

"startUrls": [
  {
    "key": "client1_tag",
    "value": "http://example.com"
  },
  {
    "key": "client2_tag",
    "value": "http://example.com"
  }
],

In the page function you can get the label of the current request with context.request.label

#3

Hi drobnikj,

That looks like the right solution. I made a test where I just added my client tags directly in the url (as “&tag=client1_tag” etc.), but that could easily be blocked.

I’m using the code below, and I’m not sure how to get the associated labels from my Google Sheet, or where I should put the “context.request.label”. Could you point me in the right direction? :slight_smile:

/**
 * Crawler page function with two modes, selected by the current request's label.
 *
 * - Label "START": reads every sheet (1..NUMBER_OF_SHEETS) of a public Google
 *   spreadsheet through its JSON list feed, passes each row's title to
 *   context.enqueuePage(), and calls context.finish() after the last sheet.
 *   context.skipOutput() is called for this request.
 * - Any other label: polls every 500 ms (for at most 10 s) until an element
 *   matching '.ci--property' exists, then saves a snapshot and finishes with
 *   one { result1, result2 } object per '.myclass' element.
 *
 * @param {Object} context Crawler context; this function uses its jQuery,
 *   request.label, enqueuePage, skipOutput, willFinishLater, saveSnapshot and
 *   finish members.
 */
function pageFunction(context) {
    var $ = context.jQuery;

    if (context.request.label === "START") {
        var SPREADSHEET_ID = "mysecret spreadsheet";
        var NUMBER_OF_SHEETS = 1;

        // Loads sheet number `id`, enqueues its rows, then moves on to the
        // next sheet or finishes when the last one has been processed.
        var loadData = function(id) {
            var urlAPI = "https://spreadsheets.google.com/feeds/list/" + SPREADSHEET_ID + "/" + id + "/public/values?alt=json";
            $.get(urlAPI, function(data) {
                var entries = data.feed.entry;
                $.each(entries, function(index, value) {
                    // The row's title cell holds the URL to crawl.
                    var url = value.title.$t;
                    context.enqueuePage(url);
                });
                if (id === NUMBER_OF_SHEETS) {
                    context.finish();
                } else {
                    loadData(id + 1);
                }
            }).fail(function() {
                // Without this handler a failed request would never reach
                // context.finish(), leaving the START request pending.
                context.finish("Failed to load sheet " + id + " of the spreadsheet feed");
            });
        };

        // Announce the deferred finish before the request is started, so the
        // ordering does not depend on when the response callback runs.
        context.skipOutput();
        context.willFinishLater();
        loadData(1);
        return;
    }

    var startedAt = Date.now();

    // Polls for the awaited element and extracts the results once it exists.
    var extractData = function() {
        // Give up after 10 seconds. Note the message names #my_element while
        // the selector polled below is '.ci--property'.
        if (Date.now() - startedAt > 10000) {
            context.finish("Timed out before #my_element was loaded");
            return;
        }

        // Not rendered yet: check again in half a second.
        if ($('.ci--property').length === 0) {
            setTimeout(extractData, 500);
            return;
        }

        context.saveSnapshot();

        var result = [];
        $('.myclass').each(function() {
            result.push({
                result1: $(this).find(".class").text().trim(),
                result2: $(this).find(".class2").text().trim()
            });
        });
        context.finish(result);
    };

    context.willFinishLater();
    extractData();
}

#4

I think you need to get the labels from the sheet by updating the loadData function:

var loadData = function(id) {
    // JSON list feed of sheet number `id` in the public spreadsheet.
    var feedUrl = "https://spreadsheets.google.com/feeds/list/" + SPREADSHEET_ID + "/" + id + "/public/values?alt=json";

    // Enqueues one row: the title cell is the URL, the `label` field is the tag.
    // Swap `label` for whichever field of the feed entry carries your tag.
    function enqueueRow(index, row) {
        var pageUrl = row.title.$t;
        var pageLabel = row.label.$t;
        context.enqueuePage({ url: pageUrl, label: pageLabel });
    }

    // Handles one sheet's response, then continues with the next sheet or
    // finishes once the last sheet has been processed.
    function onSheetLoaded(data) {
        $.each(data.feed.entry, enqueueRow);

        if (id !== NUMBER_OF_SHEETS) {
            loadData(id + 1);
            return;
        }
        context.finish();
    }

    $.get(feedUrl, onSheetLoaded);
};

You need to specify your column name in var label = value.label.$t;

I hope it helps

#5

Hi drobnikj,
I got it to work with your help, so big thanks! Sorry about not thanking you earlier :wink:

The final solution did need a bit of tweaking, namely:
var label = value.label.$t; did not get any results, but by checking the Google Sheet JSON I figured it should be:
var label = value.content.$t;

As I wanted the label as a part of the output for each item, I used

result.push({
myresult : $(this).find(".myitem").text().trim(),
label : context.request.label,

It works like a charm.
Best
Simon

#6

No problem,

Good to hear that it works.

Have a nice day!