Clickable element that updates a table rather than a page


#1

I’ve been working on a crawler to pull some financial data from Yahoo Finance. On the financials page of each company there is a button that updates the tables to display quarterly data. I’ve been trying to get the crawler to click the button and collect the data from the updated tables, but haven’t had much luck. I have tried a jQuery click() call that was suggested to me in a previous conversation with support but the DOM doesn’t actually seem to be updating. I apologize if I missed something simple. Prior to this project I have only worked in Python. Thanks in advance!

/**
 * Page function: called on every page the crawler visits to extract data from it.
 *
 * - On the page labelled "START" it loads the list of entries from a public
 *   Google spreadsheet feed, skips output for that page and finishes
 *   asynchronously once the request has completed (successfully or not).
 * - On pages labelled 'balance-sheet', 'financials' or 'cash-flow' it reads
 *   every table row and stores the four data cells under the keys
 *   "<row label> 1" .. "<row label> 4".
 * - On every non-START page it then clicks the "Quarterly" toggle and returns
 *   the data collected so far.
 *
 * @param {Object} context Crawler context; this function uses context.jQuery,
 *     context.request.label, context.skipOutput(), context.willFinishLater()
 *     and context.finish().
 * @returns {Array<Object>|undefined} A one-element array holding the company
 *     name, ticker and scraped cells, or undefined for the "START" page.
 */
function pageFunction(context) {
    var $ = context.jQuery;
    // Labels of pages whose tables are read as one label column plus data columns.
    var fourcol = ['balance-sheet', 'financials', 'cash-flow'];
    var DATA_COLUMNS = 4;

    if (context.request.label === "START") {
        var SPREADSHEET_ID = "181ZudsbUkilFrODlXZqqtkrz2RBDm1ZzrRCWMPH3QQU";
        var urlAPI = "https://spreadsheets.google.com/feeds/list/" + SPREADSHEET_ID + "/1/public/values?alt=json";

        // Announce the asynchronous finish before the request is started.
        context.skipOutput();
        context.willFinishLater();

        $.get(urlAPI, function (data) {
            // Guard against a response without feed entries so finish() is always reached.
            var entries = (data && data.feed && data.feed.entry) || [];
            $.each(entries, function (index, value) {
                var url = value.title.$t;
                //context.enqueuePage(url);
            });
            context.finish();
        }).fail(function () {
            // Without this handler a failed request would never call finish().
            context.finish();
        });

        // The start page has nothing to scrape; do not fall through to the scraping code.
        return;
    }

    // The heading is expected to look like "Company Name (TICKER)".
    var raw_company = $('h1').text().trim();
    var company_name = raw_company.substring(0, raw_company.lastIndexOf(' '));
    var company_ticker = raw_company.substring(raw_company.lastIndexOf('(') + 1, raw_company.lastIndexOf(')'));

    var annual_list = [{'Name': company_name, 'Ticker': company_ticker}];

    if (fourcol.indexOf(context.request.label) >= 0) {
        $('table').each(function (j, tabl) {
            $(tabl).find('tr').each(function (i, row) {
                var a_row = $(row);
                var a_label = a_row.find('td:eq(0)').text().trim();
                // Cell 0 is the row label; cells 1..DATA_COLUMNS hold the values.
                for (var col = 1; col <= DATA_COLUMNS; col++) {
                    annual_list[0][a_label + ' ' + col] = a_row.find('td:eq(' + col + ')').text().trim();
                }
            });
        });
    }
    // NOTE: pages labelled "analysis" are not parsed yet; only Name and Ticker
    // are returned for them.

    // Log the element that is actually clicked (prints "undefined" if it is missing).
    var quarterlyToggle = $('span:contains("Quarterly")');
    console.log("CLICKING: " + quarterlyToggle.prop('outerHTML'));
    quarterlyToggle.click();

    // Disabled attempt to wait for the quarterly tables and return them as well:
    /*setTimeout(function(){
        var quarterly_list = [{'Name': company_name, 'Ticker': company_ticker},{'Name': company_name, 'Ticker': company_ticker},
                {'Name': company_name, 'Ticker': company_ticker},{'Name': company_name, 'Ticker': company_ticker}];

        $('tr').each(function (i, row) {
            var b_row = $(row);
            var b_label = b_row.find('td:eq(0)').text().trim();
            quarterly_list[0][b_label] = b_row.find('td:eq(1)').text().trim();
            quarterly_list[1][b_label] = b_row.find('td:eq(2)').text().trim();
            quarterly_list[2][b_label] = b_row.find('td:eq(3)').text().trim();
            quarterly_list[3][b_label] = b_row.find('td:eq(4)').text().trim();
        });
        context.finish(annual_list.concat(quarterly_list));
    }, 3000);
    context.willFinishLater();*/
    return annual_list;
}


#2

Hi @ElliotGiles,

I checked your code and I have no idea why the click doesn’t work. You can try extending the timeout in the setTimeout function. You can also try turning on the verbose log and checking whether you can find the issue there.

I recommend using an actor with Puppeteer, where you have better control over the browser. There is an example of how you can create a simple crawler in an actor. It will be easy to create a crawler like that in an actor.