Heading to detail page with Schema Microdata script


#1

Good evening,

On 123pages, a directory website, I use Apify’s Schema.org Microdata script posted on the blog.
The script works like a charm in my case and scrapes every detail of each listing page. But the script doesn’t head to any detail page, nor does it scrape any information from one.

Any idea how to fix this?

Thanks in advance for any help!

John

/**
 * Extracts every top-level Schema.org Microdata item found on the current page.
 *
 * Relies on a global `$` exposing a jQuery-style API (attr, text, find, filter,
 * each, is, parentsUntil) bound to the page's DOM.
 *
 * @returns {Array<Object>} One object per top-level `[itemscope]` element. Each
 *     object carries `_type` (the element's `itemtype` attribute) plus one key
 *     per `itemprop`; repeated properties are collected into arrays and nested
 *     `[itemscope]` properties are extracted recursively.
 */
function schemaOrgParser() {
    // Picks the first truthy source of a property's value, in this order:
    // `content` attribute, text content, `src`, `href`; otherwise null.
    // Because of `||`, an empty string falls through to the next source.
    var extractValue = function (elem) {
        var $elem = $(elem);
        return $elem.attr("content") || $elem.text()
            || $elem.attr("src") || $elem.attr("href") || null;
    };

    // Stores `value` under `propName`; a repeated property name is turned into
    // (or appended to) an array so that no value is overwritten.
    var addProperty = function (item, propName, value) {
        // Only plain strings are trimmed; nested items (objects) and null are
        // stored untouched. `typeof` yields 'string' for string primitives.
        if (typeof value === 'string') {
            value = value.trim();
        }
        if (Array.isArray(item[propName])) {
            item[propName].push(value);
        } else if (typeof item[propName] !== 'undefined') {
            item[propName] = [item[propName], value];
        } else {
            item[propName] = value;
        }
    };

    // Builds the object for a single `[itemscope]` element.
    var extractItem = function (elem) {
        var item = { _type: $(elem).attr("itemtype") };
        var count = 0;
        // Iterate itemprops that are not nested in another itemscope below
        // `elem`; those belong to the nested item and are handled by recursion.
        $(elem).find("[itemprop]").filter(function () {
            return $(this).parentsUntil(elem, '[itemscope]').length === 0;
        }).each(function () {
            var value = $(this).is("[itemscope]")
                ? extractItem(this)
                : extractValue(this);
            addProperty(item, $(this).attr("itemprop"), value);
            count++;
        });
        // Special case: an itemscope without any itemprop still outputs
        // something, namely the element's own value under `_value`.
        if (count === 0) {
            addProperty(item, "_value", extractValue(elem));
        }
        return item;
    };

    // Collects the items whose `[itemscope]` element has no `[itemscope]`
    // ancestor below `body`, i.e. the top-level items.
    var extractAllItems = function () {
        var items = [];
        $("[itemscope]").filter(function () {
            return $(this).parentsUntil("body", '[itemscope]').length === 0;
        }).each(function () {
            items.push(extractItem(this));
        });
        return items;
    };

    return extractAllItems();
}

#2

Hello @john,

It seems that the Pseudo URL you use in the detail has a regular expression outside of the brackets [ ]. Therefore, those characters are interpreted literally as part of the URL and most likely will not match anything. Try enclosing the whole regex in brackets.

Let me know if this helps.