I wrote code to scrape an array of Shopify e-commerce websites using the website-scraper npm module in Node.js, but it returns a 403 error for these sites, while the same code works for other websites.

How can I get around this problem?

My scraperTest.js file is:

var scrape = require('website-scraper');
let test = require('./test')
let urls = [];
let urlList = ['1500.academy'];
urlList.forEach(url =>{
    test.checkRedirect(url)
    .then(domain =>{
        urls.push('https://' + domain);
        console.log(urls);
        var options = {
            urls: urls,
            directory: './autochat/',
            'User-Agent': 'request',
        };

        // with promise
        scrape(options).then((result) => {
            /* some code here */
        }).catch((err) => {
            /* some code here */
        });

        // or with callback
        scrape(options, (error, result) => {
            /* some code here */
        });
    })
})

and the test.js file is:

const request = require('request');
const extractDomain = require('extract-domain');

//var link = 'oneplustwocase.com';

function checkRedirect(link) {
    return new Promise((resolve, reject) => {

        var url = "http://" + link;
        var options = {
            url: url,
            headers: {
                'User-Agent': 'request'
            }
        };
        request(options, function (error, response, body) {
            if (response !== undefined) {
                // Resolve with the final domain after any redirects
                let redirectedDomain = extractDomain(response.request.uri.href);
                if (response.statusCode === 200 && link !== redirectedDomain) {
                    resolve(redirectedDomain);
                } else {
                    resolve(link);
                }
            } else {
                resolve(link);
            }
        });
    });
}

module.exports.checkRedirect = checkRedirect;

3 Answers

  1. Chosen as BEST ANSWER

    I found the solution. We can fetch the HTML of a domain using request(); response.body contains the HTML data.

    Here is the code I used:

    const request = require('request');
    const extractDomain = require('extract-domain');
    const fs = require('fs');

    function checkRedirect(link) {
        var url = "http://" + link;
        var options = {
            url: url,
            headers: {
                'User-Agent': 'request'
            }
        };
        request(options, function (error, response, body) {
            if (response !== undefined) {
                // Save the fetched HTML to a file named after the final domain
                let redirectedDomain = extractDomain(response.request.uri.href);
                let writeStream = fs.createWriteStream(redirectedDomain + '.html');
                writeStream.write(response.body);
                writeStream.end();
            }
        });
    }

    module.exports.checkRedirect = checkRedirect;
    
    // Example usage: checkRedirect('oneplustwocase.com')
    

  2. Since you are interested in the data, save yourself the headache of scraping and simply download the site's XML sitemap. It lists all the products and other interesting pages, and it is what Google and other search engines use. A minimal sketch of this approach follows.
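
    For example, here is a minimal sketch, assuming the shop exposes a standard /sitemap.xml (Shopify stores generally do) and reusing the request module from the question. The regex-based <loc> extraction is a simplification; a real implementation would use an XML parser, and Shopify's top-level sitemap usually points to sub-sitemaps such as sitemap_products_1.xml, which you would fetch the same way:

    const request = require('request');

    // Fetch a sitemap and collect the URLs listed in its <loc> tags.
    function fetchSitemapUrls(domain, callback) {
        request('https://' + domain + '/sitemap.xml', function (error, response, body) {
            if (error) {
                return callback(error);
            }
            if (response.statusCode !== 200) {
                return callback(new Error('HTTP ' + response.statusCode));
            }
            var urls = [];
            var locPattern = /<loc>(.*?)<\/loc>/g;
            var match;
            while ((match = locPattern.exec(body)) !== null) {
                urls.push(match[1]);
            }
            callback(null, urls);
        });
    }

    // Example usage:
    // fetchSitemapUrls('1500.academy', (err, urls) => console.log(err || urls));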

  3. The request headers for website-scraper go under the request key, not at the top level of options. So the options should look like:

    const options = {
      urls:[{url: 'http://1500.academy/'}],
      directory: './autochat/',
      request: {
        headers: {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        }
      }
    }
    

    By the way, website-scraper follows redirects by default, so you can skip the redirect check.
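
    Putting it together, here is a minimal sketch, assuming a website-scraper version that accepts the request option shown above (note that website-scraper requires the target directory to not exist yet):

    const scrape = require('website-scraper');

    const options = {
      urls: [{ url: 'http://1500.academy/' }],
      directory: './autochat/', // must not already exist
      request: {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        }
      }
    };

    scrape(options)
      .then((result) => console.log('Saved ' + result.length + ' resource(s)'))
      .catch((err) => console.error(err));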
