Scrape and store Shopify ecommerce websites using Node.js

SureshKumarKatikala
October 5, 2018
244 views
2 votes
3 Answers

I wrote code to scrape an array of Shopify ecommerce websites using the website-scraper npm module in Node.js, but it fails with a 403 error, even though the same code works for other websites.

How can we get around this problem?

My scraperTest.js file is :

var scrape = require('website-scraper');
let test = require('./test')
// Collects the URLs handed to website-scraper. It is declared outside the loop,
// so every iteration appends to the same array and each scrape() call below
// receives all URLs accumulated so far.
let urls = [];
// NOTE(review): assigned without var/let/const, so this creates an implicit
// global (and would throw a ReferenceError in strict mode).
urlList = ['1500.academy'];
// For each bare domain: resolve the domain to use via test.checkRedirect, then
// scrape 'https://<domain>' into ./autochat/.
urlList.forEach(url =>{
    // NOTE(review): this promise chain has no .catch(), so a rejection from
    // checkRedirect (or an exception thrown in the handler) goes unhandled.
    test.checkRedirect(url)
    .then(domain =>{
        urls.push('https://' + domain);
        console.log(urls);
        var options = {
            urls: urls,
            directory: './autochat/',
            // NOTE(review): website-scraper's README describes request headers
            // under a `request` option; this root-level key is presumably not
            // sent as a header — confirm against the docs.
            'User-Agent': 'request',
        };

        // with promise
        scrape(options).then((result) => {
            /* some code here */
        }).catch((err) => {
            /* some code here */
        });

        // or with callback
        // NOTE(review): "or" suggests an alternative, but as written BOTH calls
        // run, so the same options are scraped twice into the same directory.
        scrape(options, (error, result) => {
            /* some code here */
        });
    })
})

and test.js file is

const request = require('request');
const extractDomain = require('extract-domain');

//var link = 'oneplustwocase.com';

/**
 * Determine which domain `link` ends up on when it is requested over HTTP.
 *
 * Requests `http://<link>` and extracts the domain from the URL of the
 * request that produced the response. The returned promise never rejects:
 * when the request fails or yields no response, the original `link` is
 * returned as a best-effort fallback.
 *
 * @param {string} link - Bare domain name without a scheme, e.g. 'example.com'.
 * @returns {Promise<string>} The extracted domain when the response status is
 *     200 and that domain differs from `link`; otherwise `link` itself.
 */
function checkRedirect(link) {
    return new Promise((resolve) => {
        const options = {
            url: 'http://' + link,
            headers: {
                'User-Agent': 'request'
            }
        };
        request(options, function (error, response) {
            // When the request fails the callback receives no response, so the
            // response must be checked BEFORE it is dereferenced; otherwise a
            // TypeError is thrown inside the callback and the fallback is lost.
            if (error || !response) {
                resolve(link);
                return;
            }

            const redirectedDomain = extractDomain(response.request.uri.href);
            const wasRedirected = response.statusCode === 200 && link !== redirectedDomain;
            resolve(wasRedirected ? redirectedDomain : link);
        });
    });
}

module.exports.checkRedirect = checkRedirect;

Answers

Chosen as BEST ANSWER

I found the solution. We can fetch the HTML of the domain using request(); the response.body contains the HTML data.

I got the solution by using the following code:

const request = require('request');
const extractDomain = require('extract-domain');
let fs = require('fs');

/**
 * Fetch `http://<link>` and save the response body to `<domain>.html` in the
 * current working directory, where `<domain>` is extracted from the URL of
 * the request that produced the response.
 *
 * Failures (no response, or an error while writing the file) are logged to
 * stderr; nothing is thrown and nothing is returned.
 *
 * @param {string} link - Bare domain name without a scheme, e.g. 'example.com'.
 * @returns {void}
 */
function checkRedirect(link) {
    const options = {
        url: 'http://' + link,
        headers: {
            'User-Agent': 'request'
        }
    };
    request(options, function (error, response) {
        // Without a response there is nothing to save; report and stop rather
        // than dereferencing an undefined response.
        if (error || !response) {
            console.error('Request for ' + options.url + ' failed:', error);
            return;
        }

        const redirectedDomain = extractDomain(response.request.uri.href);
        const fileName = redirectedDomain + '.html';
        const writeStream = fs.createWriteStream(fileName);
        // An unhandled 'error' event on a stream would crash the process.
        writeStream.on('error', function (writeError) {
            console.error('Could not write ' + fileName + ':', writeError);
        });
        // end(chunk) writes the final chunk and closes the stream in one call.
        writeStream.end(response.body);
    });
}

module.exports.checkRedirect = checkRedirect;

//checkRedirect('oneplustwocase.com')

/*
var r = request(url, function (e, resp) {
    r.uri
    resp.request.uri
  })*/

(Edit)

- DavidLazar
- October 5, 2018 at 3:23 pm
- 0 votes
0
Since you are interested in data, save yourself the headache of scraping and simply download the site XML file. It contains all the products and interesting information, just like Google or any other search engine.

Login or Signup to reply.

- s0ph1e
- October 11, 2018 at 4:34 pm
- 0 votes
0
- It seems that the website http://1500.academy returns 403 if it doesn’t like the User-Agent header. I suggest trying a User-Agent that looks like a browser's.
- According to the website-scraper documentation (https://www.npmjs.com/package/website-scraper#request), you should pass request headers in the `request` property, not at the root level.
So the options should look like this:
```
// Browser-like User-Agent string, used instead of the 'request' value that
// the site answered with 403.
const browserUserAgent =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36';

// website-scraper options: request headers are nested under `request.headers`
// rather than placed at the root of the options object.
const options = {
  urls: [
    { url: 'http://1500.academy/' },
  ],
  directory: './autochat/',
  request: {
    headers: { 'User-Agent': browserUserAgent },
  },
};
```
By the way, website-scraper follows redirects by default, so you can skip checking for redirects.
Login or Signup to reply.

Please signup or login to give your own answer.

Click here to cancel reply.