I wrote a code to scrape an array of Shopify ecommerce websites using website-scraper npm module in node.js but it is showing 403 error but the same code is working for other websites.
How can we get around this problem?
My scraperTest.js file is :
var scrape = require('website-scraper');
let test = require('./test')
let urls = [];
urlList = ['1500.academy'];
urlList.forEach(url =>{
test.checkRedirect(url)
.then(domain =>{
urls.push('https://' + domain);
console.log(urls);
var options = {
urls: urls,
directory: './autochat/',
'User-Agent': 'request',
};
// with promise
scrape(options).then((result) => {
/* some code here */
}).catch((err) => {
/* some code here */
});
// or with callback
scrape(options, (error, result) => {
/* some code here */
});
})
})
and test.js file is
const request = require('request');
const extractDomain = require('extract-domain');
//var link = 'oneplustwocase.com';
function checkRedirect(link) {
return new Promise((resolve, reject) => {
var url = "http://" + link;
var options = {
url: url,
headers: {
'User-Agent': 'request'
}
};
request(options, function (error, response, body) {
let redirectedDomain = extractDomain(response.request.uri.href);
if(response !== undefined){
extractDomain(response.request.uri.href);
if (response.statusCode === 200 && link !== redirectedDomain) {
resolve(redirectedDomain);
} else {
resolve(link);
}
} else {
resolve(link);
}
});
});
}
module.exports.checkRedirect = checkRedirect;
3
Answers
I got the solution. We are able to fetch the html data of the domain using request(); The response.body contains the html data
the solution I got by using the following code :
Since you are interested in data, save yourself the headache of scraping and simply download the site XML file. It contains all the products and interesting information, just like Google or any other search engine.
It seems that website http://1500.academy returns 403 if it doesn’t like user-agent header. I suggest to try user-agent which looks like browser
According to
website-scraper
documentation https://www.npmjs.com/package/website-scraper#request you should pass headers for request inrequest
property, not on root levelSo options should be like:
By the way
website-scraper
follows redirects by default, so you can skip checking redirects