I have a puppeteer project with node.js and i want to find all links present in a page with their status codes. In this page, jumptastic.com/duluth/, i have intentionally placed a link (/blg_redirect/) with a 301, yet my code keeps counting it as a 200. help?
const puppeteer = require('puppeteer');
const express = require('express');
const Router = express.Router();
async function categorizeLinks(req, res, url) {
const browser = await puppeteer.launch({ headless: 'false', args: ['--no-sandbox'] })
const page = await browser.newPage();
await page.setViewport({ width: 1366, height: 1068 });
await page.goto(url);
// Extract all links from the page
const links = await page.$$eval('a', (anchors) => {
return anchors.map((anchor) => anchor.href);
});
console.log(links);
const categorizedLinks = {
'200': [],
'300': [],
'400': [],
'500': [],
};
// Iterate through each link and fetch its status code
for (const link of links) {
try {
const response = await page.goto(link, { timeout: 10000 });
const statusCode = response.status();
// Categorize the link based on its status code
if (statusCode >= 200 && statusCode <= 299) {
categorizedLinks['200'].push(link);
} else if (statusCode >= 300 && statusCode <= 399) {
categorizedLinks['300'].push(link);
} else if (statusCode >= 400 && statusCode <= 499) {
categorizedLinks['400'].push(link);
} else if (statusCode >= 500) {
categorizedLinks['500'].push(link);
}
}
catch (error) {
console.error(`Failed to fetch link: ${link}`, error);
}
}
await browser.close();
console.log("categorizedLinks", categorizedLinks);
}
Router.get('/', async function findLinkStatus(req, res) {
if (!req.query.url) return res.status(400).json({ error: "url is required" })
const url = req.query.url;
try {
await categorizeLinks(req, res, url)
res.status(200).json({ status: 'Pass', url: url});
// res.status(200).json({ status: 'Pass', clientsID: Number(clientsID) });
} catch (err) {
console.error(err)
// console.error(`Failed to fetch link: ${link}`, error);
res.status(500).json({ message: "Something went wrong", status: '500' })
}
});
module.exports = Router;
The output should find a single 301 redirect link. It finds all other status codes fine to my current knowledge.
2
Answers
You must remember that puppeteer is ‘frontend’, it’s a browser.
So when a browser encounters a redirect status code in a server response it will follow that redirect and, hopefully, the end result will be that the browser loads the page the redirect redirects to successfully, and that means we will get the status code 200.
You can use interceptors like
GitHub: How to stop puppeteer follow redirects
#1132
Which you can update to use
request.respond