skip to Main Content

I have a puppeteer project with node.js and i want to find all links present in a page with their status codes. In this page, jumptastic.com/duluth/, i have intentionally placed a link (/blg_redirect/) with a 301, yet my code keeps counting it as a 200. help?

const puppeteer = require('puppeteer');
const express = require('express');

const Router = express.Router();

async function categorizeLinks(req, res, url) {
    const browser = await puppeteer.launch({ headless: 'false', args: ['--no-sandbox'] })

    const page = await browser.newPage();
    await page.setViewport({ width: 1366, height: 1068 });
    await page.goto(url);

    // Extract all links from the page
    const links = await page.$$eval('a', (anchors) => {
        return anchors.map((anchor) => anchor.href);
    });
    console.log(links);

    const categorizedLinks = {
        '200': [],
        '300': [],
        '400': [],
        '500': [],
    };

    // Iterate through each link and fetch its status code
    for (const link of links) {
        try {
            const response = await page.goto(link, { timeout: 10000 });
            const statusCode = response.status();

            // Categorize the link based on its status code
            if (statusCode >= 200 && statusCode <= 299) {
                categorizedLinks['200'].push(link);
            } else if (statusCode >= 300 && statusCode <= 399) {
                categorizedLinks['300'].push(link);
            } else if (statusCode >= 400 && statusCode <= 499) {
                categorizedLinks['400'].push(link);
            } else if (statusCode >= 500) {
                categorizedLinks['500'].push(link);
            }
        }
        catch (error) {
            console.error(`Failed to fetch link: ${link}`, error);
        }
    }
    await browser.close();

    console.log("categorizedLinks", categorizedLinks);
}


Router.get('/', async function findLinkStatus(req, res) {
    if (!req.query.url) return res.status(400).json({ error: "url is required" })

    const url = req.query.url;

    try {
        await categorizeLinks(req, res, url)
        res.status(200).json({ status: 'Pass', url: url});
        // res.status(200).json({ status: 'Pass', clientsID: Number(clientsID) });
    } catch (err) {
        console.error(err)
        // console.error(`Failed to fetch link: ${link}`, error);
        res.status(500).json({ message: "Something went wrong", status: '500' })
    }
});


module.exports = Router;

The output should find a single 301 redirect link. It finds all other status codes fine to my current knowledge.

2

Answers


  1. You must remember that puppeteer is ‘frontend’, it’s a browser.

    So when a browser encounters a redirect status code in a server response it will follow that redirect and, hopefully, the end result will be that the browser loads the page the redirect redirects to successfully, and that means we will get the status code 200.

    Login or Signup to reply.
  2. You can use interceptors like

    await page.setRequestInterception(true);
    page.on('request', request => {
      if (request.isNavigationRequest() && request.redirectChain().length)
        request.abort();
      else
        request.continue();
    });
    await page.goto('https://example.com');
    

    GitHub: How to stop puppeteer follow redirects
    #1132

    Which you can update to use request.respond

    await page.setRequestInterception(true);
    page.on('request', request => {
      if (request.isNavigationRequest() && request.redirectChain().length)
        request.respond({
          status: 300, // You might have to check this 2XX seems to be working but never tried with 3XX
          contentType: 'text/plain',
          body: 'Redirects!',
        });
      else
        request.continue();
    });
    await page.goto('https://example.com');
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search