skip to Main Content

I’m scraping the https://naamhinaam.com/baby-girl-names-a?page=${pageNumber} website, but Puppeteer returns an empty object with no values. Here is my code:

// Scrapes the name listing pages with Puppeteer into `data`, then serves `data` as JSON via Express.
const puppeteer = require("puppeteer");
const express = require("express");
const cors = require("cors");
const app = express();
app.use(cors());
// Shared result store: filled by the async scraper below, returned by the GET "/" handler.
let data = [];
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    defaultViewport: null,
  });
  const page = await browser.newPage();
  // The upper bound is exclusive, so pages 1 through 41 are requested.
  for (let pageNumber = 1; pageNumber < 42; pageNumber++) {
    await page.goto(`https://naamhinaam.com/baby-girl-names-a?page=${pageNumber}`);
    // Fixed 3000 ms pause before touching the page.
    await page.waitForTimeout(3000);
    // Clicks the button inside #promotionalPopup (presumably to dismiss the popup).
    await page.click("#promotionalPopup > div > div > div > button > span");
    // Remove the 22nd child of the list container.
    await page.$eval(
      "div.name-suggestion.mt-1 > div > div:nth-child(22)",
      (el) => el.remove()
    );
    // Remove the 43rd child; this selector is resolved after the removal above,
    // so positions have already shifted by one.
    await page.$eval(
      "div.name-suggestion.mt-1 > div > div:nth-child(43)",
      (el) => el.remove()
    );
    // Walk child positions 3 through 53 and read the name link text of each.
    for (let i = 3; i < 54; i++) {
      let fullName = "Null";
      // Skips this iteration whenever any element matches nth-child(22); the condition
      // does not depend on `i`.
      // NOTE(review): after the removal above a later sibling presumably occupies
      // position 22, so this would be truthy on every iteration and nothing would be
      // pushed to `data` — likely the cause of the empty result. Confirm against the page.
      if (await page.$("div.name-suggestion.mt-1 > div > div:nth-child(22)")) {
        continue;
      }          
      // Same check as above, repeated with the same selector.
      if (await page.$("div.name-suggestion.mt-1 > div > div:nth-child(22)")) {
        continue;
      }           
      // Wait for the name link at position `i`, then read its text content.
      await page.waitForSelector(
        `div.name-suggestion.mt-1 > div > div:nth-child(${i}) > div.nsg__name_meaning > a`
      );
      let element = await page.$(
        `div.name-suggestion.mt-1 > div > div:nth-child(${i}) > div.nsg__name_meaning > a`
      );
      fullName = await page.evaluate((el) => el.textContent, element);
      data.push({ fullName });
    }
    // Logs the cumulative results after each page.
    console.log(data);
  }

  await browser.close();
})();
// Returns whatever has been collected so far; the scraper runs concurrently with the server.
app.get("/", (req, res) => {
  res.status(200).json(data);
});
app.listen(3000, () => {
  console.log("App is running...");
});

I am removing these elements in Puppeteer because they contain ads:

// Excerpt: removes the 22nd and then the 43rd child <div> of the list container.
await page.$eval(
          "div.name-suggestion.mt-1 > div > div:nth-child(22)",
          (el) => el.remove()
        );
        // This selector is resolved after the first removal, so positions have shifted by one.
        await page.$eval(
          "div.name-suggestion.mt-1 > div > div:nth-child(43)",
          (el) => el.remove()
        );

I’m looping over the pages and collecting the data here, but afterwards I get an empty array.

2

Answers


  1. Assuming you are trying to extract the baby names and their meanings, you can use the code below. I have updated the locators and removed the popup click, since it is not required when we are only extracting the content.

    const puppeteer = require("puppeteer");
    const express = require("express");
    const cors = require("cors");
    const app = express();
    app.use(cors());
    // Shared result store: filled by the scraper below and returned by GET "/".
    let data = [];
    // Scrape every listing page once at start-up; each entry is { fullName: "<name>, <meaning>" }.
    (async () => {
      const browser = await puppeteer.launch({
        headless: true,
        defaultViewport: null,
      });
      try {
        const page = await browser.newPage();
        for (let pageNumber = 1; pageNumber <= 42; pageNumber++) {
          await page.goto(`https://naamhinaam.com/baby-girl-names-a?page=${pageNumber}`);
          await page.waitForTimeout(3000);
          // Name links and meaning texts are collected as two lists and paired by index.
          const nameElements = await page.$$(
            `a.nsg__name`
          );
          const meaningElements = await page.$$(
            `div.nsg__meaning > i`
          );
    
          for (let i = 0; i < nameElements.length; i++) {
            const name = await page.evaluate(el => el.textContent, nameElements[i]);
            const meaning = await page.evaluate(el => el.textContent, meaningElements[i]);
            // Strip newline and tab characters only. The pattern /[nt]/ would match the
            // letters "n" and "t" and delete them from the names themselves.
            const fullName = `${name.replace(/[\n\t]/g, '').trim()}, ${meaning}`;
            data.push({ fullName });
          }
        }
        console.log(data);
      } finally {
        // Close Chromium even when a navigation or evaluation throws.
        await browser.close();
      }
    })();
    // Returns whatever has been collected so far; the scraper runs concurrently with the server.
    app.get("/", (req, res) => {
      res.status(200).json(data);
    });
    
    app.listen(3000, () => {
      console.log("App is running...");
    });
    

    Outputs

     { fullName: 'Aamuktha, Liberated' },
      { fullName: 'Aanadhitha, Happy one' },
      around 2087 in total
    
    Login or Signup to reply.
  2. There is a :has() CSS pseudo-class that you can use instead of removing elements; read about it here. Note that it doesn’t work in Firefox, but it works fine in the Chromium that Puppeteer uses.
    So this

    let suggestions = await page.$$('div.nsg__list:has(div.nsg__name_meaning)');
    

    gets the list without the elements you don’t want.

    The popup, that you’re trying to close doesn’t block you from getting data from the pages so you don’t need to click it.

    The page.waitForTimeout() method is obsolete; use page.waitForSelector() instead.

    await page.waitForSelector('body');
    

    The popup doesn’t seem to block anything so you don’t need to do anything with it.

    You also have an error in your for loop, you aren’t getting the last page so pageNumber < 42 should be pageNumber <= 42;

    Code :

    const puppeteer = require("puppeteer");
    // Collected rows: { name, link, meaning } for every suggestion on every page.
    const data = [];
    
    /**
     * Extracts name, link and meaning for each suggestion on the currently loaded page.
     *
     * @param {object} page - Puppeteer page that has already navigated to a listing page.
     * @returns {Promise<Array<{name: string, link: string, meaning: string}>>} One row per suggestion.
     */
    async function scrapeSuggestions(page) {
        // :has() keeps only list entries that contain a name/meaning block, so entries
        // without one never reach the loop and no DOM removal is needed.
        const suggestions = await page.$$('div.nsg__list:has(div.nsg__name_meaning)');
        const rows = [];
    
        for (const s of suggestions) {
            const name = await s.$$eval('a.nsg__name', el => el.map(x => { return { name : x.textContent.trim(), link : x.getAttribute('href')}}));
            const meaning = await s.$eval('div.nsg__meaning', el => el.textContent.trim());
            rows.push({name: name[0].name, link: name[0].link, meaning: meaning });
        }
        return rows;
    }
    
    (async () => {
        const browser = await puppeteer.launch({headless: false, defaultViewport: null});
        try {
            const page = await browser.newPage();
            // Skipable start
            await page.setRequestInterception(true);
            page.on('request', (req) => {
                // Another handler may already have resolved this request; touching it again throws.
                if (req.isInterceptResolutionHandled()) return;
                if (/image|imageset|media|stylesheet|font|script/.test(req.resourceType())) {
                    req.respond({status: 200, body: 'aborted'});
                } else {
                    req.continue();
                }
            });
            // Skipable end 
            const t0 = performance.now();
    
            // relative parts start
    
            const url = `https://naamhinaam.com/baby-girl-names-a`
            const gotoSettings = {waitUntil: "load", timeout: 70000};    
    
            await page.goto(url, gotoSettings);
            await page.waitForSelector('body');
    
            // get last page number
            let lastPage = await page.$eval('.page_info', el => el.textContent.trim());
            lastPage = +lastPage.replace('Viewing page', '').split('of').pop().trim();
    
            // get data from first page
            data.push(...await scrapeSuggestions(page));
    
            // get data from other pages (the loop is a no-op when lastPage is 1 or NaN)
            for (let pageNumber = 2; pageNumber <= lastPage; pageNumber++) {
                // Use the same navigation settings as the first page so the timeout is consistent.
                await page.goto(`${url}?page=${pageNumber}`, gotoSettings);
                await page.waitForSelector('body');
                data.push(...await scrapeSuggestions(page));
            }
            // relative parts end
    
            const t1 = performance.now();
            console.log(data);
            console.log(t1 - t0, 'milliseconds');
        } finally {
            // Close Chromium even when a navigation or evaluation throws.
            await browser.close();
        }
    
    })();
    

    Note : The part from //Skipable start to // Skipable end bypasses loading elements stated in the regex to speed things up.

    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search