skip to Main Content

This is the HTML list I’m scraping:

<button class="css-4od5c4 e1ttybed2">
<div class="css-1tkalz1 e3whs8q0">
<p class="css-iqfm9l enp2lf70">Sisjön</p>
<p class="css-1cwtvfm enp2lf70">Askim</p>
<div class="css-7omsg3 e3whs8q0">
<p class="css-1n26c6q enp2lf70">Välj butik</p>
</div></div>
<div class="css-177ui4i e3whs8q0">
<p class="css-iqfm9l enp2lf70">3 st</p></div>
</button>

<button class="css-4od5c4 e1ttybed2">
<div class="css-1tkalz1 e3whs8q0">
<p class="css-iqfm9l enp2lf70">random address...</p>
<p class="css-1cwtvfm enp2lf70">some city...</p>
<div class="css-7omsg3 e3whs8q0">
<p class="css-1n26c6q enp2lf70">Välj butik</p>
</div></div>
<div class="css-177ui4i e3whs8q0">
<p class="css-iqfm9l enp2lf70">3 st</p></div>
</button>

And this is the code:

// Wait for the list of stores to load, then map each store's content.

  // Block until at least one element with the store-button class is in the DOM.
  await page.waitForSelector('.css-4od5c4');

  // The callback reads `document`, i.e. it is meant to run against the page's DOM;
  // it resolves to an array of { address, city, amount } objects.
  const storesData = await page.evaluate(async () => {
    // Fixed 5-second pause before the DOM is read.
    await new Promise((resolve) => setTimeout(resolve, 5000));
    
    // Each `item` is the inner .css-1tkalz1 div of a store button, not the button itself.
    const stores = Array.from(document.querySelectorAll('.css-4od5c4 .css-1tkalz1'));
    const data = stores.map((item) => {
        // No null check here: a missing <p> would throw on .textContent.
        const address = item.querySelector('.css-1tkalz1 .css-iqfm9l').textContent.trim();
        const city = item.querySelector('.css-1tkalz1 .css-1cwtvfm').textContent.trim();
        // NOTE(review): in the HTML shown above, .css-177ui4i is a sibling of the
        // .css-1tkalz1 div, not a descendant of it, so this lookup made from `item`
        // finds nothing and `amount` falls back to 'no value'.
        const amountElement = item.querySelector('.css-177ui4i.e3whs8q0 p.css-iqfm9l.enp2lf70');
        const amount = amountElement ? amountElement.textContent.trim() : 'no value';

        return {
            address,
            city,
            amount
        };
    });

    return data;
});

  // Print the scraped array.
  console.log(storesData)

I can scrape the first two p elements inside the first div, but I have had no success scraping the p with the value "3 st" in the second div.

I keep changing the CSS selector for the amount p, and I added a setTimeout to be sure that all the elements have fully loaded.

The CSS classes are all very similar, and I have tried copying the selector string from the webpage, but it did not work. I think the problem can be solved by someone who is good at CSS selectors, which I’m not.

The array I get back is:

[
 {
    address: 'random address...',
    city: 'some city...',
    amount: 'no value'
  },
  {
    address: 'random address...',
    city: 'some city...',
    amount: 'no value'
  },
]

And what I want to get back:

[ 
 {
    address: 'random address...',
    city: 'some city...',
    amount: '3 st'
  },
  {
    address: 'random address...',
    city: 'some city...',
    amount: '3 st'
  },
]

2

Answers


  1. Ok, here is a different approach that doesn’t rely so much on the class names but rather the order of elements inside the button so even if the class names change it will keep working.
    This gives you more flexibility and also fine-grained control over what you want to extract. The point is to interrogate childNodes for their content.

    It’s a bit verbose, as I have written this in vanilla style, but you can see what’s happening, and you can easily tweak it and wrap it into a more compact function:

    /**
     * Collect address, city and amount from every store button on the page.
     *
     * Fields are located by element position inside each button rather than by
     * the class names of the inner elements: the first child <div> holds the
     * address and city <p> elements (in that order), and any later child <div>
     * holds the amount <p>.
     *
     * The result is logged to the console and also returned.
     *
     * @returns {{address: string, city: string, amount: string}[]} one entry per
     *   button; a field stays "" when its <p> element is missing.
     */
    function scrape_buttons(){

        const stores = []

        // every button with the css-4od5c4 css style
        const buttons = Array.from(document.querySelectorAll(".css-4od5c4"))

        for(const button of buttons){

            //prep an empty database object
            const data = {
                address:"",
                city:"",
                amount:""
            }

            // `children` lists element nodes only, so the positions used below do
            // not shift with the whitespace text nodes that `childNodes` also
            // contains (those depend on how the markup happens to be formatted)
            const divs = Array.from(button.children).filter(node => node.nodeName === "DIV")

            divs.forEach((div, divIndex) => {
                // only the direct <p> children are read; the nested "Välj butik"
                // div is skipped, go one level deeper here if you need it
                const paragraphs = Array.from(div.children).filter(node => node.nodeName === "P")

                if(divIndex === 0){ //first container div: address, then city
                    if(paragraphs.length > 0){
                        data.address = paragraphs[0].innerText
                    }
                    if(paragraphs.length > 1){
                        data.city = paragraphs[1].innerText
                    }
                }else{ //later container div, the '3 st'
                    for(const paragraph of paragraphs){
                        data.amount = paragraph.innerText
                    }
                }
            })

            stores.push(data)
        }

        console.log(stores)

        // also hand the result back so a caller can use it, not just read the log
        return stores
    }
    
    scrape_buttons()
    
    Login or Signup to reply.
  2. If you format the HTML, you’ll see that the enclosing container is the button, not the div. The usual approach is to grab whatever parent element is the root of all of the data you want, then dip into child elements, using sub-parents to disambiguate as necessary:

    const puppeteer = require("puppeteer"); // ^21.0.2
    
    const html = `
    <button class="css-4od5c4 e1ttybed2">
      <div class="css-1tkalz1 e3whs8q0">
        <p class="css-iqfm9l enp2lf70">Sisjön</p>
        <p class="css-1cwtvfm enp2lf70">Askim</p>
        <div class="css-7omsg3 e3whs8q0">
          <p class="css-1n26c6q enp2lf70">Välj butik</p>
        </div>
      </div>
      <div class="css-177ui4i e3whs8q0">
        <p class="css-iqfm9l enp2lf70">3 st</p>
      </div>
    </button>
    
    <button class="css-4od5c4 e1ttybed2">
      <div class="css-1tkalz1 e3whs8q0">
        <p class="css-iqfm9l enp2lf70">random address...</p>
        <p class="css-1cwtvfm enp2lf70">some city...</p>
        <div class="css-7omsg3 e3whs8q0">
          <p class="css-1n26c6q enp2lf70">Välj butik</p>
        </div>
      </div>
      <div class="css-177ui4i e3whs8q0">
        <p class="css-iqfm9l enp2lf70">3 st</p>
      </div>
    </button>`;
    
    let browser;

    // Load the sample markup into a headless page, extract one record per
    // store button, and print the result. Errors are logged, and the browser
    // is closed afterwards if it was launched.
    const main = async () => {
      try {
        browser = await puppeteer.launch();
        const pages = await browser.pages();
        const page = pages[0];
        await page.setContent(html);

        // The callback is evaluated in the page, so it has to be
        // self-contained: everything it needs is defined inside it.
        const content = await page.$$eval(".css-4od5c4", buttons =>
          buttons.map(button => {
            // Trimmed text of the first match inside this button, or a
            // placeholder when nothing matches.
            const read = selector => {
              const node = button.querySelector(selector);
              if (node === null) {
                return "no value";
              }
              return node.textContent.trim();
            };

            const address = read(".css-iqfm9l");
            const city = read(".css-1cwtvfm");
            // Scoped through the second container so it is not confused
            // with the address paragraph, which shares the same class.
            const amount = read(".css-177ui4i .css-iqfm9l");
            return { address, city, amount };
          })
        );

        console.log(content);
      } catch (err) {
        console.error(err);
      } finally {
        await browser?.close();
      }
    };

    main();
    

    If, for some reason, you can’t rely on these CSS classes or prefer to use the n-th child of type to determine which field is which, you can:

    // Position-based variant: take the text of every <p> inside a button in
    // document order and pick the fields by index (0 = address, 1 = city,
    // 3 = amount; index 2 is the "Välj butik" label and is skipped).
    const content = await page.$$eval(".css-4od5c4", buttons =>
      buttons.map(button => {
        const [address, city, , amount = "no value"] = Array.from(
          button.querySelectorAll("p"),
          p => p.textContent.trim()
        );
        return { address, city, amount };
      })
    );
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search