skip to Main Content

Here’s the HTML code.

<!-- Sample listing markup: each .img-item holds an image wrapped in an anchor with an
     empty href, followed by an .in-lable <div> whose nested <a> carries the link and
     whose span.title carries the title text. -->
<div class="list-row">
    <div class="list-item">
        <div class="imgframe">
            <div class="img-wrap">
                <div class="img-item">
                    <a href="">
                        <img src="img1">
                    </a>
                    <div class="in-lable">
                        <a href="link1">
                            <span class="title">title1</span>
                        </a>
                    </div>
                </div>
                <div class="img-item">
                    <a href="">
                        <img src="img2">
                    </a>
                    <div class="in-lable">
                        <a href="link2">
                            <span class="title">title2</span>
                        </a>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>

Here’s my puppeteer code.

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

puppeteer.use(StealthPlugin());

// Opens the listing page, collects one { title, img, link } record per `.img-item`
// element in a single pass, and prints the resulting array.
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    targetFilter: (target) => target.type() !== "other",
  });

  try {
    // Created inside the try block so the browser is still closed if this step fails.
    const page = await browser.newPage();

    const url = 'https://booktoki351.com/novel?book=%EC%9D%BC%EB%B0%98%EC%86%8C%EC%84%A4';
    await page.goto(url, { timeout: 50000, waitUntil: 'load' });
    await page.waitForSelector('#webtoon-list');

    const titlesAndImage = await page.evaluate(() => {
      const listItems = Array.from(document.querySelectorAll('.img-item'));
      return listItems.map((item) => {
        const imgEl = item.querySelector('img');
        // `.in-lable` is a <div> and has no href of its own; the link lives on
        // the <a> nested inside it, and the title text on its span.title.
        const linkEl = item.querySelector('.in-lable a');
        const titleEl = item.querySelector('.in-lable span.title');

        // A missing element yields null for that field instead of throwing and
        // discarding every other record.
        return {
          title: titleEl ? titleEl.textContent.trim() : null,
          img: imgEl ? imgEl.src : null,
          link: linkEl ? linkEl.getAttribute('href') : null,
        };
      });
    });

    console.log(titlesAndImage);
  } catch (error) {
    console.log(error);
  } finally {
    console.log('done');
    await browser.close();
  }
})();

And here’s my console.log testing.

// Collects the `src` of the first <img> found inside every `.img-item` element.
Array.from(document.querySelectorAll('.img-item')).map((itemlist) => itemlist.querySelector('img').src);
// Collects the text of every `span.title` that sits inside an `.img-item` element.
Array.from(document.querySelectorAll('.img-item span.title')).map((itemlist) => itemlist.textContent);

Both work, but I want to get all of the values inside a single map over the img-item elements (or whatever selector fits the example HTML).

I’m expecting an output like this.

[
{
title: 'title1',
img: 'img1',
link: 'link1'
},
{
title: 'title2',
img: 'img2',
link: 'link2'
},
]

2

Answers


  1. Chosen as BEST ANSWER

    Currently, this is what I did: I fetch the title and link separately from the image.

    Maybe someone can help me merge them together into one process.

    const puppeteer = require("puppeteer-extra");
    const StealthPlugin = require("puppeteer-extra-plugin-stealth");

    puppeteer.use(StealthPlugin());

    // Scrapes the listing page in two passes (titles/links first, then images)
    // and prints the values of the first entry.
    (async () => {
      const browser = await puppeteer.launch({
        headless: false,
        targetFilter: (target) => target.type() !== "other",
      });
      const page = await browser.newPage();

      try {
        const url = 'https://booktoki351.com/novel?book=%EC%9D%BC%EB%B0%98%EC%86%8C%EC%84%A4';
        await page.goto(url, { timeout: 50000, waitUntil: 'load' });
        await page.waitForSelector('#webtoon-list');

        // Pass 1: read the title and link out of every label block.
        const labelHandles = await page.$$("div.in-lable");
        const titlesAndLink = await Promise.all(
          labelHandles.map((handle) =>
            handle.evaluate((node) => ({
              title: node.querySelector("span.title").innerText,
              link: node.querySelector("a").href,
            }))
          )
        );

        // Pass 2: read the image URL out of every item.
        const itemHandles = await page.$$("div.img-item");
        const imageLink = await Promise.all(
          itemHandles.map((handle) =>
            handle.evaluate((node) => ({ img: node.querySelector("img").src }))
          )
        );

        // The two arrays are paired by position; only the first entry is printed.
        const [firstEntry] = titlesAndLink;
        const [firstImage] = imageLink;
        console.log(firstEntry.title, firstEntry.link, firstImage.img);
      } catch (error) {
        console.log(error);
      } finally {
        console.log('done');
        await browser.close();
      }
    })();
    

  2. The following solution uses puppeteer’s "multi-selector" $$ to retrieve all elements with class img-item in one asynchronous operation. It then starts one parallel asynchronous operation per elem; each runs in the page with the corresponding DOM element e and reads the desired properties using synchronous querySelector calls.

    // One element handle per `.img-item`; each handle is evaluated in parallel and
    // yields a single { title, img, link } record.
    const itemHandles = await page.$$(".img-item");
    const titlesAndImage = await Promise.all(
      itemHandles.map((elem) =>
        elem.evaluate((e) => {
          const title = e.querySelector(".in-lable span").textContent;
          const img = e.querySelector("img").src;
          const link = e.querySelector(".in-lable a").href;
          return { title, img, link };
        })
      )
    );
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search