skip to Main Content

I want to download the whole table's data from this webpage. It has five sub tables, each with its own thead and tbody, under ‘table[class="calendar"]’. My code below can pull all thead texts and tbody texts as a whole, but it doesn't group them by sub table.

I want to pull the thead texts and tbody texts from each sub table and then combine them into one table, so that I have the whole organized table data in the same arrangement as the webpage shows. How can I do that? Thank you!

/**
 * Fetches the Finviz economic-calendar page and logs its table data as
 * two 2-D arrays (header rows, then body rows), 9 columns wide.
 * Runs under Google Apps Script (UrlFetchApp + the Cheerio library).
 */
function test() {
  const url = "https://finviz.com/calendar.ashx";
  const res = UrlFetchApp.fetch(url, {
    muteHttpExceptions: true,
  }).getContentText();
  const $ = Cheerio.load(res);

  const COLUMN = 9; // every calendar table on the page has 9 columns

  const thead = $("thead")
    .find("th")
    .toArray()
    .map(el => $(el).text());
  const tableHead = [];
  while (thead.length) tableHead.push(thead.splice(0, COLUMN)); // Convert 1D array to 2D array
  console.log(tableHead);

  const tcontents = $("body > div.content")
    .find("td")
    .toArray()
    .map(el => $(el).text());
  // Pad every "No economic releases" placeholder (a single spanning cell)
  // with empty strings so its row reaches COLUMN cells.
  // BUG FIX: the original `if (idx)` skipped padding when the placeholder
  // was the very first cell (idx === 0) and wrongly padded when it was
  // absent (idx === -1); it also handled only the first occurrence.
  let idx = tcontents.indexOf("No economic releases");
  while (idx > -1) {
    tcontents.splice(idx + 1, 0, "", "", "", "", "", ""); // Add empty elements to match number of table columns
    idx = tcontents.indexOf("No economic releases", idx + 1);
  }
  const tableContents = [];
  while (tcontents.length)
    tableContents.push(tcontents.splice(0, COLUMN)); // Convert 1D array to 2D array
  tableContents.pop(); // remove last empty array
  console.log(tableContents);
}

My expected output is:
enter image description here

2

Answers


  1. Chosen as BEST ANSWER

    I figured out a solution as below. I'm using Google Apps Script.

    /**
     * Google Apps Script: scrapes the five Finviz calendar sub tables
     * (every 3rd child table: nth-child 2, 5, 8, 11, 14) and logs one
     * combined 2-D array of header row + data rows per sub table.
     */
    function test() {

      const url = 'https://finviz.com/calendar.ashx';
      const res = UrlFetchApp.fetch(url, { muteHttpExceptions: true }).getContentText();
      const $ = Cheerio.load(res);
      let table = [];

      const COLUMN = 9; // each sub table is 9 columns wide
      for (let i = 2; i < 15; i += 3) {
        const tableData = $('body > div.content > div > div > table:nth-child(' + i + ')');
        const thead = tableData.find('th').toArray().map(el => $(el).text());
        const tcontents = tableData.find('td').toArray().map(el => $(el).text());
        const idx = tcontents.indexOf('No economic releases');
        if (idx > -1) {
          // Pad the single spanning placeholder cell to a full-width row.
          tcontents.splice(idx + 1, 0, '', '', '', '', '') // Add empty elements to match number of table columns
        }
        // NOTE: the original `else tcontents.slice(5);` was a no-op —
        // slice returns a copy and the result was discarded — so it is removed.
        const tableContents = [];
        while (tcontents.length) tableContents.push(tcontents.splice(0, COLUMN)); //Convert 1D array to 2D array

        // Reassign the outer accumulator instead of re-declaring `var table`
        // inside the loop (which only worked via var hoisting).
        table = [...table, thead, ...tableContents];
      }

      console.log(table)
    }
    

  2. I don’t see any sub tables (tables inside of other tables) here, nor do I see 'table[class="calendar"]'. In general, avoid [class=...] syntax since it’s ultra-rigid and fails if there are other classes present, or the classes are in a different order. Prefer table.calendar.

    Try using nesting in your scraping to preserve the structure. while and splice are sidelines, not usually used for much of anything; map is the primary array-transformation tool.

    const {writeFile} = require("node:fs/promises");
    const cheerio = require("cheerio"); // ^1.0.0-rc.12
    const Papa = require("papaparse"); // ^5.4.1

    const url = "<Your URL>";

    // Fetch the page, scrape every table under .content, and write the
    // combined rows (header row, data rows, blank separator per table)
    // to out.csv as CSV.
    (async () => {
      const res = await fetch(url);

      if (!res.ok) {
        throw Error(res.statusText);
      }

      const html = await res.text();
      const $ = cheerio.load(html);

      // Trimmed text content of a single element.
      const text = el => $(el).text().trim();

      const data = $(".content table")
        .toArray()
        .flatMap(tbl => {
          const headers = $(tbl).find("th").toArray().map(text);
          const rows = $(tbl)
            .find("tr:has(td)")
            .toArray()
            .map(row => $(row).find("td").toArray().map(text));
          // Empty array yields a blank line between tables in the CSV.
          return [headers, ...rows, []];
        });

      await writeFile("out.csv", Papa.unparse(data));
    })().catch(err => console.error(err));
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search