使用 Node.js 和 cheerio.js 抓取带有 rowspan 属性的 HTML 表格

问题描述

我正在尝试从维基百科抓取 HTML 表格以获取数据,但由于 rowspan 属性,我在遍历表格时遇到了很多麻烦。我使用的是 cheerio 包,并且在 cheerio 返回的大量 jQuery 对象中迷失了方向。任何帮助我都将不胜感激,我已经为这个问题苦恼好几天了。您在下面看到的只是我尝试过的代码的一小部分。谢谢。

表格网址 = "https://en.wikipedia.org/wiki/Stephen_King_bibliography"


const axios = require("axios");
const $ = require("cheerio");

// Wikipedia page containing the bibliography table to scrape; passed to getBooks.
const WIKI_URL = "https://en.wikipedia.org/wiki/Stephen_King_bibliography";

/**
 * Fetches a page and, for every row of the targeted table, logs either the
 * row's <td> elements or the placeholder "NOT FOUND".
 *
 * Nothing is returned: the collected rows are only printed. Any error raised
 * while fetching or parsing is caught and logged instead of being rethrown.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<void>}
 */
const getBooks = async (url) => {
  // NOTE(review): declared but never populated or read.
  const scrapedBooks = [];
  try {
    const response = await axios.get(url);
    // Select every body row of the target table, using the downloaded HTML as context.
    const tableRows = $(
      "#mw-content-text > div.mw-parser-output > table:nth-child(6) > tbody > tr",
      response.data
    );
    const rowElements = tableRows.get();

    // For each row: the combined <th> text, split into its non-empty lines.
    const headersPerRow = rowElements.map((row) => {
      const headerText = $(row).find("th").text();
      return headerText.split("\n").filter((label) => label.length);
    });
    // Only the first row's labels are kept (undefined when no rows matched).
    // NOTE(review): not used after this point.
    const headers = headersPerRow[0];

    const data = [];
    for (const row of rowElements) {
      const cells = $(row).find("td");
      // .attr() reads the attribute of the first matched element only, so this
      // tests the rowspan of the row's first <td>, not of every cell.
      if (cells.attr("rowspan")) {
        data.push(cells.get());
      } else {
        data.push("NOT FOUND");
      }
    }

    console.log(data);
  } catch (err) {
    console.log(err);
  }
};

// Start the scrape. The returned promise is not awaited; getBooks catches and logs its own errors.
getBooks(WIKI_URL);


解决方法

以前从未尝试过网络抓取,但我一直想进入它,所以我试了一下这个问题。

据我理解,您的问题在于 rowspan 打乱了表格中每行的单元格数量,导致某些行的数据项少于应有的 6 项。

由于我采用了全新的思路,代码与您的并不相同,但它确实适用于您的 URL,使用了相同的包,并解决了您的具体问题。可运行的版本请参阅下面的代码。我添加了自己的注释,以帮助您理解其中的逻辑。

const cheerio = require('cheerio');
// Extracts a table's header labels and cell values from fetched HTML, then
// re-inserts the values of rowspan cells into the rows they cover so those
// rows are no longer short.
// Expects `res.data` (the HTML string) and `cheerio` to be in scope.

// Parse the HTML into a queryable cheerio document.
const $ = cheerio.load(res.data);

// Target the specific table with a CSS selector.
const html_table = $('#mw-content-text > div.mw-parser-output > table:nth-child(6)');

// Header titles: the trimmed text of every <th> in the table.
const table_header = html_table
  .find('th')
  .map((index, th) => $(th).text().trim())
  .toArray();

// Every cell in the table that carries a usable rowspan, as a flat list.
// cheerio's .map() concatenates arrays returned by the callback, so the
// per-row arrays below are flattened into a single array of descriptors.
const rowspans = html_table
  .find('tbody tr')
  .map((tr_index, tr) => {
    return $(tr)
      .find('td')
      .map((td_index, td) => {
        const cell = $(td);
        // A missing or non-numeric rowspan yields NaN, which becomes 0 here.
        const count = Number(cell.attr('rowspan')) || 0;
        // td_index is the position among the <td> elements physically present
        // in this row's markup.
        return { tr_index: tr_index, td_index: td_index, count: count, value: cell.text().trim() };
      })
      .toArray()
      // Keep only cells whose rowspan count is non-zero.
      .filter((item) => item.count);
  })
  .toArray();

// Cell values for every row, with the cells hidden by a rowspan filled back in.
const table_data = html_table
  .find('tbody tr')
  .map((tr_index, tr) => {
    // Raw cell text for this row; nothing special happens to the cells yet.
    const cells = $(tr)
      .find('td')
      .map((td_index, td) => $(td).text().trim())
      .toArray();

    rowspans.forEach((rowspan) => {
      // Rows strictly after the originating row and before
      // (originating row + rowspan) are the ones missing the spanned cell.
      const first_row = rowspan.tr_index;
      const end_row = rowspan.tr_index + rowspan.count;
      if (tr_index > first_row && tr_index < end_row) {
        // Insert the spanned value at its column without removing anything.
        // The deleteCount of 0 is required: passing the value as the second
        // argument would make it the deleteCount and insert nothing.
        cells.splice(rowspan.td_index, 0, rowspan.value);
      }
    });

    // Wrap in an array so cheerio's .map() flattening keeps each row as one item.
    return [cells];
  })
  .toArray();

// Output the table headers.
console.log('table_header',table_header);
// Output the table data.
console.log('table_data',table_data);