NodeJS爬虫摸索教程

20 Jan 2017

写在前面

听过好多次爬虫，却一直没机会学习一下，学了NodeJS之后，知道NodeJS可以写爬虫，觉得应该挺好玩的，于是入门摸索了一下如何用NodeJS写爬虫，然后了解了几种情况之后，写这篇来总结一下。

NodeJS实现爬虫有很多的依赖可以选择，但是一般有一个比较常见的搭配,后面也出了一些依赖是提供了比较完整的功能，比如：文章后面介绍的node-crawler。

这个爬虫实现的功能：获取到Github上trending中关于JavaScript每周排行版在前25个的仓库。

安装依赖

superagent：是个http方面的库，可以发起http请求
cheerio：解析http返回的html内容，可以理解为一个Node.js版的 jquery，使用方式跟jquery相同。
eventproxy：利用事件机制解决回调函数深度嵌套的问题
async：多线程并发控制

项目路径下，安装上面这四个依赖：

 npm install superagent
 npm install cheerio
 npm install eventproxy
 npm install async

编码过程

可以通过直接访问查看25个repo，现在我们用上面安装的依赖库，来获得每个repo的地址，并且实现并发访问。

首先利用 superagent 来请求请求页面，并借助 cheerio 这个库来分析页面结构，或许所需要的数据，代码如下：

var superagent = require('superagent')
var cheerio = require('cheerio')
var url = require('url')
var targetUrl = 'https://github.com/trending/javascript?since=weekly'

superagent.get(targetUrl)
.end(function (err, res) {
  if(err) {
    return console.error(err)
  }
  console.log('爬虫开始')
  var $ = cheerio.load(res.text)  // 利用cheerio来解析页面

  var repoUrls = []  // 保存仓库的url
  var titleArray = []

  console.log($('.repo-list .mb-1 a').length) // 25
  $('.repo-list .mb-1 a').each(function (index, element) {
    var $element = $(element)
    
    // 获取每个仓库的url
    var href = url.resolve('https://github.com', $element.attr('href'))
    repoUrls.push(href)
    
    // 获取每个仓库名
    var $title = $element.contents().last()[0].data
    $title = $title.substr(0, $title.length - 1)
    titleArray.push($title)
  })
  console.log(repoUrls)
  console.log(titleArray)
})

通过以上代码可以获得25个仓库的url和仓库名，如果继续只是单纯使用superagent来发送请求，那么要连续发送25次请求，而且如果前后数据是有利用或者依赖关系的话，就需要等待上一次请求的结束，才可以发起下一次请求，这样子代码写起来就类似于这样：

superagent.get(repoUrls[0])
  .end(function (err, res) {
    console.log('fetch ' + repoUrls[0] + ' successful');
    
    // 获取到数据之后发起下一次请求
    superagent.get(repoUrls[0])
      .end(function (err, res) {
      console.log('fetch ' + repoUrls[0] + ' successful');
      
        // 获取到数据之后发起下一次请求
        superagent.get(repoUrls[0])
          .end(function (err, res) {
          console.log('fetch ' + repoUrls[0] + ' successful');
        })
    })
  })

这样子就形成所谓的回调函数深度嵌套的问题，于是可以借助 eventproxy 这个库，API很简单,通过下面比较优雅的代码就可以实现对25个仓库发送请求：

// ...
var eventproxy = require('eventproxy')
var ep = new eventproxy()

superagent.get(targetUrl)
.end(function (err, res) {
  //获得25个仓库的url和仓库名
  // ...
  console.log('爬虫开始')
  ep.after('accessRepo', repoUrls.length, function(repos) {
    var oneRepo = repos.map(function (repo) {
      var repoUrl = repo[0]
      var repoHTMl = repo[1]
      var $ = cheerio.load(repoHTMl)

      var httpGitUrl = $('.https-clone-options .js-url-field').attr('value')
      console.log(httpGitUrl)
      return ({
        url: repoUrl,
        httpGitUrl: httpGitUrl
      })
    })
    console.log(oneRepo)  // 获得25个仓库的相应http下载路径
  })

  repoUrls.forEach(function (repoUrl) {
    superagent.get(repoUrl)
      .end(function (err, res) {
        console.log('fetch ' + repoUrl + ' successful');
        ep.emit('accessRepo', [repoUrl, res.text]);
      });
  });
})

但是利用上面代码依次请求25个 repoURL 需要等待很长时间，如果你没有翻墙的话，那就更加不用说了，说不定你可以先看个电影再回来看看是否执行完，所以当请求量多的时候，且希望可以并行请求的话，可以借助 async 这个库，下面我们借助它来实现并发请求25个仓库：

// ...
var async = require('async')

superagent.get(targetUrl)
.end(function (err, res) {
  // ...
  
  // 利用async实现并发数为5的请求处理过程
  concurrencyCount = 0  // 当前并发数记录
  // 对每个repo的逻辑处理函数
  var fetchUrl = function (repoUrl, callback) {
    concurrencyCount++
    console.log('现在的并发数是', concurrencyCount, '，正在抓取的是', repoUrl)
    superagent.get(repoUrl)
      .end(function(err, res){
        var $ = cheerio.load(res.text);             
        // 对页面内容进行解析
        var httpGitUrl = $('.https-clone-options .js-url-field').attr('value')
        console.log(httpGitUrl)
        return ({
          url: repoUrl,
          httpGitUrl: httpGitUrl
        })
        concurrencyCount--;
        callback(null, repoUrl);
      });
  }

  async.mapLimit(repoUrls, 5,
    function (repoUrl, callback) {
      // 对每个URL进行相应处理
      fetchUrl(repoUrl, callback)
    }, function (err, result) {
      console.log('final:')
      console.log(result)
    }
  )
})

于是借助 async 可以实现并发，也可以避免函数嵌套的问题，所以这样就实现了这个爬虫的功能。完整代码如下：

var superagent = require('superagent')
var cheerio = require('cheerio')
var url = require('url')
var targetUrl = 'https://github.com/trending/javascript?since=weekly'

var eventproxy = require('eventproxy')
var ep = new eventproxy()

var async = require('async')

superagent.get(targetUrl)
.end(function (err, res) {
  if(err) {
    return console.error(err)
  }
  console.log('爬虫开始')
  var $ = cheerio.load(res.text)  // 利用cheerio来解析页面

  var repoUrls = []  // 保存仓库的url
  var titleArray = []

  console.log($('.repo-list .mb-1 a').length) // 25
  $('.repo-list .mb-1 a').each(function (index, element) {
    var $element = $(element)

    // 获取每个仓库的url
    var href = url.resolve('https://github.com', $element.attr('href'))
    repoUrls.push(href)

    // 获取每个仓库名
    var $title = $element.contents().last()[0].data
    $title = $title.substr(0, $title.length - 1)
    titleArray.push($title)
  })
  repoUrls = repoUrls.slice(0, 2)

  // ep.after('accessRepo', repoUrls.length, function(repos) {
  //   var oneRepo = repos.map(function (repo) {
  //     var repoUrl = repo[0]
  //     var repoHTMl = repo[1]
  //     var $ = cheerio.load(repoHTMl)

  //     var httpGitUrl = $('.https-clone-options .js-url-field').attr('value')
  //     console.log(httpGitUrl)
  //     return ({
  //       url: repoUrl,
  //       httpGitUrl: httpGitUrl
  //     })
  //   })
  //   console.log(oneRepo)  // 获得25个仓库的相应http下载路径
  // })

  // repoUrls.forEach(function (repoUrl) {
  //   superagent.get(repoUrl)
  //     .end(function (err, res) {
  //       console.log('fetch ' + repoUrl + ' successful');
  //       ep.emit('accessRepo', [repoUrl, res.text]);
  //     });
  // })
  
  // async
  // 利用async实现并发数为5的请求处理过程
  concurrencyCount = 0  // 当前并发数记录
  // 对每个repo的逻辑处理函数
  var fetchUrl = function (repoUrl, callback) {
    concurrencyCount++
    console.log('现在的并发数是', concurrencyCount, '，正在抓取的是', repoUrl)
    superagent.get(repoUrl)
      .end(function(err, res){
        var $ = cheerio.load(res.text);             
        // 对页面内容进行解析
        var httpGitUrl = $('.https-clone-options .js-url-field').attr('value')
        console.log(httpGitUrl)
        return ({
          url: repoUrl,
          httpGitUrl: httpGitUrl
        })
        concurrencyCount--;
        callback(null, repoUrl);
      });
  }

  async.mapLimit(repoUrls, 5,
    function (repoUrl, callback) {
      // 对每个URL进行相应处理
      fetchUrl(repoUrl, callback)
    }, function (err, result) {
      console.log('final:')
      console.log(result)
    }
  )
})

一个就好

继续摸索发现了一个新大陆 node-crawler , 这个库集成了上面三四个依赖库的功能，对于简单爬虫，应该是够用的。

例如，用下面代码也可以实现前面所说的爬虫：

var Crawler = require("crawler")
var url = require('url')
var targetUrl = 'https://github.com/trending/javascript?since=weekly'

var repoUrls = []
var titleArray = []
var httpGitUrlArray = []

var c = new Crawler({
  maxConnections: 5,
  callback: function (error, res, done) {
    if(error){
      console.error(error);
    }else{
      var $ = res.$

      // 仓库页面的逻辑处理
      var httpGitUrl = $('.https-clone-options .js-url-field').attr('value')
      console.log(httpGitUrl)

      // 保存到数组中
      httpGitUrlArray.push({
        url: this.uri,
        httpGitUrl: httpGitUrl
      })
    }
    console.log(httpGitUrlArray)
    done();
  }
});

c.queue({
  uri: targetUrl,
  callback: function (error, res, done) {
    if(error){
      console.error(error);
    }else{
      var $ = res.$;
      console.log($('.repo-list .mb-1 a').length) // 25
      $('.repo-list .mb-1 a').each(function (index, element) {
        var $element = $(element)

        // 获取每个仓库的url
        var href = url.resolve('https://github.com', $element.attr('href'))
        repoUrls.push(href)

        // 获取每个仓库名
        var $title = $element.contents().last()[0].data
        $title = $title.substr(0, $title.length - 1)
        titleArray.push($title)
      })
    }
    
    // 直接抓取每个仓库页面
    c.queue(repoUrls)
    done();
  }
})

补充

关于`superagent`请求传参

如果抓取页面时，需要传参数的话，则利用 superagent 的 post 方法可以实现：

superagent
    .post(url)
    .send({ 
        // 参数
    .end(function(err, res){
        // ...
    })

关于并发请求

上面用到的async以及node-crawler可以并发去请求我们所需要的页面，需要设置并发量这个属性，一般根据实际情况来定，但是我们从返回的结果数组是无序的可以发现，并发确实提升了访问效率，因此并发是必须的哈哈：>

欲了解更多关于上面几个依赖库的请参考相关官方API： superagent cheerio eventproxy async node-crawler

学习参考链接： Node.js 实战心得的爬虫部分