creeperjlsy.js 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. /* eslint-disable strict */
  2. // const svgCaptcha = require('svg-captcha');
  3. // const cheerio = require('cheerio');
  4. // const puppeteer = require('puppeteer');
  5. const charset = require('superagent-charset');
  6. const superagent = charset(require('superagent'));
  7. const cheerio = require('cheerio');
  8. const fs = require('fs');
  9. const { CrudService } = require('naf-framework-mongoose/lib/service');
  /**
   * Crawler service for the Jilin Province personnel examination site
   * (www.jlzkb.com). It walks the paginated announcement list, fetches every
   * linked article and stores each one as a `news` document under a dedicated
   * column.
   */
  class CreeperjlsyService extends CrudService {
    /**
     * Entry point: makes sure the target column exists (creating it when it
     * does not) and then crawls the announcement list starting from page 1.
     *
     * @return {Promise<void>} resolves once the whole crawl has finished.
     */
    async creeper() {
      // Target link: first page of the announcement list.
      const targetUrl = 'http://www.jlzkb.com/cms/root/ksbmList.vm?dir=L-iAg-ivleaKpeWQjS_kuovkuJrljZXkvY3mi5vogZjogIPor5U&page=1&rows=10';
      const columnTitle = '吉林省事业单位招聘';
      // Look the column up; create it when it does not exist yet.
      const existing = await this.service.column.model.find({ news_type: '0', title: columnTitle });
      const column = existing.length === 0
        ? await this.service.column.model.create({ site: '99991', news_type: '0', title: columnTitle, type: '', parent_id: '', parent: '', is_use: '' })
        : existing[0];
      await this.creeperCreate(targetUrl, column);
    }

    /**
     * Crawls one list page after another, starting at `targetUrl`, and saves
     * every article found on each page.
     *
     * Request failures are logged and end the crawl instead of rejecting,
     * which keeps the original best-effort behaviour.
     *
     * @param {string} targetUrl - URL of the first list page to crawl.
     * @param {object} column - column document; `site`, `id` and `title` are read.
     * @return {Promise<void>} resolves when no further page is left to crawl.
     */
    async creeperCreate(targetUrl, column) {
      const hrefAdd = 'http://www.jlzkb.com/cms/root/';
      // Pages already crawled. The site's "next" link on the last page is
      // expected to point back at that same page, so a repeat means we are done.
      const visited = new Set();
      let pageUrl = targetUrl;

      while (!visited.has(pageUrl)) {
        visited.add(pageUrl);

        let res;
        try {
          // superagent requests are thenable, so they can be awaited directly.
          res = await superagent.get(pageUrl).charset('gbk').buffer(true);
        } catch (error) {
          // Request failed: log it and stop crawling.
          console.log(error);
          return;
        }

        // cheerio has to load the HTML before it can be queried.
        const $ = cheerio.load(res.text);

        // NOTE(review): toLocaleDateString() is locale dependent and may never
        // equal the 'YYYY-MM-DD' text cut from the page, in which case this
        // filter lets every row through. Kept as in the original; confirm the
        // intended date filter before tightening it.
        const nowDate = new Date().toLocaleDateString();

        // Collect title, link and publish date of every row of the list.
        const entries = [];
        $('#DivInfoList tr').each((index, element) => {
          const link = $(element).find('td a');
          const title = link.attr('title');
          const thisHref = link.attr('href');
          const time = $(element).find('td[width="12%"]').text();
          // Drops the first character of the cell text and keeps the next ten.
          const publishTime = time.substring(1, 11);
          // Rows without a link (e.g. header rows) carry nothing to fetch.
          if (publishTime !== nowDate && thisHref !== undefined) {
            entries.push({ title, publishTime, url: hrefAdd + thisHref });
          }
        });

        // Fetch and store all articles of this page before moving on.
        await Promise.all(entries.map(entry => this.creeperDetail(entry, column)));

        // Follow the "next page" link; without one there is nowhere to go.
        const href = $('#DivPageControl a').eq(2).attr('href');
        if (href === undefined) {
          return;
        }
        pageUrl = hrefAdd + href;
      }
    }

    /**
     * Internal helper: fetches a single article page, extracts its text
     * paragraphs and relative attachment links, and stores it as a news
     * document. Failures are logged, never thrown, so one bad article does
     * not abort the rest of the crawl.
     *
     * @param {{title: string, publishTime: string, url: string}} entry - list row data.
     * @param {object} column - column document; `site`, `id` and `title` are read.
     * @return {Promise<void>} resolves once the article was stored or skipped.
     */
    async creeperDetail(entry, column) {
      const uri = 'http://www.jlzkb.com';

      let res;
      try {
        res = await superagent.get(entry.url).charset('gbk').buffer(true);
      } catch (error) {
        console.log(error);
        return;
      }

      const $ = cheerio.load(res.text);

      // Copy the article paragraphs and strip every nested element from the
      // copy, leaving only the paragraphs' own text.
      const paragraphs = $('#fontzoom').children('p').clone();
      paragraphs.find(':nth-child(n)').remove();
      // Store the rendered HTML string rather than the cheerio selection object.
      const content = $.html(paragraphs);

      // Record site-relative attachment links; absolute links are skipped.
      const attachment = [];
      $('#fontzoom p a').each((index, element) => {
        const href = $(element).attr('href');
        if (href === undefined || href.substring(0, 4) === 'http') {
          return;
        }
        attachment.push({ name: $(element).text(), uri: uri + href });
      });

      try {
        await this.service.news.model.create({
          site: column.site,
          title: entry.title,
          pic: '',
          content,
          type: '',
          parent_id: column.id,
          parent: column.title,
          publish_time: entry.publishTime,
          attachment,
          is_use: '0',
        });
      } catch (error) {
        console.log(error);
      }
    }

    /**
     * Creates a directory unless it already exists. The outcome is logged;
     * a failure is reported through the log only and never rejects.
     *
     * @param {string} _path - directory to create.
     * @return {Promise<void>} resolves after the creation attempt has completed.
     */
    async mkdir(_path) {
      if (fs.existsSync(_path)) {
        console.log(`${_path}目录已存在`);
        return;
      }
      await new Promise(resolve => {
        fs.mkdir(_path, error => {
          if (error) {
            console.log(`创建${_path}目录失败`);
          } else {
            console.log(`创建${_path}目录成功`);
          }
          resolve();
        });
      });
    }

    /**
     * Downloads an attachment into `./attachment/` and returns where it was
     * saved, so the path can be stored and offered for download later.
     *
     * @param {string} thisHref - absolute URL of the attachment.
     * @param {string} fileName - name to store the file under; only its last
     *   path segment is used so remote text cannot escape the target folder.
     * @return {Promise<string>} path of the written file.
     * @throws {Error} when the file name is unusable, or when the request or
     *   the file write fails.
     */
    async downloadAttachment(thisHref, fileName) {
      // The name comes from remote link text: drop any directory components.
      const safeName = String(fileName).split(/[\\/]/).pop();
      if (safeName === '' || safeName === '.' || safeName === '..') {
        throw new Error(`invalid attachment file name: ${fileName}`);
      }
      const filePath = `./attachment/${safeName}`;
      console.log(thisHref);
      console.log(fileName);

      await new Promise((resolve, reject) => {
        const stream = fs.createWriteStream(filePath);
        const req = superagent.get(thisHref); // response stream
        stream.on('finish', resolve);
        stream.on('error', reject);
        req.on('error', error => {
          // Release the file handle when the download itself fails.
          stream.destroy();
          reject(error);
        });
        req.pipe(stream);
      });
      return filePath;
    }
  }

  module.exports = CreeperjlsyService;