// creeperjlsy.js (6.6 KB)
  1. /* eslint-disable strict */
  2. const svgCaptcha = require('svg-captcha');
  3. // const cheerio = require('cheerio');
  4. // const puppeteer = require('puppeteer');
  5. const charset = require('superagent-charset');
  6. const superagent = charset(require('superagent'));
  7. const cheerio = require('cheerio');
  8. const fs = require('fs');
  9. const { CrudService } = require('naf-framework-mongoose/lib/service');
  // Base URL that relative links found on the listing pages are appended to.
  const LIST_BASE_URL = 'http://www.jlzkb.com/cms/root/';
  // Site origin prepended to site-relative attachment links.
  const SITE_ORIGIN = 'http://www.jlzkb.com';

  /**
   * Download a page and return its body text.
   *
   * @param {string} url - Absolute URL to request.
   * @return {Promise<string>} Response text; rejects when the request fails.
   */
  function fetchHtml(url) {
    return superagent.get(url).charset('utf-8').buffer(true)
      .then(res => res.text);
  }

  /**
   * Extract title, link and publish date from every list row that has a link.
   *
   * @param {Function} $ - cheerio instance loaded with a listing page.
   * @return {Array<{title: string, href: string, publishTime: string}>} One entry per linked row.
   */
  function parseListRows($) {
    const rows = [];
    $('#DivInfoList tr').each((index, element) => {
      const link = $(element).find('td a');
      const href = link.attr('href');
      // Rows without a link have nothing to follow, so they are skipped.
      if (href === undefined) {
        return;
      }
      const time = $(element).find('td[width="12%"]').text();
      rows.push({
        title: link.attr('title'),
        href,
        // Characters 1..10 of the date cell (the first character is dropped).
        publishTime: time.substring(1, 11),
      });
    });
    return rows;
  }

  /**
   * Collect the attachment links of an article page.
   * Only links whose href does not start with "http" are recorded.
   *
   * @param {Function} $ - cheerio instance loaded with an article page.
   * @return {Array<{name: string, uri: string}>} Attachment descriptors.
   */
  function collectAttachments($) {
    const attachment = [];
    $('#fontzoom p a').each((index, element) => {
      const href = $(element).attr('href');
      // An anchor without href used to throw a TypeError here; skip it instead.
      if (typeof href !== 'string' || href.startsWith('http')) {
        return;
      }
      attachment.push({
        name: $(element).text(),
        uri: SITE_ORIGIN + href,
      });
    });
    return attachment;
  }

  /**
   * Fetch one article and store it through the news model.
   * Failures are logged and swallowed so one bad article does not stop the others.
   *
   * @param {Object} newsModel - Model exposing `create(doc)`.
   * @param {{title: string, href: string, publishTime: string}} row - Parsed list row.
   * @param {Object} column - Column document the article belongs to.
   * @return {Promise<void>} Resolves once the article has been stored or the failure logged.
   */
  async function storeArticle(newsModel, row, column) {
    try {
      const $ = cheerio.load(await fetchHtml(LIST_BASE_URL + row.href));
      const content = $('#fontzoom').children('p').clone();
      // NOTE(review): this removes every nested element from the cloned
      // paragraphs, so only their direct text remains; confirm that is intended.
      content.find(':nth-child(n)').remove();
      await newsModel.create({
        site: column.site,
        title: row.title,
        pic: '',
        // Serialise explicitly instead of handing a cheerio object to the model.
        // Presumably the schema field is a string, in which case the stored value
        // is unchanged; verify against the news schema.
        content: content.toString(),
        type: '',
        parent_id: column.id,
        parent: column.title,
        publish_time: row.publishTime,
        attachment: collectAttachments($),
        is_use: '0',
      });
    } catch (error) {
      console.log(error);
    }
  }

  /**
   * Service providing captcha generation, SMS verification codes and a scraper
   * that copies announcements from www.jlzkb.com into the news collection.
   */
  class CreeperjlsyService extends CrudService {
    /**
     * Generate an SVG captcha.
     *
     * @return {Promise<Object>} The object returned by `svgCaptcha.create`.
     */
    async captcha() {
      const captcha = svgCaptcha.create({
        size: 4, // number of characters
        fontSize: 50, // font size
        width: 100, // width
        height: 40, // height
        // Was misspelled `bacground`, so the colour was never applied.
        background: '#cc9966', // background colour
      });
      return captcha;
    }

    /**
     * Send an SMS verification code through the configured message gateway.
     *
     * @param {Object} ctx - Request context; `ctx.query.mobile` is the recipient.
     * @param {Object} app - Application object providing `curl`.
     * @param {string} randomstr - Verification code to embed in the message.
     * @return {Promise<Object>} The result of `app.curl`.
     */
    async sendmessage(ctx, app, randomstr) {
      const message = '【吉林省就业中心】您的验证码为:' + randomstr + ',请在30分钟内完成输入,欢迎使用吉林省智慧就业企业服务平台。';
      // NOTE(review): gateway credentials are hard-coded here; consider moving
      // them into configuration.
      // The message and the user-supplied phone number are URL-encoded so they
      // cannot break or extend the query string.
      const data = '?Id=300&Name=wwqcgh&Psw=jljyzx-wwqcgh&Message=' + encodeURIComponent(message) +
        '&Phone=' + encodeURIComponent(ctx.query.mobile) + '&Timestamp=0';
      const url = ctx.app.config.messageDir + data;
      const result = await app.curl(url, {
        method: 'GET',
        // NOTE(review): 'text/xml' may not be a dataType the HTTP client
        // recognises; left unchanged because callers may rely on the current
        // shape of the result.
        dataType: 'text/xml',
      });
      return result;
    }

    /**
     * Entry point of the scraper: make sure the target column exists, then
     * scrape the first listing page into it.
     *
     * @return {Promise<void>} Resolves when the page has been processed.
     */
    async creeper() {
      // First page of the listing to scrape.
      const targetUrl = 'http://www.jlzkb.com/cms/root/ksbmList.vm?dir=L-iAg-ivleaKpeWQjS_kuovkuJrljZXkvY3mi5vogZjogIPor5U&page=1&rows=10';
      const columnTitle = '吉林省事业单位招聘';
      // Reuse the column when it already exists, otherwise create it.
      const [ existing ] = await this.service.column.model.find({ news_type: '0', title: columnTitle });
      const column = existing ||
        await this.service.column.model.create({ site: '99991', news_type: '0', title: columnTitle, type: '', parent_id: '', parent: '', is_use: '' });
      await this.creeperCreate(targetUrl, column);
    }

    /**
     * Scrape one listing page and store every linked article in the news model.
     * Request and storage errors are logged, never thrown.
     *
     * @param {string} targetUrl - URL of the listing page.
     * @param {Object} column - Column document the articles are filed under.
     * @return {Promise<void>} Resolves after all articles of the page were handled.
     */
    async creeperCreate(targetUrl, column) {
      let listHtml;
      try {
        listHtml = await fetchHtml(targetUrl);
      } catch (error) {
        // Request failed: log it and give up on this page.
        console.log(error);
        return;
      }
      const $ = cheerio.load(listHtml);

      // NOTE(review): toLocaleDateString() is locale dependent and may never
      // equal the scraped date text, in which case every row passes this filter;
      // the comparison is kept exactly as it was.
      const nowDate = new Date().toLocaleDateString();
      const rows = parseListRows($).filter(row => row.publishTime !== nowDate);

      // Link of the "next page" control. Following it is intentionally disabled,
      // so only the two URLs are logged.
      const href = $('#DivPageControl a').eq(2).attr('href');
      const hrefNew = LIST_BASE_URL + href;
      console.log(hrefNew);
      console.log(targetUrl);

      // Articles are fetched in parallel; each call handles its own errors.
      const newsModel = this.service.news.model;
      await Promise.all(rows.map(row => storeArticle(newsModel, row, column)));
    }

    /**
     * Create a directory unless it already exists. The outcome is only logged.
     *
     * @param {string} _path - Directory to create.
     * @return {Promise<void>} Resolves once the attempt has finished.
     */
    async mkdir(_path) {
      if (fs.existsSync(_path)) {
        console.log(`${_path}目录已存在`);
        return;
      }
      // Wrap the callback API so awaiting this method really waits for mkdir.
      await new Promise(resolve => {
        fs.mkdir(_path, error => {
          if (error) {
            console.log(`创建${_path}目录失败`);
          } else {
            console.log(`创建${_path}目录成功`);
          }
          resolve();
        });
      });
    }

    /**
     * Download a scraped attachment into ./attachment.
     *
     * TODO: return a path the front end can download from; the placeholder
     * string is kept until the storage location is decided.
     *
     * @param {string} thisHref - URL of the attachment.
     * @param {string} fileName - Name to store the file under.
     * @return {Promise<string>} Placeholder text (not yet a real path).
     */
    async downloadAttachment(thisHref, fileName) {
      const path = require('path');
      // fileName comes from scraped page text: keep only its last path segment
      // so it cannot point outside the attachment directory.
      const target = './attachment/' + path.basename(fileName);
      console.log(thisHref);
      console.log(fileName);
      await new Promise((resolve, reject) => {
        const stream = fs.createWriteStream(target);
        stream.on('finish', resolve);
        stream.on('error', reject);
        const req = superagent.get(thisHref);
        req.on('error', error => {
          stream.destroy();
          reject(error);
        });
        req.pipe(stream);
      });
      return '这里返回路径保存即可';
    }
  }

  module.exports = CreeperjlsyService;