// smart-center / serve-cms
							/* eslint-disable strict */
// const svgCaptcha = require('svg-captcha');
// const cheerio = require('cheerio');
// const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
const charset = require('superagent-charset');
const superagent = charset(require('superagent'));
const cheerio = require('cheerio');
const { CrudService } = require('naf-framework-mongoose/lib/service');
/**
 * Crawler service for the recruitment listing on www.jlzkb.com.
 *
 * Walks the paginated list, fetches every linked article and stores it as a
 * news record under the "吉林省事业单位招聘" column.
 */
class CreeperjlsyService extends CrudService {

  /**
   * Entry point: makes sure the target column exists, then crawls the listing
   * starting from its first page.
   *
   * The returned promise settles only after the whole crawl has finished.
   *
   * @return {Promise<void>}
   */
  async creeper() {
    // Target link: first page of the listing.
    const targetUrl = 'http://www.jlzkb.com/cms/root/ksbmList.vm?dir=L-iAg-ivleaKpeWQjS_kuovkuJrljZXkvY3mi5vogZjogIPor5U&page=1&rows=10';
    const columnTitle = '吉林省事业单位招聘';
    const columnModel = this.service.column.model;
    // Look the column up and create it when it does not exist yet.
    let column = await columnModel.findOne({ news_type: '0', title: columnTitle });
    if (!column) {
      column = await columnModel.create({ site: '99991', news_type: '0', title: columnTitle, type: '', parent_id: '', parent: '', is_use: '' });
    }
    await this.creeperCreate(targetUrl, column);
  }

  /**
   * Crawls the list page at `targetUrl` and every following page reachable
   * through the pager, saving each listed article as a news record.
   *
   * Crawling is best-effort: a failed list-page request is logged and stops
   * the crawl; a failed article is logged and skipped.
   *
   * @param {string} targetUrl - URL of the list page to start from.
   * @param {object} column - Column document; `site`, `id` and `title` are read.
   * @return {Promise<void>} Settles once no further page is left to crawl.
   */
  async creeperCreate(targetUrl, column) {
    const hrefAdd = 'http://www.jlzkb.com/cms/root/';
    // Pages already requested; protects against a pager that loops.
    const visited = new Set();
    let pageUrl = targetUrl;

    while (!visited.has(pageUrl)) {
      visited.add(pageUrl);

      let $;
      try {
        // cheerio needs the HTML loaded before it can be queried.
        $ = cheerio.load(await this.fetchGbkHtml(pageUrl));
      } catch (error) {
        // Request failed: log it and stop crawling.
        console.log(error);
        return;
      }

      // Articles of one page are independent, so fetch them concurrently.
      const entries = this.parseListEntries($, hrefAdd);
      await Promise.all(entries.map(entry => this.saveArticle(entry, column)));

      // Follow the "next page" link (third anchor of the pager).
      const href = $('#DivPageControl a').eq(2).attr('href');
      if (href === undefined) {
        // No pager link: nothing left to crawl.
        return;
      }
      const hrefNew = hrefAdd + href;
      console.log(hrefNew);
      console.log(pageUrl);
      // When the next link equals an already visited page (e.g. the current
      // one on the last page) the loop condition ends the crawl.
      pageUrl = hrefNew;
    }
  }

  /**
   * Requests `url`, decoding the response body as GBK.
   *
   * @param {string} url - Address to fetch.
   * @return {Promise<string>} The decoded response text; rejects with the
   *   request error.
   */
  fetchGbkHtml(url) {
    return new Promise((resolve, reject) => {
      superagent.get(url).charset('gbk').buffer(true)
        .end((error, res) => {
          if (error) {
            reject(error);
            return;
          }
          resolve(res.text);
        });
    });
  }

  /**
   * Extracts title, article URL and publish date from every row of a list page.
   *
   * @param {Function} $ - cheerio instance loaded with the list page.
   * @param {string} hrefAdd - Prefix prepended to each row's relative link.
   * @return {Array<{title: string, url: string, publishTime: string}>}
   */
  parseListEntries($, hrefAdd) {
    // NOTE(review): toLocaleDateString() output is locale-dependent and
    // presumably never matches the 10-character date cut from the page, so
    // this filter likely lets every row through - confirm the intended rule.
    const nowDate = new Date().toLocaleDateString();
    const entries = [];
    $('#DivInfoList tr').each((index, element) => {
      const link = $(element).find('td a');
      const thisHref = link.attr('href');
      // Rows without a link carry no article.
      if (thisHref === undefined) {
        return;
      }
      const publishTime = $(element).find('td[width="12%"]').text().substring(1, 11);
      if (publishTime === nowDate) {
        return;
      }
      entries.push({ title: link.attr('title'), url: hrefAdd + thisHref, publishTime });
    });
    return entries;
  }

  /**
   * Fetches one article page and stores it as a news record.
   *
   * Errors (request or database) are logged and swallowed so that one broken
   * article does not abort the rest of the crawl.
   *
   * @param {{title: string, url: string, publishTime: string}} entry - Row data
   *   taken from the list page.
   * @param {object} column - Column document the news record belongs to.
   * @return {Promise<void>}
   */
  async saveArticle(entry, column) {
    const uri = 'http://www.jlzkb.com';
    try {
      const $ = cheerio.load(await this.fetchGbkHtml(entry.url));
      // Work on a copy of the paragraphs and strip their child elements.
      // NOTE(review): `content` is a cheerio object, not a string; this relies
      // on the news model converting it when saving - confirm.
      const content = $('#fontzoom').children('p').clone();
      content.find(':nth-child(n)').remove();

      // Collect attachment links; only links not starting with "http" are
      // treated as attachments hosted on the site.
      const attachment = [];
      $('#fontzoom p a').each((index, element) => {
        const href = $(element).attr('href');
        // Anchors without href cannot point to an attachment.
        if (href === undefined || href.substring(0, 4) === 'http') {
          return;
        }
        attachment.push({ name: $(element).text(), uri: uri + href });
      });

      await this.service.news.model.create({
        site: column.site,
        title: entry.title,
        pic: '',
        content,
        type: '',
        parent_id: column.id,
        parent: column.title,
        publish_time: entry.publishTime,
        attachment,
        is_use: '0',
      });
    } catch (error) {
      console.log(error);
    }
  }

  /**
   * Creates the directory `_path` unless it already exists.
   *
   * The outcome is only logged; the returned promise never rejects and
   * settles after the directory has been created or the attempt has failed.
   *
   * @param {string} _path - Directory to create.
   * @return {Promise<void>}
   */
  async mkdir(_path) {
    await new Promise(resolve => {
      fs.mkdir(_path, error => {
        if (!error) {
          console.log(`创建${_path}目录成功`);
        } else if (error.code === 'EEXIST') {
          // Checked on the mkdir result itself to avoid an exists/mkdir race.
          console.log(`${_path}目录已存在`);
        } else {
          console.log(`创建${_path}目录失败`);
        }
        resolve();
      });
    });
  }

  /**
   * Downloads the attachment at `thisHref` into `./attachment/`.
   *
   * Only the base name of `fileName` is used, because the name comes from the
   * crawled page and must not be able to escape the attachment directory.
   *
   * @param {string} thisHref - URL of the attachment.
   * @param {string} fileName - Name to store the file under.
   * @return {Promise<string>} Path of the written file; rejects when the
   *   download or the write fails.
   */
  async downloadAttachment(thisHref, fileName) {
    const filePath = './attachment/' + path.basename(String(fileName));
    console.log(thisHref);
    console.log(fileName);
    await new Promise((resolve, reject) => {
      const stream = fs.createWriteStream(filePath);
      stream.on('finish', resolve);
      stream.on('error', reject);
      const req = superagent.get(thisHref); // response stream
      req.on('error', error => {
        // Release the file handle when the download itself fails.
        stream.destroy();
        reject(error);
      });
      req.pipe(stream);
    });
    return filePath;
  }

}
module.exports = CreeperjlsyService;