123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169 |
- /* eslint-disable strict */
- const Service = require('egg').Service;
- const svgCaptcha = require('svg-captcha');
- // const cheerio = require('cheerio');
- // const puppeteer = require('puppeteer');
- const charset = require('superagent-charset');
- const superagent = charset(require('superagent'));
- const cheerio = require('cheerio');
- const fs = require('fs');
- const { CrudService } = require('naf-framework-mongoose/lib/service');
/**
 * Miscellaneous helpers exposed as a service: captcha generation, SMS
 * verification codes, and a crawler that imports notices from
 * www.jlzkb.com into the news collection.
 */
class ToolsService extends CrudService {
  /**
   * Generate an SVG captcha.
   *
   * @return {Promise<Object>} The value returned by `svgCaptcha.create`.
   */
  async captcha() {
    return svgCaptcha.create({
      size: 4, // number of characters in the captcha
      fontSize: 50,
      width: 100,
      height: 40,
      // The key used to be misspelled ("bacground"), so the colour was never applied.
      background: '#cc9966',
    });
  }

  /**
   * Send an SMS verification code through the configured gateway.
   *
   * @param {Object} ctx - Request context; reads `ctx.query.mobile` and `ctx.app.config.messageDir`.
   * @param {Object} app - Application object; its `curl` method performs the HTTP call.
   * @param {string} randomstr - Verification code to embed in the message.
   * @return {Promise<Object>} Whatever `app.curl` resolves with.
   */
  async sendmessage(ctx, app, randomstr) {
    const message = `【吉林省就业中心】您的验证码为:${randomstr},请在30分钟内完成输入,欢迎使用吉林省智慧就业企业服务平台。`;
    // FIXME: the gateway account and password are hard-coded here; they should live in configuration.
    // Both dynamic values are percent-encoded: the message is non-ASCII and the
    // phone number comes straight from the request, so an unencoded value could
    // break the URL or smuggle extra query parameters.
    const query = '?Id=300&Name=wwqcgh&Psw=jljyzx-wwqcgh' +
      `&Message=${encodeURIComponent(message)}` +
      `&Phone=${encodeURIComponent(ctx.query.mobile)}` +
      '&Timestamp=0';
    // NOTE(review): 'text/xml' may not be a dataType the HTTP client recognises — confirm; kept unchanged.
    return app.curl(ctx.app.config.messageDir + query, {
      method: 'GET',
      dataType: 'text/xml',
    });
  }

  /**
   * Crawl the first listing page of the target site into the
   * "吉林省事业单位招聘" column, creating that column when it does not exist.
   *
   * @return {Promise<void>} Resolves once the listing page has been processed.
   */
  async creeper() {
    // First page of the listing to crawl.
    const targetUrl = 'http://www.jlzkb.com/cms/root/ksbmList.vm?dir=L-iAg-ivleaKpeWQjS_kuovkuJrljZXkvY3mi5vogZjogIPor5U&page=1&rows=10';
    const columnTitle = '吉林省事业单位招聘';
    const columnModel = this.service.column.model;

    // Reuse the column if it already exists, otherwise create it.
    const matches = await columnModel.find({ news_type: '0', title: columnTitle });
    const column = matches.length > 0
      ? matches[0]
      : await columnModel.create({ site: '99991', news_type: '0', title: columnTitle, type: '', parent_id: '', parent: '', is_use: '' });

    await this.creeperCreate(targetUrl, column);
  }

  /**
   * Fetch one listing page and store every linked article as a news record.
   *
   * The crawl is best-effort: request, parsing and persistence failures are
   * logged and skipped rather than thrown. Unlike the previous callback-based
   * version, the returned promise settles only after every article on the
   * page has been handled.
   *
   * @param {string} targetUrl - URL of the listing page.
   * @param {Object} column - Column record; `site`, `id` and `title` are copied onto each news record.
   * @return {Promise<void>}
   */
  async creeperCreate(targetUrl, column) {
    const hrefAdd = 'http://www.jlzkb.com/cms/root/';
    const uri = 'http://www.jlzkb.com';

    let listResponse;
    try {
      listResponse = await superagent.get(targetUrl).charset('utf-8').buffer(true);
    } catch (error) {
      console.log(error);
      return;
    }

    const $ = cheerio.load(listResponse.text);
    // NOTE(review): toLocaleDateString() is locale-dependent and is unlikely to
    // match the 10-character date sliced from the page, so this filter
    // probably lets every row through — confirm the intended condition.
    const nowDate = new Date().toLocaleDateString();

    // Collect title, detail link and publish date of every listed article.
    const articles = [];
    $('#DivInfoList tr').each((index, element) => {
      const link = $(element).find('td a');
      const title = link.attr('title');
      const href = link.attr('href');
      const publishTime = $(element).find('td[width="12%"]').text().substring(1, 11);
      // Rows without a link (e.g. header rows) have nothing to fetch.
      if (publishTime !== nowDate && href !== undefined) {
        articles.push({ title, publishTime, url: hrefAdd + href });
      }
    });

    // Pagination is intentionally not followed yet; the next-page link is
    // only logged next to the current URL for inspection.
    const nextPageUrl = hrefAdd + $('#DivPageControl a').eq(2).attr('href');
    console.log(nextPageUrl);
    console.log(targetUrl);

    // Detail pages are independent of each other, so fetch them concurrently.
    await Promise.all(articles.map(article => this._creeperImportNews(article, column, uri)));
  }

  /**
   * Fetch one article page and persist it as a news record. Never rejects:
   * any failure is logged so that one bad article does not abort the batch.
   *
   * @param {Object} article - `{ title, publishTime, url }` taken from the listing row.
   * @param {Object} column - Column the news record belongs to.
   * @param {string} siteRoot - Prefix used to make relative attachment links absolute.
   * @return {Promise<void>}
   */
  async _creeperImportNews(article, column, siteRoot) {
    try {
      const response = await superagent.get(article.url).charset('utf-8').buffer(true);
      const $ = cheerio.load(response.text);

      // Keep the paragraphs of the article body with every nested element stripped.
      // NOTE(review): the cheerio selection itself is handed to the model, as
      // before; confirm the schema stores it in the intended form.
      const content = $('#fontzoom').children('p').clone();
      content.find(':nth-child(n)').remove();

      // Record site-relative links found in the body as attachments
      // (referenced by URL, not downloaded).
      const attachment = [];
      $('#fontzoom p a').each((index, element) => {
        const href = $(element).attr('href');
        // Anchors without an href used to throw here; absolute links are skipped as before.
        if (href === undefined || href.substring(0, 4) === 'http') {
          return;
        }
        attachment.push({ name: $(element).text(), uri: siteRoot + href });
      });

      await this.service.news.model.create({
        site: column.site,
        title: article.title,
        pic: '',
        content,
        type: '',
        parent_id: column.id,
        parent: column.title,
        publish_time: article.publishTime,
        attachment,
        is_use: '0',
      });
    } catch (error) {
      console.log(error);
    }
  }

  /**
   * Create a directory unless it already exists. The outcome is logged;
   * a creation failure is logged but not thrown.
   *
   * @param {string} _path - Directory to create.
   * @return {Promise<void>} Resolves after the creation attempt has finished.
   */
  async mkdir(_path) {
    if (fs.existsSync(_path)) {
      console.log(`${_path}目录已存在`);
      return;
    }
    // Wrap the callback API so that awaiting this method really waits for the directory.
    await new Promise(resolve => {
      fs.mkdir(_path, error => {
        if (error) {
          console.log(`创建${_path}目录失败`);
        } else {
          console.log(`创建${_path}目录成功`);
        }
        resolve();
      });
    });
  }

  /**
   * Download a crawled attachment into `./attachment/`.
   *
   * TODO: return the stored path so the front end can offer the file for
   * download; the placeholder string below is kept for now.
   *
   * @param {string} thisHref - URL of the attachment.
   * @param {string} fileName - Name to store the file under.
   * @return {Promise<string>} Placeholder string, resolved once the file is fully written.
   *   Rejects if the request or the write stream fails.
   */
  async downloadAttachment(thisHref, fileName) {
    console.log(thisHref);
    console.log(fileName);
    // The name comes from scraped link text: strip path separators so it
    // cannot point outside the attachment directory.
    const safeName = String(fileName).replace(/[\\/]/g, '_');
    await new Promise((resolve, reject) => {
      const stream = fs.createWriteStream('./attachment/' + safeName);
      const req = superagent.get(thisHref);
      stream.on('finish', resolve);
      stream.on('error', reject);
      req.on('error', reject);
      req.pipe(stream);
    });
    return '这里返回路径保存即可';
  }
}
- module.exports = ToolsService;
|