瀏覽代碼

爬虫增加教师

nihao 5 年之前
父節點
當前提交
e4c8828577

+ 2 - 2
app/controller/.column.js

@@ -4,14 +4,14 @@ module.exports = {
     "parameters": {
       "query": ["!site"],
     },
-    "requestBody": ["title", "type", "parent_id", "news_type","parent", "is_use"],
+    "requestBody": ["title", "type", "parent_id", "news_type", "parent", "is_use", "url", "content_id","parent_type"],
   },
   // 修改栏目
   "update": {
     "parameters": {
       "params": ["!id"],
     },
-    "requestBody": ["title", "type", "parent_id", "news_type","parent", "is_use"],
+    "requestBody": ["title", "type", "parent_id", "news_type", "parent", "is_use", "url", "content_id", "parent_type"],
     // "options": {
     //   "projection": "+name",
     // },

+ 2 - 2
app/controller/.menu.js

@@ -4,14 +4,14 @@ module.exports = {
     "parameters": {
       "query": ["!site"],
     },
-    "requestBody": ["title", "type", "is_use"],
+    "requestBody": ["title", "type", "is_use", "url","content_id"],
   },
   // 修改菜单
   "update": {
     "parameters": {
       "params": ["!id"],
     },
-    "requestBody": ["title", "type","is_use"],
+    "requestBody": ["title", "type", "is_use", "url", "content_id"],
     // "options": {
     //   "projection": "+name",
     // },

+ 3 - 13
app/controller/verify.js

@@ -54,19 +54,9 @@ class VerifyController extends Controller {
     }
   }
   async creeper() {
-    this.service.creeperjlsy.creeper();
-    // const { ctx, app } = this;
-    // // Target link: first page of the Jilin personnel examination site
-    // const targetUrl = 'http://www.jlzkb.com/cms/root/ksbmList.vm?dir=L-iAg-ivleaKpeWQjS_kuovkuJrljZXkvY3mi5vogZjogIPor5U&page=1&rows=8';
-    // const columnTitle = '吉林省事业单位招聘';
-    // // Check whether this column exists; create it if it does not
-    // let column = await this.service.column.model.find({ news_type: '0', title: columnTitle });
-    // if (column.length === 0) {
-    //   column = await this.service.column.model.create({ site: '99991', news_type: '0', title: columnTitle, type: '', parent_id: '', parent: '', is_use: '' });
-    // }
-    // await this.service.tool.creeper(targetUrl, column);
-    // // console.log(column.length);
-    // ctx.body = { errcode: 1, errmsg: 'error', data: column };
+    // Starts the crawl through the creeperjljs service. NOTE(review): the call is not awaited and no response body is set here.
+    // this.service.creeperjlsy.creeper();// Jilin public-institution recruitment
+    this.service.creeperjljs.creeper();// Jilin teacher recruitment
   }
 }
 

+ 3 - 0
app/model/column.js

@@ -30,6 +30,9 @@ const SchemaDefine = {
   parent: { type: String, required: false, maxLength: 100 }, // 所属名称
   is_use: { type: String, required: false, maxLength: 5 }, // 是否使用,0=>使用中;1=>已禁止
   news_type: { type: String, default: 1, maxLength: 5 }, // 0抓取,1正常输入
+  url: String, // 外部链接
+  content_id: { type: String, required: false, maxLength: 64 }, // 信息Id
+  parent_type: { type: String, required: false, maxLength: 5 }, // 父亲类型
   meta: {
     createdBy: String, // 创建用户
     updatedBy: String, // 修改用户

+ 2 - 0
app/model/menu.js

@@ -17,6 +17,8 @@ const SchemaDefine = {
   title: { type: String, required: false, maxLength: 100 }, // 菜单名称
   type: { type: String, required: false, maxLength: 5 }, // :类型,'content'=>信息内容类型/'column'=>'栏目',
   is_use: { type: String, required: false, maxLength: 5 }, // 是否使用,0=>使用中;1=>已禁止
+  url: String, // 外部链接
+  content_id: { type: String, required: false, maxLength: 64 }, // 信息Id
   meta: {
     createdBy: String, // 创建用户
     updatedBy: String, // 修改用户

+ 9 - 3
app/service/column.js

@@ -12,7 +12,7 @@ class ColumnService extends CrudService {
     this.model = this.ctx.model.Column;
   }
 
-  async create({ site }, { title, type, parent_id, parent, is_use, news_type }) {
+  async create({ site }, { title, type, parent_id, parent, is_use, news_type, url, content_id, parent_type}) {
     // 检查数据
     assert(_.isString(site), 'site不能为空');
     assert(!title || _.isString(title), 'title必须为字符串');
@@ -21,13 +21,16 @@ class ColumnService extends CrudService {
     assert(!parent || _.isString(parent), 'parent必须为字符串');
     assert(!is_use || _.isString(is_use), 'is_use必须为字符串');
     assert(!news_type || _.isString(news_type), 'news_type必须为字符串');
+    assert(!url || _.isString(url), 'url必须为字符串');
+    assert(!content_id || _.isString(content_id), 'content_id必须为字符串');
+    assert(!parent_type || _.isString(parent_type), 'parent_type必须为字符串');
     // TODO: 检查用户信息
     const userid = this.ctx.userid;
     if (!_.isString(userid)) throw new BusinessError(ErrorCode.NOT_LOGIN);
 
     // TODO:保存数据
     const data = {
-      site, title, type, parent_id, parent, is_use, news_type,
+      site, title, type, parent_id, parent, is_use, news_type, url, content_id, parent_type,
       meta: { createdBy: userid },
     };
     
@@ -37,7 +40,7 @@ class ColumnService extends CrudService {
 
   async update({ id }, payload) {
     // 检查数据
-    const { title, type, parent_id, parent, is_use, news_type } = payload;
+    const { title, type, parent_id, parent, is_use, news_type, url, content_id, parent_type} = payload;
     assert(id, 'id不能为空');
     assert(!title || _.isString(title), 'title必须为字符串');
     assert(!type || _.isString(type), 'type必须为字符串');
@@ -45,6 +48,9 @@ class ColumnService extends CrudService {
     assert(!parent || _.isString(parent), 'parent必须为字符串');
     assert(!is_use || _.isString(is_use), 'is_use必须为字符串');
     assert(!news_type || _.isString(news_type), 'news_type必须为字符串');
+    assert(!url || _.isString(url), 'url必须为字符串');
+    assert(!content_id || _.isString(content_id), 'content_id必须为字符串');
+    assert(!parent_type || _.isString(parent_type), 'parent_type必须为字符串');
     // TODO: 检查用户信息
     const userid = this.ctx.userid;
     if (!_.isString(userid)) throw new BusinessError(ErrorCode.NOT_LOGIN);

+ 135 - 0
app/service/creeperjljs.js

@@ -0,0 +1,135 @@
+/* eslint-disable strict */
+// const svgCaptcha = require('svg-captcha');
+// const cheerio = require('cheerio');
+// const puppeteer = require('puppeteer');
+const charset = require('superagent-charset');
+const superagent = charset(require('superagent'));
+const cheerio = require('cheerio');
+const fs = require('fs');
+const { CrudService } = require('naf-framework-mongoose/lib/service');
+class CreeperjlsyService extends CrudService { // NOTE(review): same class name as the creeperjlsy service, but this file is creeperjljs.js - confirm intended
+  // Entry point: finds or creates the target column record, then starts the crawl of the listing page.
+  async creeper() {
+    // Target link: first page of the Jilin teacher-recruitment listing
+    const targetUrl = 'http://www.zgsydw.com/jilin/jiaoshi/index.html';
+    const columnTitle = '吉林省教师招聘';
+    // Look up the column by title; create it if it does not exist yet
+    let column = await this.service.column.model.find({ news_type: '0', title: columnTitle });
+    if (column.length === 0) {
+      column = await this.service.column.model.create({ site: '99991', news_type: '0', title: columnTitle, type: '', parent_id: '', parent: '', is_use: '' });
+    } else {
+      column = column[0];
+    }
+    await this.creeperCreate(targetUrl, column);
+  }
+  // Requests the listing page at targetUrl (decoded as gbk), then requests each linked detail page and
+  // collects its text plus the non-http attachment links. NOTE(review): nothing is persisted; the news create call below is commented out.
+  // Original note: takes a path and returns text. NOTE(review): the .end() callbacks are not awaited and no value is returned.
+  async creeperCreate(targetUrl, column) {
+    // Target link: first page of the Jilin personnel examination site (NOTE(review): looks carried over from the jlsy crawler; this crawler targets zgsydw.com)
+    // Temporarily holds the parsed content and image address data
+    const hrefOld = targetUrl;
+    const hrefAdd = 'http://www.zgsydw.com';
+    const uri = '';
+    // const imgs = [];
+    // Create the attachment folder (for now, just link to the source site for download)
+    // this.mkdir('./attachment');
+
+    // Send the request
+    superagent.get(targetUrl).charset('gbk').buffer(true)
+      .end((error, res) => {
+        if (error) { // Request failed: log the error and return
+          console.log(error);
+          return;
+        }
+        // cheerio needs the HTML loaded first
+        const $ = cheerio.load(res.text);
+        // Loop over the list: read the title, link href and date, then request the linked page for its content
+        $('.ggxx_nr ul li').each((index, element) => {
+        // These values are meant to be stored in news
+          const title = $(element).find('a').attr('title');
+          const thisHref = $(element).find('a').attr('href');
+          const time = $(element).find('span').text();
+          // The date can be compared with today here (usable as a filter condition if needed)
+          const nowDate = new Date().toLocaleDateString();
+          if (time !== nowDate) { // NOTE(review): proceeds when the date text differs from today - confirm !== is intended
+            // When the href is undefined there is nothing further to do
+            if (thisHref !== undefined) {
+              // Request the content page
+              superagent.get(thisHref).charset('gbk').buffer(true)
+                .end((error, res) => {
+                  if (error) { // Request failed: log the error and return
+                    console.log(error);
+                    return;
+                  }
+                  const $ = cheerio.load(res.text);
+                  // Get the content to save
+                  // const content = $(element).children('table').clone();
+                  const content = $('.con_l_list').text();
+                  // content.find(':nth-child(n)').remove();
+                  // console.log(content.text());
+                  // The content may be split across pages
+                  const href = $('.lb_page a').attr('href');
+                  if (href !== undefined) {
+                    this.nextPage(href);
+                  }
+
+                  const attachment = [];
+                  // Check whether the content has more pages
+
+
+                  // On every page, look for attachments (currently only the link text and uri are recorded; the download call is commented out)
+                  $('p a').each((index, element) => {
+                    const thisHref = $(element).attr('href');
+                    if (thisHref.substring(0, 4) !== 'http') { // NOTE(review): would throw if an anchor has no href and attr() yields undefined
+                      const url = uri + thisHref;
+                      const fileName = $(element).text();
+                      // console.log(fileName);
+                      // console.log(uri);
+                      // const filepath = this.downloadAttachment(url, fileName);
+                      const file = {
+                        name: fileName,
+                        uri: url,
+                      };
+                      attachment.push(file);
+                    }
+                  });
+                  // const news = this.service.news.model.create({
+                  //   site: column.site,
+                  //   title,
+                  //   pic: '',
+                  //   content,
+                  //   type: '',
+                  //   parent_id: column.id,
+                  //   parent: column.title,
+                  //   publish_time: publishTime,
+                  //   attachment,
+                  //   is_use: '0'
+                  // });
+                });
+            }
+          }
+        });
+        // Go to the next page
+        const href = $('#DivPageControl a').eq(2).attr('href');
+        const hrefNew = hrefAdd + href; // NOTE(review): if href is undefined this yields a URL ending in "undefined"
+        // Compare the current URL with the next one; when they are equal there is no need to recurse
+        if (hrefNew !== hrefOld) {
+          // this.creeperCreate(hrefNew, column);
+        // over
+        }
+      });
+  }
+  async nextPage(href) { // Requests a follow-up content page at href, decoded as gbk
+    superagent.get(href).charset('gbk').buffer(true)
+      .end((error, res) => {
+        if (error) { // Request failed: log the error and return
+          console.log(error);
+          return;
+        }
+        const $ = cheerio.load(res.text);
+        // NOTE(review): the loaded page is not used yet
+      });
+  }
+}
+module.exports = CreeperjlsyService;

+ 2 - 2
app/service/creeperjlsy.js

@@ -36,7 +36,7 @@ class CreeperjlsyService extends CrudService {
     // this.mkdir('./attachment');
 
     // 发起请求
-    superagent.get(targetUrl).charset('utf-8').buffer(true)
+    superagent.get(targetUrl).charset('gbk').buffer(true)
       .end((error, res) => {
         if (error) { // 请求出错,打印错误,返回
           console.log(error);
@@ -58,7 +58,7 @@ class CreeperjlsyService extends CrudService {
             if (thisHref !== undefined) {
               const thisAllHref = hrefAdd + thisHref;
               // 请求内容
-              superagent.get(thisAllHref).charset('utf-8').buffer(true)
+              superagent.get(thisAllHref).charset('gbk').buffer(true)
                 .end((error, res) => {
                   if (error) { // 请求出错,打印错误,返回
                     console.log(error);

+ 7 - 4
app/service/menu.js

@@ -12,20 +12,21 @@ class MenuService extends CrudService {
     this.model = this.ctx.model.Menu;
   }
 
-  async create({ site }, { title, type, is_use}) {
+  async create({ site }, { title, type, is_use, url, content_id}) {
     // 检查数据
     assert(_.isString(site), 'site不能为空');
     assert(!title || _.isString(title), 'title必须为字符串');
     assert(!type || _.isString(type), 'type必须为字符串');
     assert(!is_use || _.isString(is_use), 'is_use必须为字符串');
-
+    assert(!url || _.isString(url), 'url必须为字符串');
+    assert(!content_id || _.isString(content_id), 'content_id必须为字符串');
     // TODO: 检查用户信息
     const userid = this.ctx.userid;
     if (!_.isString(userid)) throw new BusinessError(ErrorCode.NOT_LOGIN);
 
     // TODO:保存数据
     const data = {
-      site, title, type, is_use,
+      site, title, type, is_use, url, content_id,
       meta: { createdBy: userid },
     };
 
@@ -35,11 +36,13 @@ class MenuService extends CrudService {
 
   async update({ id }, payload) {
     // 检查数据
-    const { title, type, is_use } = payload;
+    const { title, type, is_use, url, content_id} = payload;
     assert(id, 'id不能为空');
     assert(!title || _.isString(title), 'title必须为字符串');
     assert(!type || _.isString(type), 'type必须为字符串');
     assert(!is_use || _.isString(is_use), 'is_use必须为字符串');
+    assert(!url || _.isString(url), 'url必须为字符串');
+    assert(!content_id || _.isString(content_id), 'content_id必须为字符串');
     // TODO: 检查用户信息
     const userid = this.ctx.userid;
     if (!_.isString(userid)) throw new BusinessError(ErrorCode.NOT_LOGIN);