// Origin (from the page header this file was extracted from): wuhongyu / test
							package cn.com.free.util;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.UUID;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.quartz.Job;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;

import com.jfinal.kit.JsonKit;
import com.jfinal.kit.Prop;
import com.jfinal.kit.PropKit;

import cn.com.free.model.BannerModel;
import cn.com.free.model.ContentModel;

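/**
 * Class description: crawls the list pages configured under {@code pc.url} in
 * {@code url.properties}, downloads the images they reference and saves each
 * linked article as a {@code ContentModel} record.
 * @author raifei
 * @date 2018-08-20 17:14:26
 * @version 1.0
 */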
public class DynamicJob3{

    /**
     * Value of the User-Agent header sent with every page request so the crawler
     * looks like a browser. This is the header VALUE only; the header name must
     * not be part of it.
     */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11";

    /** Timeout in milliseconds for fetching one HTML page. */
    private static final int PAGE_TIMEOUT_MS = 30000;

    /** Connect and read timeout in milliseconds for downloading one image. */
    private static final int IMAGE_TIMEOUT_MS = 10 * 1000;

    /**
     * Job entry point: reads the comma-separated start URLs stored under
     * {@code pc.url} in {@code url.properties} and crawls each of them.
     *
     * @throws JobExecutionException declared for callers; not thrown by this body
     */
    public static void execute() throws JobExecutionException {
        Prop prop = PropKit.use("url.properties");
        String url = prop.get("pc.url");
        webCrawler(url);
    }

    /**
     * Crawls every list page named in a comma-separated URL string.
     * A failure on one URL is reported and does not stop the remaining ones.
     *
     * @param sr comma-separated list-page URLs; {@code null} or blank does nothing
     */
    public static void webCrawler(String sr) {
        if (sr == null || sr.trim().isEmpty()) {
            System.err.println("webCrawler: no start URLs configured");
            return;
        }
        for (String entry : sr.split(",")) {
            String url = entry.trim();
            if (url.isEmpty()) {
                // Tolerate stray commas such as "a,,b" or a trailing comma.
                continue;
            }
            System.out.print(url);
            try {
                webCrawler1(url);
            } catch (Exception e) {
                // Boundary catch: keep going with the next start URL.
                e.printStackTrace();
            }
        }
    }

    /**
     * Crawls one list page: for every table row that contains a link, downloads
     * the row's images and then crawls the linked detail page.
     *
     * @param url address of the list page
     * @throws IOException if the list page itself cannot be fetched
     */
    public static void webCrawler1(String url) throws IOException {
        Document document = fetchDocument(url);

        // The list is expected to be rendered as table rows.
        Elements tables = document.select("table");
        for (Element tr : tables.select("tr")) {
            // "abs:" asks jsoup for the absolute form of the link target.
            String href = tr.select("a").attr("abs:href");
            if (href.isEmpty()) {
                // Header/spacer rows have no link; there is no detail page to crawl.
                continue;
            }

            List<String> imglist = new ArrayList<String>();

            // Thumbnails / images that belong to this row.
            Elements imgs = tr.select("img");
            for (Element element : imgs) {
                String imgSrc = element.attr("abs:src");
                if (imgSrc.isEmpty()) {
                    // No usable source URL, nothing to download.
                    continue;
                }
                String imgpath = newImageName(imgSrc);
                System.out.println(imgSrc);
                imglist.add(imgpath);
                // Store the image locally under its generated name.
                downImages(PropKit.get("fileservice"), imgSrc, imgpath);
            }
            webCrawler(href, imglist);
            System.out.print(imgs);
        }
    }

    /**
     * Crawls one detail page: downloads the images inside the elements matched by
     * the {@code content} selector, rewrites their {@code src} to the local copy
     * and saves the page title and resulting HTML as a {@code ContentModel}.
     * Fetch failures are reported and otherwise ignored.
     *
     * @param uri     address of the detail page; {@code null} or blank does nothing
     * @param imgList generated names of the list-row images; currently not used here
     */
    public static void webCrawler(String uri,List<String> imgList) {
        if (uri == null || uri.trim().isEmpty()) {
            return;
        }
        try {
            Document document = fetchDocument(uri);

            String title = document.select("title").text();

            // Article body; images inside it are mirrored locally.
            Elements elements = document.select("content");
            for (Element img : elements.select("img[src]")) {
                String affix_name = img.attr("abs:src");
                if (affix_name.isEmpty()) {
                    // src is empty or cannot be made absolute; leave the tag untouched.
                    continue;
                }
                String imgpath = newImageName(affix_name);
                img.attr("src", PropKit.get("fileservice") + "//" + imgpath);
                downImages(PropKit.get("fileservice"), affix_name, imgpath);
            }
            String details = elements.html();

            // NOTE(review): col_id/type/status are fixed values whose meaning is
            // defined by the ContentModel table - confirm against the schema.
            ContentModel bModel = new ContentModel();
            bModel.set("title", title);
            bModel.set("col_id", 1);
            bModel.set("memo", details);
            bModel.set("type", 0);
            bModel.set("status", 1);
            bModel.save();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads one image into a local directory.
     * Failures are reported on stderr; nothing is thrown for I/O problems.
     *
     * @param filePath target directory; created if it does not exist
     * @param imgUrl   absolute URL of the image
     * @param newname  local file name; when {@code null} or empty the last path
     *                 segment of {@code imgUrl} is used instead
     */
    public static void downImages(String filePath, String imgUrl,String newname) {
        if (imgUrl == null || imgUrl.isEmpty()) {
            System.err.println("downImages: empty image URL, nothing to download");
            return;
        }

        // Create the target directory on first use.
        File dir = new File(filePath);
        if (!dir.exists()) {
            dir.mkdirs();
        }

        // Split "<prefix>/<fileName><?query#fragment>" so that only the file name
        // is percent-encoded; encoding the query would break the request.
        int pathEnd = pathEnd(imgUrl);
        String suffix = imgUrl.substring(pathEnd);
        String path = imgUrl.substring(0, pathEnd);
        int slash = path.lastIndexOf('/');
        String fileName = path.substring(slash + 1);

        try {
            // The name may contain spaces or non-ASCII characters. URLEncoder turns
            // a space into '+', which is only valid in form data, so map it to %20.
            // NOTE(review): a name that is already percent-encoded gets encoded
            // twice here, as in the original code.
            String urlTail = URLEncoder.encode(fileName, "UTF-8");
            imgUrl = path.substring(0, slash + 1) + urlTail.replace("+", "%20") + suffix;
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }

        // Destination file.
        String localName = (newname == null || newname.isEmpty()) ? fileName : newname;
        File file = new File(filePath + File.separator + localName);

        try {
            URLConnection connection = new URL(imgUrl).openConnection();
            connection.setConnectTimeout(IMAGE_TIMEOUT_MS);
            // Without a read timeout a stalled server would block the job forever.
            connection.setReadTimeout(IMAGE_TIMEOUT_MS);

            // try-with-resources closes both streams even when the copy fails.
            try (InputStream in = connection.getInputStream();
                 BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
                byte[] buf = new byte[1024];
                int size;
                while (-1 != (size = in.read(buf))) {
                    out.write(buf, 0, size);
                }
            }
        } catch (IOException e) {
            // Also covers MalformedURLException, which is an IOException.
            e.printStackTrace();
        }
    }

    /** Fetches and parses one page with the crawler's shared request settings. */
    private static Document fetchDocument(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .cookie("auth", "token")
                .timeout(PAGE_TIMEOUT_MS)
                .get();
    }

    /** Builds a random local file name that keeps the image URL's extension, if any. */
    private static String newImageName(String imgUrl) {
        String base = UUID.randomUUID().toString().replaceAll("-", "");
        String ext = imageExtension(imgUrl);
        return ext.isEmpty() ? base : base + "." + ext;
    }

    /** Returns the lower-cased extension of the URL's last path segment, or "" if it has none. */
    private static String imageExtension(String imgUrl) {
        String path = imgUrl.substring(0, pathEnd(imgUrl));
        String fileName = path.substring(path.lastIndexOf('/') + 1);
        int dot = fileName.lastIndexOf('.');
        if (dot < 0 || dot == fileName.length() - 1) {
            return "";
        }
        // Locale.ROOT: "GIF" must become "gif" regardless of the default locale.
        return fileName.substring(dot + 1).toLowerCase(java.util.Locale.ROOT);
    }

    /** Returns the index where the query or fragment of a URL starts, or its length if it has neither. */
    private static int pathEnd(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        return end;
    }
}