123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213 |
- package cn.com.free.util;
- import java.io.BufferedOutputStream;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.UnsupportedEncodingException;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.net.URLConnection;
- import java.net.URLEncoder;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.List;
- import java.util.UUID;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- import org.quartz.Job;
- import org.quartz.JobExecutionContext;
- import org.quartz.JobExecutionException;
- import com.jfinal.kit.JsonKit;
- import com.jfinal.kit.Prop;
- import com.jfinal.kit.PropKit;
- import cn.com.free.model.BannerModel;
- import cn.com.free.model.ContentModel;
- /***类描述:
- *@author: raifei
- *@date: 日期:2018年8月20日 时间:下午5:14:26
- *@version 1.0
- */
- public class DynamicJob3{
-
- public static void execute() throws JobExecutionException {
- Prop prop = PropKit.use("url.properties");
- String url = prop.get("pc.url");
- webCrawler(url);
- }
- public static void webCrawler(String sr) {
- try {
- //获取所有链接
- String[] strs = sr.split(",");
- for(String url : strs){
- System.out.print(url);
- webCrawler1(url);
- }
-
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- public static void webCrawler1(String url) throws IOException {
- Document document = Jsoup.connect(url)
- //需要加上userAgent才能伪装成浏览器而不会被网站屏蔽IP
- //(这种做法可能也会被某些网站拉黑IP一段时间,由于不太稳定到底是不是代码的问题,还在测试中...)
- .userAgent("User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11")
- //加上cookie信息
- .cookie("auth", "token")
- //设置超时
- .timeout(30000)
- //用get()方式请求网址,也可以post()方式
- .get();
- //此处可以文档处理
- // document = Jsoup.parse(document.toString());
-
-
- //获取列表
- Elements elements = document.select("table");
-
- //获取列表循环元素
- for (Element tr : elements.select("tr")) {
- //循环元素二次筛选,筛选到 a标签
- Elements tds = tr.select("a");
-
- //获取a标签,跳转抓取详情
- String href = tds.attr("abs:href");
-
- //提前捕获标题嘛
-
- String title=tr.select("title").text();
- //
- //System.out.print(href);
-
- List<String> imglist=new ArrayList<String>();
-
- //查找缩略图,查找图片
- Elements imgs=tr.select("img");
- for (Element element : imgs) {
- //获取每个img标签URL "abs:"表示绝对路径
- String imgSrc = element.attr("abs:src");
- //String affix_name=imgSrc;
- String fileExt = imgSrc.substring(imgSrc.lastIndexOf(".") + 1).toLowerCase();
- String imgpath=UUID.randomUUID().toString().replaceAll("-", "")+"."+fileExt;
- // 打印URL
- System.out.println(imgSrc);
- imglist.add(imgpath);
- //下载图片到本地
- downImages(PropKit.get("fileservice"), imgSrc,imgpath);
- }
- webCrawler(href,imglist);
- System.out.print(imgs);
- }
- }
- public static void webCrawler(String uri,List<String> imgList) {
- try {
- //获取整个页面文件
- Document document = Jsoup.connect(uri)
- //需要加上userAgent才能伪装成浏览器而不会被网站屏蔽IP
- //(这种做法可能也会被某些网站拉黑IP一段时间,由于不太稳定到底是不是代码的问题,还在测试中...)
- .userAgent("User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11")
- //加上cookie信息
- .cookie("auth", "token")
- //设置超时
- .timeout(30000)
- //用get()方式请求网址,也可以post()方式
- .get();
- //此处可以文档处理
- // document = Jsoup.parse(document.toString());
-
- //获取文章标题
- String source=uri;//文章来源
- String title= document.select("title").text();//文章标题
- String author="";//获取来源
- if(!"".equals("author")){
- author=document.select("author").text();
- }
- String details="";
- //获取详情
- Elements elements = document.select("content");
- //抓取图片并替换链接
- Elements imgs = elements.select("img[src]");
-
- for (Element img : imgs){
- String affix_name=img.attr("abs:src");
- String fileExt = affix_name.substring(affix_name.lastIndexOf(".") + 1).toLowerCase();
- String imgpath=UUID.randomUUID().toString().replaceAll("-", "")+"."+fileExt;
- img.attr("src",PropKit.get("fileservice")+"//"+imgpath);
- //下载图像
- downImages(PropKit.get("fileservice"),affix_name,imgpath);
-
- }
- details=elements.html();
- ContentModel bModel = new ContentModel();
- bModel.set("title", title);
- bModel.set("col_id", 1);
- bModel.set("memo", details);
- bModel.set("type", 0);
- bModel.set("status", 1);
- bModel.save();
-
- //System.out.print(details);
- //获取列表循环元素
-
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- public static void downImages(String filePath, String imgUrl,String newname) {
- // 若指定文件夹没有,则先创建
- File dir = new File(filePath);
- if (!dir.exists()) {
- dir.mkdirs();
- }
- // 截取图片文件名
- String fileName =imgUrl.substring(imgUrl.lastIndexOf('/') + 1, imgUrl.length());
-
- try {
- // 文件名里面可能有中文或者空格,所以这里要进行处理。但空格又会被URLEncoder转义为加号
- String urlTail = URLEncoder.encode(fileName, "UTF-8");
- // 因此要将加号转化为UTF-8格式的%20
- imgUrl = imgUrl.substring(0, imgUrl.lastIndexOf('/') + 1) + urlTail.replaceAll("\\+", "\\%20");
- } catch (UnsupportedEncodingException e) {
- e.printStackTrace();
- }
- // 写出的路径
- File file = new File(filePath + File.separator + ("".equals(newname)?fileName:newname));
- try {
- // 获取图片URL
- URL url = new URL(imgUrl);
- // 获得连接
- URLConnection connection = url.openConnection();
- // 设置10秒的相应时间
- connection.setConnectTimeout(10 * 1000);
- // 获得输入流
- InputStream in = connection.getInputStream();
- // 获得输出流
- BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file));
- // 构建缓冲区
- byte[] buf = new byte[1024];
- int size;
- // 写入到文件
- while (-1 != (size = in.read(buf))) {
- out.write(buf, 0, size);
- }
- out.close();
- in.close();
- } catch (MalformedURLException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
|