DynamicJob3.java

package cn.com.free.util;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.UUID;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.quartz.Job;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;

import com.jfinal.kit.JsonKit;
import com.jfinal.kit.Prop;
import com.jfinal.kit.PropKit;

import cn.com.free.model.BannerModel;
import cn.com.free.model.ContentModel;
/**
 * Class description: Quartz job that crawls the configured list pages, follows each
 * article link, downloads the referenced images, and saves the article content.
 *
 * @author raifei
 * @date August 20, 2018, 5:14:26 PM
 * @version 1.0
 */
public class DynamicJob3 implements Job {

    @Override
    public void execute(JobExecutionContext context) throws JobExecutionException {
        // Read the comma-separated list of start URLs from url.properties
        Prop prop = PropKit.use("url.properties");
        String url = prop.get("pc.url");
        webCrawler(url);
    }
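    /**
     * Splits the comma-separated URL string and crawls each configured list page in turn.
     */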
    public static void webCrawler(String sr) {
        try {
            // Get all the links
            String[] strs = sr.split(",");
            for (String url : strs) {
                System.out.print(url);
                webCrawler1(url);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
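    /**
     * Crawls one list page: walks the table rows, downloads each row's thumbnail images,
     * then follows the row's link and crawls the article detail page.
     */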
    public static void webCrawler1(String url) throws IOException {
        Document document = Jsoup.connect(url)
                // The userAgent must be set so the request looks like a browser and the site does not block the IP
                // (some sites may still blacklist the IP for a while; whether the code is at fault is still being tested)
                .userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11")
                // Add cookie information
                .cookie("auth", "token")
                // Set the timeout
                .timeout(30000)
                // Request the URL with get(); post() could be used as well
                .get();
        // The document could be post-processed here
        // document = Jsoup.parse(document.toString());
        // Get the list table
        Elements elements = document.select("table");
        // Loop over the rows of the list
        for (Element tr : elements.select("tr")) {
            // Second-pass filter on each row: pick out the <a> tags
            Elements tds = tr.select("a");
            // Get the link target so the detail page can be crawled
            String href = tds.attr("abs:href");
            // Capture the title up front
            String title = tr.select("title").text();
            // System.out.print(href);
            List<String> imglist = new ArrayList<String>();
            // Find the thumbnails / images in the row
            Elements imgs = tr.select("img");
            for (Element element : imgs) {
                // Get each img tag's URL; "abs:" resolves it to an absolute path
                String imgSrc = element.attr("abs:src");
                // String affix_name = imgSrc;
                String fileExt = imgSrc.substring(imgSrc.lastIndexOf(".") + 1).toLowerCase();
                String imgpath = UUID.randomUUID().toString().replaceAll("-", "") + "." + fileExt;
                // Print the URL
                System.out.println(imgSrc);
                imglist.add(imgpath);
                // Download the image to the local file server directory
                downImages(PropKit.get("fileservice"), imgSrc, imgpath);
            }
            webCrawler(href, imglist);
            System.out.print(imgs);
        }
    }
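    /**
     * Crawls an article detail page, rewrites its image links to point at the local file
     * server, downloads those images, and saves the article as a ContentModel record.
     */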
    public static void webCrawler(String uri, List<String> imgList) {
        try {
            // Fetch the whole page
            Document document = Jsoup.connect(uri)
                    // The userAgent must be set so the request looks like a browser and the site does not block the IP
                    // (some sites may still blacklist the IP for a while; whether the code is at fault is still being tested)
                    .userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11")
                    // Add cookie information
                    .cookie("auth", "token")
                    // Set the timeout
                    .timeout(30000)
                    // Request the URL with get(); post() could be used as well
                    .get();
            // The document could be post-processed here
            // document = Jsoup.parse(document.toString());
            String source = uri; // article source URL
            String title = document.select("title").text(); // article title
            // Read the author element if the page provides one
            String author = "";
            Elements authorElements = document.select("author");
            if (!authorElements.isEmpty()) {
                author = authorElements.text();
            }
            String details = "";
            // Get the article body
            Elements elements = document.select("content");
            // Grab the body images and rewrite their links to the local file server
            Elements imgs = elements.select("img[src]");
            for (Element img : imgs) {
                String affix_name = img.attr("abs:src");
                String fileExt = affix_name.substring(affix_name.lastIndexOf(".") + 1).toLowerCase();
                String imgpath = UUID.randomUUID().toString().replaceAll("-", "") + "." + fileExt;
                img.attr("src", PropKit.get("fileservice") + "/" + imgpath);
                // Download the image
                downImages(PropKit.get("fileservice"), affix_name, imgpath);
            }
            details = elements.html();
            // Save the crawled article
            ContentModel bModel = new ContentModel();
            bModel.set("title", title);
            bModel.set("col_id", 1);
            bModel.set("memo", details);
            bModel.set("type", 0);
            bModel.set("status", 1);
            bModel.save();
            // System.out.print(details);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
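    /**
     * Downloads the image at imgUrl into the filePath directory, saving it as newname
     * (or under its original file name when newname is empty).
     */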
    public static void downImages(String filePath, String imgUrl, String newname) {
        // Create the target directory first if it does not exist yet
        File dir = new File(filePath);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        // Cut the image file name out of the URL
        String fileName = imgUrl.substring(imgUrl.lastIndexOf('/') + 1);
        try {
            // The file name may contain Chinese characters or spaces, so it has to be encoded,
            // but URLEncoder turns spaces into '+' ...
            String urlTail = URLEncoder.encode(fileName, "UTF-8");
            // ... so the '+' has to be converted back to a percent-encoded space (%20)
            imgUrl = imgUrl.substring(0, imgUrl.lastIndexOf('/') + 1) + urlTail.replaceAll("\\+", "%20");
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        // Path the image is written to
        File file = new File(filePath + File.separator + ("".equals(newname) ? fileName : newname));
        try {
            // Build the image URL
            URL url = new URL(imgUrl);
            // Open the connection
            URLConnection connection = url.openConnection();
            // Use a 10-second connect timeout
            connection.setConnectTimeout(10 * 1000);
            // Copy the response stream into the local file; try-with-resources closes both streams
            try (InputStream in = connection.getInputStream();
                 BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
                byte[] buf = new byte[1024];
                int size;
                while (-1 != (size = in.read(buf))) {
                    out.write(buf, 0, size);
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
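
For context, below is a minimal sketch of how this job could be wired into a Quartz scheduler. It assumes Quartz 2.x on the classpath and that url.properties provides pc.url (one or more comma-separated list-page URLs) and fileservice (the local directory images are written to); the DynamicJob3Starter class name, the job/trigger identities, and the one-hour repeat interval are illustrative only, not part of the original project.

DynamicJob3Starter.java (illustrative)

package cn.com.free.util;

import org.quartz.JobBuilder;
import org.quartz.JobDetail;
import org.quartz.Scheduler;
import org.quartz.SchedulerException;
import org.quartz.SimpleScheduleBuilder;
import org.quartz.Trigger;
import org.quartz.TriggerBuilder;
import org.quartz.impl.StdSchedulerFactory;

public class DynamicJob3Starter {

    public static void main(String[] args) throws SchedulerException {
        // Start the default Quartz scheduler
        Scheduler scheduler = StdSchedulerFactory.getDefaultScheduler();
        // Describe the crawl job (identity values are illustrative)
        JobDetail job = JobBuilder.newJob(DynamicJob3.class)
                .withIdentity("dynamicJob3", "crawler")
                .build();
        // Fire immediately, then repeat every hour (interval is illustrative)
        Trigger trigger = TriggerBuilder.newTrigger()
                .withIdentity("dynamicJob3Trigger", "crawler")
                .startNow()
                .withSchedule(SimpleScheduleBuilder.simpleSchedule()
                        .withIntervalInHours(1)
                        .repeatForever())
                .build();
        scheduler.scheduleJob(job, trigger);
        scheduler.start();
    }
}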