周浩 9 years ago
parent
commit
caf4c089da

+ 49 - 0
hsweb-web-crawler/pom.xml

@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>hsweb-framework</artifactId>
+        <groupId>org.hsweb</groupId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>hsweb-web-crawler</artifactId>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.hsweb</groupId>
+            <artifactId>hsweb-web-core</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>commons-lang</groupId>
+            <artifactId>commons-lang</artifactId>
+            <version>2.6</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.solr</groupId>
+            <artifactId>solr-solrj</artifactId>
+            <optional>true</optional>
+        </dependency>
+
+        <dependency>
+            <groupId>us.codecraft</groupId>
+            <artifactId>webmagic-core</artifactId>
+            <version>0.5.3</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-log4j12</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>us.codecraft</groupId>
+            <artifactId>webmagic-extension</artifactId>
+            <version>0.5.3</version>
+        </dependency>
+    </dependencies>
+</project>

+ 70 - 0
hsweb-web-crawler/src/main/java/org/hsweb/web/crawler/CrawlerResult.java

@@ -0,0 +1,70 @@
+package org.hsweb.web.crawler;
+
+
+import java.util.Date;
+
+/**
+ * Created by zhouhao on 16-5-4.
+ */
+public class CrawlerResult {
+    private String domain;
+
+    private String url;
+
+    private String content;
+
+    private String html;
+
+    private Date crawlerTime;
+
+    public String getDomain() {
+        return domain;
+    }
+
+    public void setDomain(String domain) {
+        this.domain = domain;
+    }
+
+    public String getUrl() {
+        return url;
+    }
+
+    public void setUrl(String url) {
+        this.url = url;
+    }
+
+    public String getContent() {
+        return content;
+    }
+
+    public void setContent(String content) {
+        this.content = content;
+    }
+
+    public String getHtml() {
+        return html;
+    }
+
+    public void setHtml(String html) {
+        this.html = html;
+    }
+
+    public Date getCrawlerTime() {
+        return crawlerTime;
+    }
+
+    public void setCrawlerTime(Date crawlerTime) {
+        this.crawlerTime = crawlerTime;
+    }
+
+    @Override
+    public String toString() {
+        return "CrawlerResult{" +
+                "domain='" + domain + '\'' +
+                ", url='" + url + '\'' +
+                ", content='" + content + '\'' +
+                ", html='" + html + '\'' +
+                ", crawlerTime=" + crawlerTime +
+                '}';
+    }
+}

+ 74 - 0
hsweb-web-crawler/src/main/java/org/hsweb/web/crawler/extracter/DefaultHtmlContentExtractor.java

@@ -0,0 +1,74 @@
+package org.hsweb.web.crawler.extracter;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class DefaultHtmlContentExtractor implements HtmlContentExtractor {
+    private static String clearLabel(String html) {
+        html = html.replaceAll("(?is)<!DOCTYPE.*?>", "");
+        html = html.replaceAll("(?is)<!--.*?-->", "");                // remove html comment
+        html = html.replaceAll("(?is)<script.*?>.*?</script>", ""); // remove javascript
+        html = html.replaceAll("(?is)<style.*?>.*?</style>", "");   // remove css
+        html = html.replaceAll("&.{2,5};|&#.{2,5};", " ");            // remove special char
+        html = html.replaceAll("(?is)<.*?>", "");
+        return html;
+    }
+    private int defaultThreshold = 35;
+
+    public DefaultHtmlContentExtractor() {
+    }
+
+    public DefaultHtmlContentExtractor(int threshold) {
+        this.defaultThreshold = threshold;
+    }
+
+    @Override
+    public String parse(String html) {
+        html = clearLabel(html);
+        final int blocksWidth = 3;
+        int start, end, threshold = defaultThreshold;
+        StringBuilder text = new StringBuilder();
+        List<Integer> indexDistribution = new ArrayList<>();
+        List<String> lines = Arrays.asList(html.split("\n"));
+        indexDistribution.clear();
+        for (int i = 0; i < lines.size() - blocksWidth; i++) {
+            int wordsNum = 0;
+            for (int j = i; j < i + blocksWidth; j++) {
+                lines.set(j, lines.get(j).replaceAll("\\s+", ""));
+                wordsNum += lines.get(j).length();
+            }
+            indexDistribution.add(wordsNum);
+        }
+        start = -1;
+        end = -1;
+        boolean boolStart = false, boolEnd = false;
+        for (int i = 0, len = indexDistribution.size(); i < len - 1; i++) {
+            // a block starts the content region when its character count exceeds the
+            // threshold and at least one of the following three blocks is non-empty;
+            // the i + 3 < len guard prevents an IndexOutOfBoundsException near the tail
+            if (indexDistribution.get(i) > threshold && !boolStart) {
+                if (i + 3 < len
+                        && (indexDistribution.get(i + 1) != 0
+                        || indexDistribution.get(i + 2) != 0
+                        || indexDistribution.get(i + 3) != 0)) {
+                    boolStart = true;
+                    start = i;
+                    continue;
+                }
+            }
+            if (boolStart) {
+                if (indexDistribution.get(i).intValue() == 0
+                        || indexDistribution.get(i + 1).intValue() == 0) {
+                    end = i;
+                    boolEnd = true;
+                }
+            }
+            if (boolEnd) {
+                StringBuilder tmp = new StringBuilder();
+                for (int ii = start; ii <= end; ii++) {
+                    if (lines.get(ii).length() < 5) continue; // skip very short lines
+                    tmp.append(lines.get(ii)).append('\n');
+                }
+                String str = tmp.toString();
+                // drop blocks that look like copyright/footer boilerplate
+                if (str.toLowerCase().contains("copyright") || str.contains("版权所有")) continue;
+                text.append(str);
+                boolStart = boolEnd = false;
+            }
+        }
+        return text.toString();
+    }
+}
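
The extractor above is a line-block density heuristic: markup is stripped, the page is split into lines, and every window of three consecutive lines is scored by its character count; runs of blocks whose score stays above the threshold are kept as body text, while short lines and copyright-looking blocks are dropped. A minimal usage sketch (the driver class, sample HTML, and the low threshold for this tiny input are illustrative assumptions, not part of the commit):

    import org.hsweb.web.crawler.extracter.DefaultHtmlContentExtractor;
    import org.hsweb.web.crawler.extracter.HtmlContentExtractor;

    public class ExtractorDemo {
        public static void main(String[] args) {
            // real pages suit the default threshold of 35; this toy input needs a lower one
            HtmlContentExtractor extractor = new DefaultHtmlContentExtractor(5);
            String html = "<div id=\"nav\">home | about</div>\n"
                    + "<p>First line of the article body text here.</p>\n"
                    + "<p>Second line of the article body text here.</p>\n"
                    + "<p>Third line of the article body text here.</p>\n"
                    + "\n\n\n"
                    + "<div id=\"footer\">copyright 2016</div>\n";
            // prints the dense block; note the scoring pass collapses whitespace inside
            // lines, so the heuristic fits Chinese pages (no word spacing) best
            System.out.println(extractor.parse(html));
        }
    }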

+ 9 - 0
hsweb-web-crawler/src/main/java/org/hsweb/web/crawler/extracter/HtmlContentExtractor.java

@@ -0,0 +1,9 @@
+package org.hsweb.web.crawler.extracter;
+
+/**
+ * HTML main-content extractor.
+ * Created by 浩 on 2015-09-07.
+ */
+public interface HtmlContentExtractor {
+    String parse(String html);
+}

+ 32 - 0
hsweb-web-crawler/src/main/java/org/hsweb/web/crawler/extracter/JsoupHtmlContentExtractor.java

@@ -0,0 +1,32 @@
+package org.hsweb.web.crawler.extracter;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+/**
+ * Created by 浩 on 2015-09-07.
+ */
+public class JsoupHtmlContentExtractor implements HtmlContentExtractor {
+
+    private String select;
+
+    public JsoupHtmlContentExtractor(String select) {
+        this.select = select;
+    }
+
+    @Override
+    public String parse(String html) {
+        Document document = Jsoup.parse(html);
+        if (select == null)
+            return document.text();
+        return document.select(select).text();
+    }
+
+    public String getSelect() {
+        return select;
+    }
+
+    public void setSelect(String select) {
+        this.select = select;
+    }
+}
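
When the structure of the target site is known in advance, the Jsoup-based extractor is the more precise choice: it parses the HTML and returns the text of whatever a CSS selector matches, or the whole document's text when the selector is null. A quick sketch, with a made-up selector and markup:

    import org.hsweb.web.crawler.extracter.JsoupHtmlContentExtractor;

    public class JsoupExtractorDemo {
        public static void main(String[] args) {
            // extract only the text inside elements matching the CSS selector
            JsoupHtmlContentExtractor extractor = new JsoupHtmlContentExtractor("div.article-body");
            String html = "<html><body>"
                    + "<div class=\"nav\">home | about</div>"
                    + "<div class=\"article-body\">Hello from the article body.</div>"
                    + "</body></html>";
            System.out.println(extractor.parse(html)); // prints: Hello from the article body.
        }
    }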

+ 49 - 0
hsweb-web-crawler/src/main/java/org/hsweb/web/crawler/pipeline/AbstractPipeline.java

@@ -0,0 +1,49 @@
+package org.hsweb.web.crawler.pipeline;
+
+import org.hsweb.web.crawler.CrawlerResult;
+import org.hsweb.web.crawler.extracter.DefaultHtmlContentExtractor;
+import org.hsweb.web.crawler.extracter.HtmlContentExtractor;
+import org.hsweb.web.crawler.extracter.JsoupHtmlContentExtractor;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.pipeline.Pipeline;
+import us.codecraft.webmagic.utils.UrlUtils;
+
+import java.util.Date;
+
+/**
+ * Created by zhouhao on 16-5-4.
+ */
+public abstract class AbstractPipeline implements Pipeline {
+
+    protected HtmlContentExtractor extractor;
+
+    public AbstractPipeline() {
+        this(new DefaultHtmlContentExtractor(35));
+    }
+
+    public AbstractPipeline(String selector) {
+        this(new JsoupHtmlContentExtractor(selector));
+    }
+
+    public AbstractPipeline(HtmlContentExtractor extractor) {
+        this.extractor = extractor;
+    }
+
+    @Override
+    public void process(ResultItems resultItems, Task task) {
+        if (resultItems.isSkip()) return;
+        Page page = resultItems.get("page");
+        CrawlerResult result = new CrawlerResult();
+        result.setCrawlerTime(new Date());
+        result.setUrl(page.getUrl().get());
+        result.setDomain(UrlUtils.getDomain(result.getUrl()));
+        result.setHtml(page.getHtml().get());
+        result.setContent(extractor.parse(result.getHtml()));
+        process(result);
+    }
+
+    public abstract void process(CrawlerResult result);
+}
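
AbstractPipeline adapts WebMagic's Pipeline contract for this module: it retrieves the Page that SimplePageProcessor stored under the "page" key, assembles a CrawlerResult (URL, domain, raw HTML, extracted content, crawl time), and delegates storage to the abstract process(CrawlerResult) method. A subclass therefore only implements persistence; a hypothetical console-printing pipeline, as a sketch:

    import org.hsweb.web.crawler.CrawlerResult;
    import org.hsweb.web.crawler.pipeline.AbstractPipeline;

    public class ConsolePipeline extends AbstractPipeline {

        public ConsolePipeline() {
            // the no-arg super() would use DefaultHtmlContentExtractor; this variant
            // assumes an article container selector and uses JsoupHtmlContentExtractor
            super("div.article-body");
        }

        @Override
        public void process(CrawlerResult result) {
            // persistence is the only concern left to the subclass
            System.out.println(result);
        }
    }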

+ 41 - 0
hsweb-web-crawler/src/main/java/org/hsweb/web/crawler/pipeline/SolrPipeline.java

@@ -0,0 +1,41 @@
+package org.hsweb.web.crawler.pipeline;
+
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
+import org.apache.solr.common.SolrInputDocument;
+import org.hsweb.web.crawler.CrawlerResult;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Created by zhouhao on 16-5-4.
+ */
+public class SolrPipeline extends AbstractPipeline {
+    private SolrServer solrServer;
+    private Logger logger = LoggerFactory.getLogger(this.getClass());
+
+    public SolrPipeline(String core) {
+        // convenience constructor: targets a core on a local Solr instance
+        this.solrServer = new HttpSolrServer("http://127.0.0.1:8983/solr/" + core);
+    }
+
+    public SolrPipeline(SolrServer solrServer) {
+        this.solrServer = solrServer;
+    }
+
+    @Override
+    public void process(CrawlerResult result) {
+        try {
+            logger.debug("save CrawlerResult " + result.getUrl());
+            SolrInputDocument document = new SolrInputDocument();
+            document.addField("url", result.getUrl());
+//            document.addField("html", result.getHtml());
+            document.addField("content_text_cn", result.getContent());
+            document.addField("domain", result.getDomain());
+            solrServer.add(document, 1000); // commitWithin: ask Solr to commit within 1s
+        } catch (Exception e) {
+            logger.error("save CrawlerResult error!", e);
+        }
+    }
+}
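
Note that SolrPipeline assumes the target core's schema already defines the url, domain, and content_text_cn fields (the latter presumably mapped to a Chinese-analyzed text type), and it relies on commitWithin rather than explicit commits. Construction for a local or a remote server, as a sketch (host and core names are illustrative):

    import org.apache.solr.client.solrj.impl.HttpSolrServer;
    import org.hsweb.web.crawler.pipeline.SolrPipeline;

    public class SolrPipelineDemo {
        public static void main(String[] args) {
            // the single-argument form targets http://127.0.0.1:8983/solr/<core>
            SolrPipeline local = new SolrPipeline("hsweb-crawler");
            // any SolrServer can be supplied for a non-local deployment
            SolrPipeline remote = new SolrPipeline(
                    new HttpSolrServer("http://solr.example.com:8983/solr/hsweb-crawler"));
        }
    }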

+ 50 - 0
hsweb-web-crawler/src/main/java/org/hsweb/web/crawler/processor/SimplePageProcessor.java

@@ -0,0 +1,50 @@
+package org.hsweb.web.crawler.processor;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.utils.UrlUtils;
+
+import java.util.List;
+
+/**
+ * Created by zhouhao on 16-5-4.
+ */
+public class SimplePageProcessor implements PageProcessor {
+    private Site site;
+    /**
+     * URL pattern for pages to crawl; matching links are followed
+     */
+    private String crawlerUrlPattern;
+
+    /**
+     * URL pattern for pages to save; only matching pages reach the pipeline
+     */
+    private String saveUrlPattern;
+
+    public SimplePageProcessor(String crawlerUrlPattern, String saveUrlPattern) {
+        this.site = Site.me().setSleepTime(1000).setRetryTimes(5).setUseGzip(true);
+        // translate glob-style patterns ("." escaped, "*" as wildcard) into regexes
+        this.crawlerUrlPattern = "(" + crawlerUrlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
+        this.saveUrlPattern = "(" + saveUrlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
+    }
+
+    @Override
+    public void process(Page page) {
+        List<String> requests = page.getHtml().links().regex(crawlerUrlPattern).all();
+        page.addTargetRequests(requests);
+        if (!page.getUrl().regex(saveUrlPattern).match())
+            page.setSkip(true);
+        page.putField("page", page);
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    public void setSite(Site site) {
+        this.site = site;
+    }
+}
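
The constructor's glob-to-regex translation escapes every "." and expands "*" to [^"'#]* , so a wildcard matches any run of characters that stops at quotes and fragment markers, which is what link hrefs look like in extracted HTML. The effect can be checked directly (the example.com URLs are illustrative):

    public class PatternDemo {
        public static void main(String[] args) {
            // the same translation SimplePageProcessor applies to its constructor arguments
            String glob = "http://www.example.com/*.html";
            String regex = "(" + glob.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
            System.out.println("http://www.example.com/news/1.html".matches(regex)); // true
            System.out.println("http://www.example.com/news/1.jpg".matches(regex));  // false
        }
    }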

+ 13 - 0
hsweb-web-crawler/src/main/resources/logback.xml

@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+    <!-- 控制台输出日志 -->
+    <appender name="Console" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%d{HH:mm:ss.SSS} > %-5level %logger{35} - %m%n</pattern>
+        </encoder>
+    </appender>
+    <logger name="org.apache.http" level="ERROR" />
+    <root level="DEBUG">
+        <appender-ref ref="Console"/>
+    </root>
+</configuration>

+ 51 - 0
hsweb-web-crawler/src/test/java/org/hsweb/web/crawler/CrawlerTest.java

@@ -0,0 +1,51 @@
+package org.hsweb.web.crawler;
+
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+import org.apache.solr.common.util.SimpleOrderedMap;
+import org.hsweb.web.crawler.pipeline.SolrPipeline;
+import org.hsweb.web.crawler.processor.SimplePageProcessor;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.scheduler.QueueScheduler;
+
+/**
+ * Created by zhouhao on 16-5-4.
+ */
+public class CrawlerTest {
+
+    public static void main(String[] args) throws SolrServerException {
+//        Spider.create(new SimplePageProcessor("http://www.yiliu88.com*", "http://www.yiliu88.com/*.html"))
+//                .addUrl("http://www.yiliu88.com")
+//                .setScheduler(new QueueScheduler())
+//                .addPipeline(new SolrPipeline("test"))
+//                .thread(5)
+//                .run();
+//        HttpSolrServer server = new HttpSolrServer("http://127.0.0.1:8983/solr/test");
+//        SolrQuery query = new SolrQuery();
+//        query.setQuery("content_txt_en:hello zhangsan");
+//        // mlt: toggles the MoreLikeThisComponent for this query
+//        query.setParam("mlt", "true");
+//        // fl: fields to return
+//        query.setParam("fl", "content_txt_en,id");
+//        // mlt.fl: fields used to judge similarity
+//        query.setParam("mlt.fl", "content_txt_en");
+//        // mlt.mintf: minimum term frequency; terms occurring less often in a document are ignored for similarity
+//        query.setParam("mlt.mintf", "1");
+//        // mlt.mindf: minimum document frequency; terms found in fewer documents are ignored for similarity
+//        query.setParam("mlt.mindf", "1");
+//        query.setParam("hl", "true");
+//        query.setParam("hl.fl","content_txt_en");
+//        query.setParam("hl.simple.pre","<span style='hl'>");
+//        query.setParam("hl.simple.post","</span>");
+//
+//        QueryResponse response = server.query(query);
+//        response.getResponse().forEach(stringObjectEntry -> {
+//            System.out.println(stringObjectEntry);
+//        });
+
+    }
+}
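
As committed, the whole body of main is commented out, so the test is a no-op; the first block documents the intended wiring. Uncommented and with placeholder URLs, the crawl-and-index flow would look roughly like this sketch (the seed URL, core name, and thread count are illustrative):

    import org.hsweb.web.crawler.pipeline.SolrPipeline;
    import org.hsweb.web.crawler.processor.SimplePageProcessor;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.scheduler.QueueScheduler;

    public class CrawlerDemo {
        public static void main(String[] args) {
            Spider.create(new SimplePageProcessor(
                            "http://www.example.com*",         // links to follow
                            "http://www.example.com/*.html"))  // pages to index
                    .addUrl("http://www.example.com")          // seed URL
                    .setScheduler(new QueueScheduler())        // in-memory FIFO of pending URLs
                    .addPipeline(new SolrPipeline("test"))     // index into the local "test" core
                    .thread(5)                                 // five worker threads
                    .run();                                    // blocks until the queue is drained
        }
    }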

+ 13 - 0
hsweb-web-crawler/src/test/resources/logback.xml

@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+    <!-- 控制台输出日志 -->
+    <appender name="Console" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%d{HH:mm:ss.SSS} > %-5level %logger{35} - %m%n</pattern>
+        </encoder>
+    </appender>
+    <logger name="org.apache.http" level="ERROR" />
+    <root level="DEBUG">
+        <appender-ref ref="Console"/>
+    </root>
+</configuration>