1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.any23.plugin.crawler;
19
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Locale;
import java.util.regex.Pattern;
27
28 /**
29 * Default {@link WebCrawler} implementation.
30 *
31 * @author Michele Mostarda (mostarda@fbk.eu)
32 */
33 public class DefaultWebCrawler extends WebCrawler {
34
35 private static final Logger logger = LoggerFactory.getLogger(DefaultWebCrawler.class);
36
37 /**
38 * Shared data reference.
39 */
40 private final SharedData sharedData = SharedData.getInstance();
41
42 /**
43 * Page filter pattern.
44 */
45 private final Pattern pattern = sharedData.getPattern();
46
47 /**
48 * Override this method to specify whether the given URL should be visited or not.
49 */
50
51 @Override
52 public boolean shouldVisit(Page referringPage, WebURL url) {
53 if (!super.shouldVisit(referringPage, url))
54 return false;
55 if (url.getURL() == null)
56 return false;
57 final String href = url.getURL().toLowerCase();
58 if (!href.startsWith(sharedData.getSeed()))
59 return false;
60 return pattern == null || !pattern.matcher(href).matches();
61 }
62
63 /**
64 * Override this method to implement the single page processing logic.
65 */
66 @Override
67 public void visit(Page page) {
68 logger.trace("Visiting page: " + page.getWebURL().getURL());
69 sharedData.notifyPage(page);
70 }
71
72 }
73