/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23.plugin.crawler;

import edu.uci.ics.crawler4j.crawler.Page;

import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;

/**
 * This class hosts shared data structures accessible
 * to all the {@link DefaultWebCrawler} instances
 * run by the {@link SiteCrawler}.
 *
 * <p>The data is published through a single static instance which is
 * installed with {@link #setCrawlData(String, Pattern, List)} and read back
 * with {@link #getInstance()}.</p>
 *
 * @author Michele Mostarda (mostarda@fbk.eu)
 */
public class SharedData {

    /**
     * Singleton instance; <code>null</code> until
     * {@link #setCrawlData(String, Pattern, List)} has been invoked.
     */
    private static SharedData instance;

    /**
     * Crawl seed. Never <code>null</code> nor blank.
     */
    private final String seed;

    /**
     * Crawl page filter pattern, stored exactly as received
     * (no validation is applied, so it may be <code>null</code>).
     */
    private final Pattern pattern;

    /**
     * List of crawler listeners. Never <code>null</code>: a missing list
     * is replaced with an empty one at construction time.
     */
    private final List<CrawlerListener> listeners;

    /**
     * Returns the instance installed by the most recent call to
     * {@link #setCrawlData(String, Pattern, List)}.
     *
     * @return the singleton instance.
     * @throws IllegalStateException if no crawl data has been set yet.
     */
    protected static SharedData getInstance() {
        // Read the static field once so the checked value is the returned one.
        final SharedData current = instance;
        if (current == null) {
            throw new IllegalStateException("The configuration has not yet initialized.");
        }
        return current;
    }

    /**
     * Initializes the crawler data, replacing any previously installed instance.
     *
     * @param seed crawler seed, must be neither <code>null</code> nor blank.
     * @param regex page filter regex.
     * @param listeners the listeners to be notified of the crawler activity;
     *        <code>null</code> is treated as an empty list.
     * @throws IllegalArgumentException if <code>seed</code> is <code>null</code> or blank.
     */
    protected static void setCrawlData(String seed, Pattern regex, List<CrawlerListener> listeners) {
        instance = new SharedData(seed, regex, listeners);
    }

    /**
     * Internal constructor.
     *
     * @param seed crawl seed, must be neither <code>null</code> nor blank.
     * @param pattern page filter pattern, stored as given.
     * @param listeners listeners to notify; stored by reference (not copied),
     *        so modifications applied to the list after construction are
     *        visible to {@link #notifyPage(Page)}. A <code>null</code> value
     *        is replaced with an empty list.
     * @throws IllegalArgumentException if <code>seed</code> is <code>null</code> or blank.
     */
    private SharedData(String seed, Pattern pattern, List<CrawlerListener> listeners) {
        if (seed == null || seed.trim().length() == 0) {
            throw new IllegalArgumentException(
                    String.format("Invalid seed '%s'", seed)
            );
        }

        this.seed = seed;
        this.pattern = pattern;
        // Normalizing here avoids a late NullPointerException in notifyPage()
        // when the crawl data has been set without any listener list.
        this.listeners = listeners == null
                ? Collections.<CrawlerListener>emptyList()
                : listeners;
    }

    /**
     * @return crawl seed.
     */
    protected String getSeed() {
        return seed;
    }

    /**
     * @return page filter pattern.
     */
    protected Pattern getPattern() {
        return pattern;
    }

    /**
     * Notifies all listeners about the given page by invoking
     * <code>visitedPage</code> on each of them, in list iteration order.
     * Does nothing when there are no listeners.
     *
     * @param page the discovered page.
     */
    protected void notifyPage(Page page) {
        for (CrawlerListener listener : listeners) {
            listener.visitedPage(page);
        }
    }

}