/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23.plugin.crawler;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

import java.io.File;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Pattern;

/**
 * A basic <em>web site crawler</em> built on top of the <i>crawler4j</i> library.
 * It starts from a seed {@link URL} and makes the configured page filters and the
 * registered {@link CrawlerListener}s available to the crawler instances
 * (by default {@link DefaultWebCrawler}) through {@link SharedData}.
 *
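 * <p>A minimal usage sketch (storage folder, seed URL and <code>myListener</code> are
 * illustrative placeholders, not part of this class):</p>
 * <pre>{@code
 *   SiteCrawler crawler = new SiteCrawler(new File("/tmp/crawl-storage"));
 *   crawler.setMaxDepth(2);                                 // follow links up to depth 2
 *   crawler.setMaxPages(100);                               // fetch at most 100 pages
 *   crawler.addListener(myListener);                        // some CrawlerListener implementation
 *   crawler.start(new URL("http://example.org/"), false);   // non-blocking: crawl runs on an internal executor
 *   // ... later ...
 *   crawler.stop();
 * }</pre>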
 */
public class SiteCrawler {

    /**
     * Default regular expression matching, by file extension, URLs of resources
     * that are typically not HTML pages (stylesheets, scripts, images, media files,
     * archives, plain text).
     */
    public static final String DEFAULT_PAGE_FILTER_RE =
            ".*(\\.(" +
                    "css|js" +
                    "|bmp|gif|jpe?g|png|tiff?" +
                    "|mid|mp2|mp3|mp4|wav|wma" +
                    "|avi|mov|mpeg|ram|m4v|wmv|rm|smil" +
                    "|pdf" +
                    "|swf" +
                    "|zip|rar|gz" +
                    "|xml|txt" +
            "))$";
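
    // For illustration only: URLs such as "http://example.org/logo.png" or
    // "http://example.org/style.css" match DEFAULT_PAGE_FILTER_RE, while
    // "http://example.org/index.html" does not.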

    /**
     * Default number of concurrent crawler instances.
     */
    public static final int DEFAULT_NUM_OF_CRAWLERS = 10;

    /**
     * Default {@link WebCrawler} implementation used to visit pages.
     */
    public static final Class<? extends WebCrawler> DEFAULT_WEB_CRAWLER = DefaultWebCrawler.class;

    /**
     * Default page filters, compiled from {@link #DEFAULT_PAGE_FILTER_RE}.
     */
    public final Pattern defaultFilters = Pattern.compile(DEFAULT_PAGE_FILTER_RE);

    /**
     * Internal crawler4j {@link CrawlController}.
     */
    private final CrawlController controller;

    /**
     * Registered {@link CrawlerListener}s.
     */
    private final List<CrawlerListener> listeners = new ArrayList<CrawlerListener>();

    /**
     * Number of concurrent crawler instances.
     */
    private int numOfCrawlers = DEFAULT_NUM_OF_CRAWLERS;

    /**
     * {@link WebCrawler} implementation class used for the crawl.
     */
    private Class<? extends WebCrawler> webCrawler = DEFAULT_WEB_CRAWLER;

    /**
     * Configuration passed to the underlying {@link CrawlController}.
     */
    private final CrawlConfig crawlConfig;

    /**
     * Executor running the crawl when started in non-blocking mode.
     */
    private ExecutorService service;

    /**
     * Creates a new crawler storing its intermediate crawl data under the given folder.
     *
     * @param storageFolder folder used by crawler4j as crawl storage.
     */
    public SiteCrawler(File storageFolder) {
        try {
            crawlConfig = new CrawlConfig();
            crawlConfig.setCrawlStorageFolder(storageFolder.getAbsolutePath());
            crawlConfig.setUserAgentString("Apache Any23 Web Crawler");

            final PageFetcher pageFetcher = new PageFetcher(crawlConfig);

            RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
            final RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

            controller = new CrawlController(crawlConfig, pageFetcher, robotstxtServer);
        } catch (Exception e) {
            throw new IllegalArgumentException("Error while initializing crawler controller.", e);
        }
    }

    /**
     * @return the number of concurrent crawler instances.
     */
    public int getNumOfCrawlers() {
        return numOfCrawlers;
    }

    /**
     * Sets the number of concurrent crawler instances.
     *
     * @param n number of crawlers, must be &gt; 0.
     */
    public void setNumOfCrawlers(int n) {
        if (n <= 0) throw new IllegalArgumentException("Invalid number of crawlers, must be > 0.");
        this.numOfCrawlers = n;
    }

    /**
     * @return the {@link WebCrawler} implementation class used for the crawl.
     */
    public Class<? extends WebCrawler> getWebCrawler() {
        return webCrawler;
    }

    /**
     * Sets the {@link WebCrawler} implementation class used for the crawl.
     *
     * @param c crawler class, cannot be <code>null</code>.
     */
    public void setWebCrawler(Class<? extends WebCrawler> c) {
        if (c == null) throw new NullPointerException("c cannot be null.");
        this.webCrawler = c;
    }

    /**
     * @return the maximum crawl depth, <code>-1</code> means no limit.
     */
    public int getMaxDepth() {
        return crawlConfig.getMaxDepthOfCrawling();
    }

    /**
     * Sets the maximum crawl depth.
     *
     * @param maxDepth maximum depth, <code>-1</code> for no limit, otherwise must be &gt; 0.
     */
    public void setMaxDepth(int maxDepth) {
        if (maxDepth < -1 || maxDepth == 0) throw new IllegalArgumentException("Invalid maxDepth, must be -1 or > 0");
        crawlConfig.setMaxDepthOfCrawling(maxDepth);
    }

    /**
     * @return the maximum number of pages to fetch, <code>-1</code> means no limit.
     */
    public int getMaxPages() {
        return crawlConfig.getMaxPagesToFetch();
    }

    /**
     * Sets the maximum number of pages to fetch.
     *
     * @param maxPages maximum number of pages, <code>-1</code> for no limit, otherwise must be &gt; 0.
     */
    public void setMaxPages(int maxPages) {
        if (maxPages < -1 || maxPages == 0) throw new IllegalArgumentException("Invalid maxPages, must be -1 or > 0");
        crawlConfig.setMaxPagesToFetch(maxPages);
    }

    /**
     * @return the politeness delay in milliseconds between consecutive requests.
     */
    public int getPolitenessDelay() {
        return crawlConfig.getPolitenessDelay();
    }

    /**
     * Sets the politeness delay; negative values are ignored.
     *
     * @param millis delay in milliseconds, must be &gt;= 0.
     */
    public void setPolitenessDelay(int millis) {
        if (millis >= 0) crawlConfig.setPolitenessDelay(millis);
    }

    /**
     * Registers a {@link CrawlerListener} to this crawler.
     *
     * @param listener listener to be added.
     */
    public void addListener(CrawlerListener listener) {
        listeners.add(listener);
    }

    /**
     * Deregisters a {@link CrawlerListener} from this crawler.
     *
     * @param listener listener to be removed.
     */
    public void removeListener(CrawlerListener listener) {
        listeners.remove(listener);
    }

    /**
     * Starts the crawl from the given seed.
     *
     * @param seed the crawl seed URL.
     * @param filters page filter pattern made available to the crawler instances
     *                through {@link SharedData}.
     * @param wait if <code>true</code> the crawl runs on the calling thread and the method
     *             blocks until it completes, otherwise the crawl is executed asynchronously
     *             on an internal single-thread executor.
     * @throws Exception if an error occurs while starting the crawl.
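     *
     * <p>A hypothetical invocation with a custom filter (seed URL and pattern below are
     * illustrative only):</p>
     * <pre>{@code
     *   crawler.start(
     *       new URL("http://example.org/"),
     *       Pattern.compile(".*\\.(pdf|zip)$"),   // custom page filter pattern
     *       true                                  // block until the crawl completes
     *   );
     * }</pre>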
     */
    public synchronized void start(
            final URL seed, final Pattern filters, final boolean wait
    ) throws Exception {
        SharedData.setCrawlData(seed.toExternalForm(), filters, Collections.synchronizedList(listeners));
        controller.addSeed(seed.toExternalForm());
        final Runnable internalRunnable = new Runnable() {
            @Override
            public void run() {
                controller.start(getWebCrawler(), getNumOfCrawlers());
            }
        };
        if (wait) {
            internalRunnable.run();
        } else {
            if (service != null) throw new IllegalStateException("Another crawl seems to be running already.");
            service = Executors.newSingleThreadExecutor();
            service.execute(internalRunnable);
        }
    }

    /**
     * Starts the crawl from the given seed using the {@link #defaultFilters}.
     *
     * @param seed the crawl seed URL.
     * @param wait if <code>true</code> the method blocks until the crawl completes.
     * @throws Exception if an error occurs while starting the crawl.
     */
    public void start(final URL seed, final boolean wait) throws Exception {
        start(seed, defaultFilters, wait);
    }

    /**
     * Stops a crawl previously started in non-blocking mode by shutting down the
     * internal executor. Does nothing if no asynchronous crawl was started.
     */
    public synchronized void stop() {
        if (service != null) {
            service.shutdownNow();
        }
    }

}