1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.cli;
19
20 import com.beust.jcommander.IStringConverter;
21 import com.beust.jcommander.Parameter;
22 import com.beust.jcommander.ParameterException;
23 import com.beust.jcommander.Parameters;
24 import com.beust.jcommander.converters.FileConverter;
25 import edu.uci.ics.crawler4j.crawler.Page;
26 import edu.uci.ics.crawler4j.parser.HtmlParseData;
27 import edu.uci.ics.crawler4j.parser.ParseData;
28 import org.apache.any23.plugin.crawler.CrawlerListener;
29 import org.apache.any23.plugin.crawler.SiteCrawler;
30 import org.apache.any23.source.StringDocumentSource;
31
32 import java.io.File;
33 import java.net.URL;
34 import java.util.UUID;
35 import java.util.regex.Pattern;
36 import java.util.regex.PatternSyntaxException;
37
38 import static java.lang.String.format;
39
40
41
42
43
44
45
46 @Parameters(commandNames = "crawler", commandDescription = "Any23 Crawler Command Line Tool.")
47 public class Crawler extends Rover {
48
49 private final Object roverLock = new Object();
50
51 @Parameter(
52 names = { "-pf", "--pagefilter" },
53 description = "Regex used to filter out page URLs during crawling.",
54 converter = PatterConverter.class
55 )
56 private Pattern pageFilter = Pattern.compile( SiteCrawler.DEFAULT_PAGE_FILTER_RE );
57
58 @Parameter(
59 names = { "-sf", "--storagefolder" },
60 description = "Folder used to store crawler temporary data.",
61 converter = FileConverter.class
62 )
63 private File storageFolder = new File(System.getProperty("java.io.tmpdir"), "crawler-metadata-" + UUID.randomUUID().toString());
64
65 @Parameter(names = { "-nc", "--numcrawlers" }, description = "Sets the number of crawlers.")
66 private int numCrawlers = SiteCrawler.DEFAULT_NUM_OF_CRAWLERS;
67
68 @Parameter(names = { "-mp", "--maxpages" }, description = "Max number of pages before interrupting crawl.")
69 private int maxPages = Integer.MAX_VALUE;
70
71 @Parameter(names = { "-md", "--maxdepth" }, description = "Max allowed crawler depth.")
72 private int maxDepth = Integer.MAX_VALUE;
73
74 @Parameter(names = { "-pd", "--politenessdelay" }, description = "Politeness delay in milliseconds.")
75 private int politenessDelay = Integer.MAX_VALUE;
76
77 @Override
78 public void run() throws Exception {
79 super.configure();
80
81 if (inputIRIs.size() != 1) {
82 throw new IllegalArgumentException("Expected just one seed.");
83 }
84 final URL seed = new URL(inputIRIs.get( 0 ));
85
86 if ( storageFolder.isFile() ) {
87 throw new IllegalStateException( format( "Storage folder %s can not be a file, must be a directory",
88 storageFolder ) );
89 }
90
91 if ( !storageFolder.exists() ) {
92 if ( !storageFolder.mkdirs() ) {
93 throw new IllegalStateException(
94 format( "Storage folder %s can not be created, please verify you have enough permissions",
95 storageFolder ) );
96 }
97 }
98
99 final SiteCrawlerawler.html#SiteCrawler">SiteCrawler siteCrawler = new SiteCrawler( storageFolder );
100 siteCrawler.setNumOfCrawlers( numCrawlers );
101 siteCrawler.setMaxPages( maxPages );
102 siteCrawler.setMaxDepth( maxDepth );
103 siteCrawler.setPolitenessDelay(politenessDelay);
104
105 siteCrawler.addListener(new CrawlerListener() {
106 @Override
107 public void visitedPage(Page page) {
108 final String pageURL = page.getWebURL().getURL();
109 System.err.println( format("Processing page: [%s]", pageURL) );
110
111 final ParseData parseData = page.getParseData();
112 if (parseData instanceof HtmlParseData) {
113 final HtmlParseData htmlParseData = (HtmlParseData) parseData;
114 try {
115 synchronized (roverLock) {
116 Crawler.super.performExtraction(
117 new StringDocumentSource(
118 htmlParseData.getHtml(),
119 pageURL
120
121 )
122 );
123 }
124 } catch (Exception e) {
125 System.err.println(format("Error while processing page [%s], error: %s .",
126 pageURL, e.getMessage())
127 );
128 }
129 }
130 }
131 });
132
133 Runtime.getRuntime().addShutdownHook( new Thread() {
134 @Override
135 public void run() {
136 try {
137 System.err.println( Crawler.super.printReports() );
138
139 } catch (Exception e) {
140 e.printStackTrace(System.err);
141 }
142 }
143 });
144 siteCrawler.start(seed, pageFilter, true);
145 }
146
147 public static final class PatterConverter implements IStringConverter<Pattern> {
148
149 @Override
150 public Pattern convert( String value ) {
151 try {
152 return Pattern.compile( value );
153 } catch (PatternSyntaxException pse) {
154 throw new ParameterException( format("Invalid page filter, '%s' must be a regular expression.", value) );
155 }
156 }
157
158 }
159
160 }