1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.cli;
19
20 import com.beust.jcommander.IStringConverter;
21 import com.beust.jcommander.Parameter;
22 import com.beust.jcommander.ParameterException;
23 import com.beust.jcommander.Parameters;
24 import com.beust.jcommander.converters.FileConverter;
25 import org.apache.any23.Any23;
26 import org.apache.any23.configuration.Configuration;
27 import org.apache.any23.configuration.DefaultConfiguration;
28 import org.apache.any23.configuration.Setting;
29 import org.apache.any23.configuration.Settings;
30 import org.apache.any23.extractor.ExtractionParameters;
31 import org.apache.any23.extractor.ExtractionParameters.ValidationMode;
32 import org.apache.any23.extractor.ExtractorFactory;
33 import org.apache.any23.extractor.ExtractorGroup;
34 import org.apache.any23.extractor.ExtractorRegistry;
35 import org.apache.any23.extractor.ExtractorRegistryImpl;
36 import org.apache.any23.filter.IgnoreAccidentalRDFa;
37 import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
38 import org.apache.any23.source.DocumentSource;
39 import org.apache.any23.writer.BenchmarkTripleHandler;
40 import org.apache.any23.writer.DecoratingWriterFactory;
41 import org.apache.any23.writer.TripleWriterFactory;
42 import org.apache.any23.writer.LoggingTripleHandler;
43 import org.apache.any23.writer.NTriplesWriterFactory;
44 import org.apache.any23.writer.ReportingTripleHandler;
45 import org.apache.any23.writer.TripleHandler;
46 import org.apache.any23.writer.TripleHandlerException;
47 import org.apache.any23.writer.WriterFactoryRegistry;
48 import org.slf4j.Logger;
49 import org.slf4j.LoggerFactory;
50
51 import java.io.File;
52 import java.io.FileNotFoundException;
53 import java.io.FileOutputStream;
54 import java.io.OutputStream;
55 import java.io.OutputStreamWriter;
56 import java.io.PrintStream;
57 import java.io.PrintWriter;
58 import java.io.UnsupportedEncodingException;
59 import java.net.MalformedURLException;
60 import java.net.URL;
61 import java.nio.charset.StandardCharsets;
62 import java.util.Collections;
63 import java.util.LinkedList;
64 import java.util.List;
65 import java.util.ListIterator;
66 import java.util.Locale;
67 import java.util.Objects;
68
69 import static java.lang.String.format;
70
71
72
73
74
75
76
77
78
79
80 @Parameters(commandNames = { "rover" }, commandDescription = "Apache Any23 Command Line Tool.")
81 public class Rover extends BaseTool {
82
/** Logger for this tool. */
private static final Logger logger = LoggerFactory.getLogger(Rover.class);

/** Extractor registry; supplies the default value of the extractor-name option. */
private static final ExtractorRegistry eRegistry = ExtractorRegistryImpl.getInstance();

/** Writer factory registry used to look up writer factories by identifier. */
private static final WriterFactoryRegistry registry = WriterFactoryRegistry.getInstance();

/** Identifier of the writer used when no format is given. */
private static final String DEFAULT_WRITER_IDENTIFIER = NTriplesWriterFactory.IDENTIFIER;

// Registers a decorating writer factory under the identifier "notrivial". The
// handler it produces wraps its delegate in IgnoreTitlesOfEmptyDocuments and
// then IgnoreAccidentalRDFa.
static {
    final Setting<Boolean> alwaysSuppressCssTriples = Setting.create("alwayssuppresscsstriples", Boolean.TRUE);
    final Settings supportedSettings = Settings.of(alwaysSuppressCssTriples);

    registry.register(new DecoratingWriterFactory() {

        @Override
        public TripleHandler getTripleWriter(TripleHandler delegate, Settings settings) {
            final boolean always = settings.get(alwaysSuppressCssTriples);
            return new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(delegate), always);
        }

        @Override
        public Settings getSupportedSettings() {
            return supportedSettings;
        }

        @Override
        public String getIdentifier() {
            return "notrivial";
        }
    });
}
112
113 @Parameter(names = { "-o",
114 "--output" }, description = "Specify Output file (defaults to standard output)", converter = PrintStreamConverter.class)
115 private PrintStream outputStream = System.out;
116
117 @Parameter(description = "input IRIs {<url>|<file>}+", converter = ArgumentToIRIConverter.class)
118 protected List<String> inputIRIs = new LinkedList<>();
119
120 @Parameter(names = { "-e", "--extractors" }, description = "a comma-separated list of extractors, "
121 + "e.g. rdf-xml,rdf-turtle, etc. A complete extractor list can be obtained by calling ./any23 extractor --list")
122 private List<String> extractors = new LinkedList<String>() {
123 {
124 addAll(eRegistry.getAllNames());
125 }
126 };
127
128 @Parameter(names = { "-f",
129 "--format" }, description = "a comma-separated list of writer factories, e.g. json,jsonld,nquads,notrivial,ntriples,trix,turtle,uri")
130 private List<String> formats = new LinkedList<String>() {
131 {
132 add(DEFAULT_WRITER_IDENTIFIER);
133 }
134 };
135
136 @Parameter(names = { "-l", "--log" }, description = "Produce log within a file.", converter = FileConverter.class)
137 private File logFile = null;
138
139 @Parameter(names = { "-s", "--stats" }, description = "Print out extraction statistics.")
140 private boolean statistics;
141
142 @Parameter(names = { "-t",
143 "--notrivial" }, description = "Filter trivial statements (e.g. CSS related ones). [DEPRECATED: As of version 2.3, use --format instead.]")
144 private boolean noTrivial;
145
146 @Parameter(names = { "-p",
147 "--pedantic" }, description = "Validate and fixes HTML content detecting commons issues.")
148 private boolean pedantic;
149
150 @Parameter(names = { "-n", "--nesting" }, description = "Disable production of nesting triples.")
151 private boolean nestingDisabled;
152
153 @Parameter(names = { "-d",
154 "--defaultns" }, description = "Override the default namespace used to produce statements.")
155 private String defaultns;
156
157
158
159 private TripleHandler tripleHandler;
160
161 private ReportingTripleHandler reportingTripleHandler;
162
163 private BenchmarkTripleHandler benchmarkTripleHandler;
164
165 private Any23 any23;
166
167 private ExtractionParameters extractionParameters;
168
169 @Override
170 PrintStream getOut() {
171 return outputStream;
172 }
173
174 @Override
175 void setOut(PrintStream out) {
176 outputStream = out;
177 }
178
179 private static TripleHandler getWriter(String id, OutputStream os) {
180 TripleWriterFactory/../org/apache/any23/writer/TripleWriterFactory.html#TripleWriterFactory">TripleWriterFactory f = (TripleWriterFactory) registry.getWriterByIdentifier(id);
181 Objects.requireNonNull(f,
182 () -> "Invalid writer id '" + id + "'; admitted values: " + registry.getIdentifiers());
183 return f.getTripleWriter(os, Settings.of());
184 }
185
186 private static TripleHandlerache/any23/writer/TripleHandler.html#TripleHandler">TripleHandler getWriter(String id, TripleHandler delegate) {
187 DecoratingWriterFactoryorg/apache/any23/writer/DecoratingWriterFactory.html#DecoratingWriterFactory">DecoratingWriterFactory f = (DecoratingWriterFactory) registry.getWriterByIdentifier(id);
188 Objects.requireNonNull(f,
189 () -> "Invalid writer id '" + id + "'; admitted values: " + registry.getIdentifiers());
190 return f.getTripleWriter(delegate, Settings.of());
191 }
192
193 protected void configure() {
194 List<String> formats = this.formats;
195 if (formats.isEmpty()) {
196 formats = Collections.singletonList(DEFAULT_WRITER_IDENTIFIER);
197 }
198 ListIterator<String> l = formats.listIterator(formats.size());
199 tripleHandler = getWriter(l.previous(), outputStream);
200
201 while (l.hasPrevious()) {
202 tripleHandler = getWriter(l.previous(), tripleHandler);
203 }
204
205 if (logFile != null) {
206 try {
207 tripleHandler = new LoggingTripleHandler(tripleHandler,
208 new PrintWriter(new OutputStreamWriter(new FileOutputStream(logFile), StandardCharsets.UTF_8)));
209 } catch (FileNotFoundException fnfe) {
210 throw new IllegalArgumentException(format(Locale.ROOT, "Can not write to log file [%s]", logFile),
211 fnfe);
212 }
213 }
214
215 if (statistics) {
216 benchmarkTripleHandler = new BenchmarkTripleHandler(tripleHandler);
217 tripleHandler = benchmarkTripleHandler;
218 }
219
220 if (noTrivial) {
221 tripleHandler = new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(tripleHandler), true);
222
223
224 }
225
226 reportingTripleHandler = new ReportingTripleHandler(tripleHandler);
227
228 final Configuration configuration = DefaultConfiguration.singleton();
229 extractionParameters = pedantic
230 ? new ExtractionParameters(configuration, ValidationMode.VALIDATE_AND_FIX, nestingDisabled)
231 : new ExtractionParameters(configuration, ValidationMode.NONE, nestingDisabled);
232 if (defaultns != null) {
233 extractionParameters.setProperty(ExtractionParameters.EXTRACTION_CONTEXT_IRI_PROPERTY, defaultns);
234 }
235
236 any23 = (extractors.isEmpty()) ? new Any23.html#Any23">Any23() : new Any23(extractors.toArray(new String[extractors.size()]));
237 any23.setHTTPUserAgent(Any23.DEFAULT_HTTP_CLIENT_USER_AGENT + "/" + Any23.VERSION);
238 }
239
240 protected String printReports() {
241 final StringBuilder sb = new StringBuilder();
242 if (benchmarkTripleHandler != null)
243 sb.append(benchmarkTripleHandler.report()).append('\n');
244 if (reportingTripleHandler != null)
245 sb.append(reportingTripleHandler.printReport()).append('\n');
246 return sb.toString();
247 }
248
249 protected void performExtraction(DocumentSource documentSource) throws Exception {
250 if (!any23.extract(extractionParameters, documentSource, reportingTripleHandler).hasMatchingExtractors()) {
251 throw new IllegalStateException(
252 format(Locale.ROOT, "No suitable extractors found for source %s", documentSource.getDocumentIRI()));
253 }
254 }
255
256 protected void close() {
257 if (tripleHandler != null) {
258 try {
259 tripleHandler.close();
260 } catch (TripleHandlerException the) {
261 throw new RuntimeException("Error while closing TripleHandler", the);
262 }
263 }
264
265 if (outputStream != null && outputStream != System.out) {
266
267 outputStream.close();
268 }
269 }
270
271 public void run() throws Exception {
272 if (inputIRIs.isEmpty()) {
273 throw new IllegalArgumentException("Expected at least 1 argument.");
274 }
275
276 configure();
277
278
279
280 try {
281 final long start = System.currentTimeMillis();
282 for (String inputIRI : inputIRIs) {
283 DocumentSource source = any23.createDocumentSource(inputIRI);
284
285 performExtraction(source);
286 }
287 final long elapsed = System.currentTimeMillis() - start;
288
289 if (benchmarkTripleHandler != null) {
290 System.err.println(benchmarkTripleHandler.report());
291 }
292
293 logger.info("Extractors used: " + reportingTripleHandler.getExtractorNames());
294 logger.info(reportingTripleHandler.getTotalTriples() + " triples, " + elapsed + "ms");
295 } finally {
296 close();
297 }
298 }
299
300 public static final class ArgumentToIRIConverter implements IStringConverter<String> {
301
302 @Override
303 public String convert(String uri) {
304 uri = uri.trim();
305 if (uri.toLowerCase(Locale.ROOT).startsWith("http:") || uri.toLowerCase(Locale.ROOT).startsWith("https:")) {
306 try {
307 return new URL(uri).toString();
308 } catch (MalformedURLException murle) {
309 throw new ParameterException(format(Locale.ROOT, "Invalid IRI: '%s': %s", uri, murle.getMessage()));
310 }
311 }
312
313 final File f = new File(uri);
314 if (!f.exists()) {
315 throw new ParameterException(format(Locale.ROOT, "No such file: [%s]", f.getAbsolutePath()));
316 }
317 if (f.isDirectory()) {
318 throw new ParameterException(format(Locale.ROOT, "Found a directory: [%s]", f.getAbsolutePath()));
319 }
320 return f.toURI().toString();
321 }
322
323 }
324
325 public static final class PrintStreamConverter implements IStringConverter<PrintStream> {
326
327 @Override
328 public PrintStream convert(String value) {
329 final File file = new File(value);
330 try {
331 return new PrintStream(new FileOutputStream(file), true, "UTF-8");
332 } catch (FileNotFoundException fnfe) {
333 throw new ParameterException(format(Locale.ROOT, "Cannot open file '%s': %s", file, fnfe.getMessage()));
334 } catch (UnsupportedEncodingException e) {
335 throw new RuntimeException("Error converting to PrintStream with UTF-8 encoding.", e);
336 }
337 }
338
339 }
340
341 }