View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.cli;
19  
20  import com.beust.jcommander.IStringConverter;
21  import com.beust.jcommander.Parameter;
22  import com.beust.jcommander.ParameterException;
23  import com.beust.jcommander.Parameters;
24  import com.beust.jcommander.converters.FileConverter;
25  import org.apache.any23.Any23;
26  import org.apache.any23.configuration.Configuration;
27  import org.apache.any23.configuration.DefaultConfiguration;
28  import org.apache.any23.configuration.Setting;
29  import org.apache.any23.configuration.Settings;
30  import org.apache.any23.extractor.ExtractionParameters;
31  import org.apache.any23.extractor.ExtractionParameters.ValidationMode;
32  import org.apache.any23.extractor.ExtractorFactory;
33  import org.apache.any23.extractor.ExtractorGroup;
34  import org.apache.any23.extractor.ExtractorRegistry;
35  import org.apache.any23.extractor.ExtractorRegistryImpl;
36  import org.apache.any23.filter.IgnoreAccidentalRDFa;
37  import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
38  import org.apache.any23.source.DocumentSource;
39  import org.apache.any23.writer.BenchmarkTripleHandler;
40  import org.apache.any23.writer.DecoratingWriterFactory;
41  import org.apache.any23.writer.TripleWriterFactory;
42  import org.apache.any23.writer.LoggingTripleHandler;
43  import org.apache.any23.writer.NTriplesWriterFactory;
44  import org.apache.any23.writer.ReportingTripleHandler;
45  import org.apache.any23.writer.TripleHandler;
46  import org.apache.any23.writer.TripleHandlerException;
47  import org.apache.any23.writer.WriterFactoryRegistry;
48  import org.slf4j.Logger;
49  import org.slf4j.LoggerFactory;
50  
51  import java.io.File;
52  import java.io.FileNotFoundException;
53  import java.io.FileOutputStream;
54  import java.io.OutputStream;
55  import java.io.OutputStreamWriter;
56  import java.io.PrintStream;
57  import java.io.PrintWriter;
58  import java.io.UnsupportedEncodingException;
59  import java.net.MalformedURLException;
60  import java.net.URL;
61  import java.nio.charset.StandardCharsets;
62  import java.util.Collections;
63  import java.util.LinkedList;
64  import java.util.List;
65  import java.util.ListIterator;
66  import java.util.Locale;
67  import java.util.Objects;
68  
69  import static java.lang.String.format;
70  
71  /**
72   * A default rover implementation. Goes and fetches a URL using an hint as to what format should require, then tries to
73   * convert it to RDF.
74   *
75   * @author Michele Mostarda (mostarda@fbk.eu)
76   * @author Richard Cyganiak (richard@cyganiak.de)
77   * @author Gabriele Renzi
78   * @author Hans Brende (hansbrende@apache.org)
79   */
80  @Parameters(commandNames = { "rover" }, commandDescription = "Apache Any23 Command Line Tool.")
81  public class Rover extends BaseTool {
82  
83      private static final Logger logger = LoggerFactory.getLogger(Rover.class);
84  
85      private static final ExtractorRegistry eRegistry = ExtractorRegistryImpl.getInstance();
86      private static final WriterFactoryRegistry registry = WriterFactoryRegistry.getInstance();
87      private static final String DEFAULT_WRITER_IDENTIFIER = NTriplesWriterFactory.IDENTIFIER;
88  
89      static {
90          final Setting<Boolean> ALWAYS_SUPPRESS_CSS_TRIPLES = Setting.create("alwayssuppresscsstriples", Boolean.TRUE);
91          final Settings supportedSettings = Settings.of(ALWAYS_SUPPRESS_CSS_TRIPLES);
92  
93          registry.register(new DecoratingWriterFactory() {
94  
95              @Override
96              public TripleHandlerrg/apache/any23/writer/TripleHandler.html#TripleHandler">TripleHandler getTripleWriter(TripleHandler delegate, Settings settings) {
97                  boolean always = settings.get(ALWAYS_SUPPRESS_CSS_TRIPLES);
98                  return new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(delegate), always);
99              }
100 
101             @Override
102             public Settings getSupportedSettings() {
103                 return supportedSettings;
104             }
105 
106             @Override
107             public String getIdentifier() {
108                 return "notrivial";
109             }
110         });
111     }
112 
113     @Parameter(names = { "-o",
114             "--output" }, description = "Specify Output file (defaults to standard output)", converter = PrintStreamConverter.class)
115     private PrintStream outputStream = System.out;
116 
117     @Parameter(description = "input IRIs {<url>|<file>}+", converter = ArgumentToIRIConverter.class)
118     protected List<String> inputIRIs = new LinkedList<>();
119 
120     @Parameter(names = { "-e", "--extractors" }, description = "a comma-separated list of extractors, "
121             + "e.g. rdf-xml,rdf-turtle, etc. A complete extractor list can be obtained by calling ./any23 extractor --list")
122     private List<String> extractors = new LinkedList<String>() {
123         {
124             addAll(eRegistry.getAllNames());
125         }
126     };
127 
128     @Parameter(names = { "-f",
129             "--format" }, description = "a comma-separated list of writer factories, e.g. json,jsonld,nquads,notrivial,ntriples,trix,turtle,uri")
130     private List<String> formats = new LinkedList<String>() {
131         {
132             add(DEFAULT_WRITER_IDENTIFIER);
133         }
134     };
135 
136     @Parameter(names = { "-l", "--log" }, description = "Produce log within a file.", converter = FileConverter.class)
137     private File logFile = null;
138 
139     @Parameter(names = { "-s", "--stats" }, description = "Print out extraction statistics.")
140     private boolean statistics;
141 
142     @Parameter(names = { "-t",
143             "--notrivial" }, description = "Filter trivial statements (e.g. CSS related ones). [DEPRECATED: As of version 2.3, use --format instead.]")
144     private boolean noTrivial;
145 
146     @Parameter(names = { "-p",
147             "--pedantic" }, description = "Validate and fixes HTML content detecting commons issues.")
148     private boolean pedantic;
149 
150     @Parameter(names = { "-n", "--nesting" }, description = "Disable production of nesting triples.")
151     private boolean nestingDisabled;
152 
153     @Parameter(names = { "-d",
154             "--defaultns" }, description = "Override the default namespace used to produce statements.")
155     private String defaultns;
156 
157     // non parameters
158 
159     private TripleHandler tripleHandler;
160 
161     private ReportingTripleHandler reportingTripleHandler;
162 
163     private BenchmarkTripleHandler benchmarkTripleHandler;
164 
165     private Any23 any23;
166 
167     private ExtractionParameters extractionParameters;
168 
169     @Override
170     PrintStream getOut() {
171         return outputStream;
172     }
173 
174     @Override
175     void setOut(PrintStream out) {
176         outputStream = out;
177     }
178 
179     private static TripleHandler getWriter(String id, OutputStream os) {
180         TripleWriterFactory/../org/apache/any23/writer/TripleWriterFactory.html#TripleWriterFactory">TripleWriterFactory f = (TripleWriterFactory) registry.getWriterByIdentifier(id);
181         Objects.requireNonNull(f,
182                 () -> "Invalid writer id '" + id + "'; admitted values: " + registry.getIdentifiers());
183         return f.getTripleWriter(os, Settings.of()); // TODO parse TripleWriter settings from format list
184     }
185 
186     private static TripleHandlerache/any23/writer/TripleHandler.html#TripleHandler">TripleHandler getWriter(String id, TripleHandler delegate) {
187         DecoratingWriterFactoryorg/apache/any23/writer/DecoratingWriterFactory.html#DecoratingWriterFactory">DecoratingWriterFactory f = (DecoratingWriterFactory) registry.getWriterByIdentifier(id);
188         Objects.requireNonNull(f,
189                 () -> "Invalid writer id '" + id + "'; admitted values: " + registry.getIdentifiers());
190         return f.getTripleWriter(delegate, Settings.of()); // TODO parse delegate settings from format list
191     }
192 
193     protected void configure() {
194         List<String> formats = this.formats;
195         if (formats.isEmpty()) {
196             formats = Collections.singletonList(DEFAULT_WRITER_IDENTIFIER);
197         }
198         ListIterator<String> l = formats.listIterator(formats.size());
199         tripleHandler = getWriter(l.previous(), outputStream);
200 
201         while (l.hasPrevious()) {
202             tripleHandler = getWriter(l.previous(), tripleHandler);
203         }
204 
205         if (logFile != null) {
206             try {
207                 tripleHandler = new LoggingTripleHandler(tripleHandler,
208                         new PrintWriter(new OutputStreamWriter(new FileOutputStream(logFile), StandardCharsets.UTF_8)));
209             } catch (FileNotFoundException fnfe) {
210                 throw new IllegalArgumentException(format(Locale.ROOT, "Can not write to log file [%s]", logFile),
211                         fnfe);
212             }
213         }
214 
215         if (statistics) {
216             benchmarkTripleHandler = new BenchmarkTripleHandler(tripleHandler);
217             tripleHandler = benchmarkTripleHandler;
218         }
219 
220         if (noTrivial) {
221             tripleHandler = new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(tripleHandler), true); // suppress
222                                                                                                              // stylesheet
223                                                                                                              // triples.
224         }
225 
226         reportingTripleHandler = new ReportingTripleHandler(tripleHandler);
227 
228         final Configuration configuration = DefaultConfiguration.singleton();
229         extractionParameters = pedantic
230                 ? new ExtractionParameters(configuration, ValidationMode.VALIDATE_AND_FIX, nestingDisabled)
231                 : new ExtractionParameters(configuration, ValidationMode.NONE, nestingDisabled);
232         if (defaultns != null) {
233             extractionParameters.setProperty(ExtractionParameters.EXTRACTION_CONTEXT_IRI_PROPERTY, defaultns);
234         }
235 
236         any23 = (extractors.isEmpty()) ? new Any23.html#Any23">Any23() : new Any23(extractors.toArray(new String[extractors.size()]));
237         any23.setHTTPUserAgent(Any23.DEFAULT_HTTP_CLIENT_USER_AGENT + "/" + Any23.VERSION);
238     }
239 
240     protected String printReports() {
241         final StringBuilder sb = new StringBuilder();
242         if (benchmarkTripleHandler != null)
243             sb.append(benchmarkTripleHandler.report()).append('\n');
244         if (reportingTripleHandler != null)
245             sb.append(reportingTripleHandler.printReport()).append('\n');
246         return sb.toString();
247     }
248 
249     protected void performExtraction(DocumentSource documentSource) throws Exception {
250         if (!any23.extract(extractionParameters, documentSource, reportingTripleHandler).hasMatchingExtractors()) {
251             throw new IllegalStateException(
252                     format(Locale.ROOT, "No suitable extractors found for source %s", documentSource.getDocumentIRI()));
253         }
254     }
255 
256     protected void close() {
257         if (tripleHandler != null) {
258             try {
259                 tripleHandler.close();
260             } catch (TripleHandlerException the) {
261                 throw new RuntimeException("Error while closing TripleHandler", the);
262             }
263         }
264 
265         if (outputStream != null && outputStream != System.out) { // TODO: low - find better solution to avoid closing
266                                                                   // system out.
267             outputStream.close();
268         }
269     }
270 
271     public void run() throws Exception {
272         if (inputIRIs.isEmpty()) {
273             throw new IllegalArgumentException("Expected at least 1 argument.");
274         }
275 
276         configure();
277 
278         // perform conversions
279 
280         try {
281             final long start = System.currentTimeMillis();
282             for (String inputIRI : inputIRIs) {
283                 DocumentSource source = any23.createDocumentSource(inputIRI);
284 
285                 performExtraction(source);
286             }
287             final long elapsed = System.currentTimeMillis() - start;
288 
289             if (benchmarkTripleHandler != null) {
290                 System.err.println(benchmarkTripleHandler.report());
291             }
292 
293             logger.info("Extractors used: " + reportingTripleHandler.getExtractorNames());
294             logger.info(reportingTripleHandler.getTotalTriples() + " triples, " + elapsed + "ms");
295         } finally {
296             close();
297         }
298     }
299 
300     public static final class ArgumentToIRIConverter implements IStringConverter<String> {
301 
302         @Override
303         public String convert(String uri) {
304             uri = uri.trim();
305             if (uri.toLowerCase(Locale.ROOT).startsWith("http:") || uri.toLowerCase(Locale.ROOT).startsWith("https:")) {
306                 try {
307                     return new URL(uri).toString();
308                 } catch (MalformedURLException murle) {
309                     throw new ParameterException(format(Locale.ROOT, "Invalid IRI: '%s': %s", uri, murle.getMessage()));
310                 }
311             }
312 
313             final File f = new File(uri);
314             if (!f.exists()) {
315                 throw new ParameterException(format(Locale.ROOT, "No such file: [%s]", f.getAbsolutePath()));
316             }
317             if (f.isDirectory()) {
318                 throw new ParameterException(format(Locale.ROOT, "Found a directory: [%s]", f.getAbsolutePath()));
319             }
320             return f.toURI().toString();
321         }
322 
323     }
324 
325     public static final class PrintStreamConverter implements IStringConverter<PrintStream> {
326 
327         @Override
328         public PrintStream convert(String value) {
329             final File file = new File(value);
330             try {
331                 return new PrintStream(new FileOutputStream(file), true, "UTF-8");
332             } catch (FileNotFoundException fnfe) {
333                 throw new ParameterException(format(Locale.ROOT, "Cannot open file '%s': %s", file, fnfe.getMessage()));
334             } catch (UnsupportedEncodingException e) {
335                 throw new RuntimeException("Error converting to PrintStream with UTF-8 encoding.", e);
336             }
337         }
338 
339     }
340 
341 }