View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.cli;
19  
20  import com.beust.jcommander.IStringConverter;
21  import com.beust.jcommander.Parameter;
22  import com.beust.jcommander.ParameterException;
23  import com.beust.jcommander.Parameters;
24  import org.apache.any23.extractor.html.TagSoupParser;
25  import org.apache.any23.http.DefaultHTTPClient;
26  import org.apache.any23.source.DocumentSource;
27  import org.apache.any23.source.FileDocumentSource;
28  import org.apache.any23.source.HTTPDocumentSource;
29  import org.apache.any23.util.StreamUtils;
30  
31  import java.io.File;
32  import java.io.InputStream;
33  import java.io.PrintStream;
34  import java.net.URISyntaxException;
35  import java.util.LinkedList;
36  import java.util.List;
37  import java.util.regex.Matcher;
38  import java.util.regex.Pattern;
39  
40  /**
41   * Command line <i>Microdata</i> parser, accepting both files and URLs and returing a <i>JSON</i> representation of the
42   * extracted metadata as described at <a href="http://www.w3.org/TR/microdata/#json">Microdata JSON Specification</a>.
43   *
44   * @author Michele Mostarda (mostarda@fbk.eu)
45   */
46  @Parameters(commandNames = {
47          "microdata" }, commandDescription = "Commandline Tool for extracting Microdata from file/HTTP source.")
48  public class MicrodataParser extends BaseTool {
49  
50      private static final Pattern HTTP_DOCUMENT_PATTERN = Pattern.compile("^https?://.*");
51  
52      private static final Pattern FILE_DOCUMENT_PATTERN = Pattern.compile("^file:(.*)$");
53  
54      @Parameter(arity = 1, description = "Input document URL, {http://path/to/resource.html|file:/path/to/localFile.html}", converter = MicrodataParserDocumentSourceConverter.class)
55      private List<DocumentSource> document = new LinkedList<DocumentSource>();
56  
57      private PrintStream out = System.out;
58  
59      @Override
60      PrintStream getOut() {
61          return out;
62      }
63  
64      @Override
65      void setOut(PrintStream out) {
66          this.out = out;
67      }
68  
69      public void run() throws Exception {
70          if (document.isEmpty()) {
71              throw new IllegalArgumentException("No input document URL specified");
72          }
73          InputStream documentInputInputStream = null;
74          try {
75              final DocumentSource documentSource = document.get(0);
76              documentInputInputStream = documentSource.openInputStream();
77              final TagSoupParserser.html#TagSoupParser">TagSoupParser tagSoupParser = new TagSoupParser(documentInputInputStream,
78                      documentSource.getDocumentIRI());
79              org.apache.any23.extractor.microdata.MicrodataParser.getMicrodataAsJSON(tagSoupParser.getDOM(), out);
80          } finally {
81              if (documentInputInputStream != null)
82                  StreamUtils.closeGracefully(documentInputInputStream);
83          }
84      }
85  
86      public static final class MicrodataParserDocumentSourceConverter implements IStringConverter<DocumentSource> {
87  
88          @Override
89          public DocumentSource convert(String value) {
90              final Matcher httpMatcher = HTTP_DOCUMENT_PATTERN.matcher(value);
91              if (httpMatcher.find()) {
92                  try {
93                      return new HTTPDocumentSource(DefaultHTTPClient.createInitializedHTTPClient(), value);
94                  } catch (URISyntaxException e) {
95                      throw new ParameterException("Invalid source IRI: '" + value + "'");
96                  }
97              }
98              final Matcher fileMatcher = FILE_DOCUMENT_PATTERN.matcher(value);
99              if (fileMatcher.find()) {
100                 return new FileDocumentSource(new File(fileMatcher.group(1)));
101             }
102             throw new ParameterException("Invalid source protocol: '" + value + "'");
103         }
104 
105     }
106 
107 }