1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.cli;
19
20 import com.beust.jcommander.IStringConverter;
21 import com.beust.jcommander.Parameter;
22 import com.beust.jcommander.ParameterException;
23 import com.beust.jcommander.Parameters;
24 import org.apache.any23.extractor.html.TagSoupParser;
25 import org.apache.any23.http.DefaultHTTPClient;
26 import org.apache.any23.source.DocumentSource;
27 import org.apache.any23.source.FileDocumentSource;
28 import org.apache.any23.source.HTTPDocumentSource;
29 import org.apache.any23.util.StreamUtils;
30
31 import java.io.File;
32 import java.io.InputStream;
33 import java.io.PrintStream;
34 import java.net.URISyntaxException;
35 import java.util.LinkedList;
36 import java.util.List;
37 import java.util.regex.Matcher;
38 import java.util.regex.Pattern;
39
40
41
42
43
44
45
46 @Parameters(commandNames = {
47 "microdata" }, commandDescription = "Commandline Tool for extracting Microdata from file/HTTP source.")
48 public class MicrodataParser extends BaseTool {
49
50 private static final Pattern HTTP_DOCUMENT_PATTERN = Pattern.compile("^https?://.*");
51
52 private static final Pattern FILE_DOCUMENT_PATTERN = Pattern.compile("^file:(.*)$");
53
54 @Parameter(arity = 1, description = "Input document URL, {http://path/to/resource.html|file:/path/to/localFile.html}", converter = MicrodataParserDocumentSourceConverter.class)
55 private List<DocumentSource> document = new LinkedList<DocumentSource>();
56
57 private PrintStream out = System.out;
58
59 @Override
60 PrintStream getOut() {
61 return out;
62 }
63
64 @Override
65 void setOut(PrintStream out) {
66 this.out = out;
67 }
68
69 public void run() throws Exception {
70 if (document.isEmpty()) {
71 throw new IllegalArgumentException("No input document URL specified");
72 }
73 InputStream documentInputInputStream = null;
74 try {
75 final DocumentSource documentSource = document.get(0);
76 documentInputInputStream = documentSource.openInputStream();
77 final TagSoupParserser.html#TagSoupParser">TagSoupParser tagSoupParser = new TagSoupParser(documentInputInputStream,
78 documentSource.getDocumentIRI());
79 org.apache.any23.extractor.microdata.MicrodataParser.getMicrodataAsJSON(tagSoupParser.getDOM(), out);
80 } finally {
81 if (documentInputInputStream != null)
82 StreamUtils.closeGracefully(documentInputInputStream);
83 }
84 }
85
86 public static final class MicrodataParserDocumentSourceConverter implements IStringConverter<DocumentSource> {
87
88 @Override
89 public DocumentSource convert(String value) {
90 final Matcher httpMatcher = HTTP_DOCUMENT_PATTERN.matcher(value);
91 if (httpMatcher.find()) {
92 try {
93 return new HTTPDocumentSource(DefaultHTTPClient.createInitializedHTTPClient(), value);
94 } catch (URISyntaxException e) {
95 throw new ParameterException("Invalid source IRI: '" + value + "'");
96 }
97 }
98 final Matcher fileMatcher = FILE_DOCUMENT_PATTERN.matcher(value);
99 if (fileMatcher.find()) {
100 return new FileDocumentSource(new File(fileMatcher.group(1)));
101 }
102 throw new ParameterException("Invalid source protocol: '" + value + "'");
103 }
104
105 }
106
107 }