1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.validator.DefaultValidator;
21 import org.apache.any23.validator.Validator;
22 import org.apache.any23.validator.ValidatorException;
23 import org.slf4j.Logger;
24 import org.slf4j.LoggerFactory;
25 import org.w3c.dom.Document;
26
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.net.URI;
30 import java.net.URISyntaxException;
31 import java.nio.charset.Charset;
32 import java.nio.charset.UnsupportedCharsetException;
33 import java.util.Locale;
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52 public class TagSoupParser {
53
54 public static final String ELEMENT_LOCATION = "Element-Location";
55
56 private final static Logger logger = LoggerFactory.getLogger(TagSoupParser.class);
57
58 private final InputStream input;
59
60 private final String documentIRI;
61
62 private final String encoding;
63
64 private final TagSoupParsingConfiguration config;
65
66 private Document result = null;
67
68 public TagSoupParser(InputStream input, String documentIRI) {
69 this.input = input;
70 this.documentIRI = documentIRI;
71 this.encoding = null;
72
73 config = TagSoupParsingConfiguration.getDefault();
74 }
75
76 public TagSoupParser(InputStream input, String documentIRI, String encoding) {
77 if (encoding != null && !Charset.isSupported(encoding))
78 throw new UnsupportedCharsetException(String.format(Locale.ROOT, "Charset %s is not supported", encoding));
79
80 this.input = input;
81 this.documentIRI = documentIRI;
82 this.encoding = encoding;
83
84 config = TagSoupParsingConfiguration.getDefault();
85 }
86
87
88
89
90
91
92
93
94
95 public Document getDOM() throws IOException {
96 if (result == null) {
97 long startTime = System.currentTimeMillis();
98 try {
99 result = config.parse(input, documentIRI, encoding);
100 } finally {
101 long elapsed = System.currentTimeMillis() - startTime;
102 logger.debug("Parsed " + documentIRI + " with " + config.name() + ", " + elapsed + "ms");
103 }
104 }
105 result.setDocumentURI(documentIRI);
106 return result;
107 }
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124 public DocumentReport getValidatedDOM(boolean applyFix) throws IOException, ValidatorException {
125 final URI dIRI;
126 try {
127 dIRI = new URI(documentIRI);
128 } catch (IllegalArgumentException | URISyntaxException urise) {
129 throw new ValidatorException("Error while performing validation, invalid document IRI.", urise);
130 }
131 Validator validator = new DefaultValidator();
132 Document document = getDOM();
133 return new DocumentReport(validator.validate(dIRI, document, applyFix), document);
134 }
135
136
137
138
139 public static class ElementLocation {
140
141 private int beginLineNumber;
142 private int beginColumnNumber;
143 private int endLineNumber;
144 private int endColumnNumber;
145
146 private ElementLocation(int beginLineNumber, int beginColumnNumber, int endLineNumber, int endColumnNumber) {
147 this.beginLineNumber = beginLineNumber;
148 this.beginColumnNumber = beginColumnNumber;
149 this.endLineNumber = endLineNumber;
150 this.endColumnNumber = endColumnNumber;
151 }
152
153 public int getBeginLineNumber() {
154 return beginLineNumber;
155 }
156
157 public int getBeginColumnNumber() {
158 return beginColumnNumber;
159 }
160
161 public int getEndLineNumber() {
162 return endLineNumber;
163 }
164
165 public int getEndColumnNumber() {
166 return endColumnNumber;
167 }
168 }
169
170 }