1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.encoding;
19
20 import org.apache.tika.detect.TextStatistics;
21 import org.apache.tika.parser.txt.CharsetDetector;
22 import org.apache.tika.parser.txt.CharsetMatch;
23 import org.jsoup.nodes.Comment;
24 import org.jsoup.nodes.DataNode;
25 import org.jsoup.nodes.Document;
26 import org.jsoup.nodes.DocumentType;
27 import org.jsoup.nodes.Element;
28 import org.jsoup.nodes.Node;
29 import org.jsoup.nodes.PseudoTextElement;
30 import org.jsoup.nodes.TextNode;
31 import org.jsoup.parser.ParseError;
32 import org.jsoup.parser.ParseErrorList;
33 import org.jsoup.parser.Parser;
34 import org.jsoup.select.NodeTraversor;
35 import org.jsoup.select.NodeVisitor;
36
37 import java.io.BufferedInputStream;
38 import java.io.IOException;
39 import java.io.InputStream;
40 import java.nio.charset.Charset;
41
42 import static java.nio.charset.StandardCharsets.UTF_8;
43 import static java.nio.charset.StandardCharsets.ISO_8859_1;
44
45
46
47
48
49
50
51
52
53
54 public class TikaEncodingDetector implements EncodingDetector {
55
56 @Override
57 public String guessEncoding(InputStream input) throws IOException {
58 return guessEncoding(input, (String) null);
59 }
60
61 private static final String TAG_CHARS = "< />";
62 private static final byte[] TAG_BYTES = TAG_CHARS.getBytes(UTF_8);
63 private static final Node[] EMPTY_NODES = new Node[0];
64
65 private static Charset guessEncoding(InputStream is, Charset declared) throws IOException {
66 if (!is.markSupported()) {
67 is = new BufferedInputStream(is);
68 }
69
70 TextStatistics stats = computeAndReset(is, EncodingUtils::stats);
71
72
73 if (stats.looksLikeUTF8()) {
74
75
76
77 return UTF_8;
78 }
79
80 declared = EncodingUtils.correctVariant(stats, declared);
81 if (declared != null) {
82 return declared;
83 }
84
85
86
87 String iso_8859_1 = computeAndReset(is, EncodingUtils::iso_8859_1);
88
89 Charset xmlCharset = EncodingUtils.xmlCharset(stats, iso_8859_1);
90 if (xmlCharset != null) {
91 return xmlCharset;
92 }
93
94 ParseErrorList htmlErrors = ParseErrorList.tracking(Integer.MAX_VALUE);
95 Document doc = parseFragment(iso_8859_1, htmlErrors);
96
97 Charset htmlCharset = EncodingUtils.htmlCharset(stats, doc);
98
99 if (htmlCharset != null) {
100 return htmlCharset;
101 }
102
103 if (stats.countEightBit() == 0) {
104
105 return UTF_8;
106 }
107
108
109
110
111 long openTags = countTags(doc);
112 long badTags = htmlErrors.stream().map(ParseError::getErrorMessage)
113 .filter(err -> err != null && err.matches(".*'[</>]'.*")).count();
114
115
116 boolean filterInput = true;
117 if (openTags < 5 || openTags / 5 < badTags) {
118 filterInput = false;
119 } else {
120 String wholeText = wholeText(doc);
121 if (wholeText.length() < 100 && iso_8859_1.length() > 600) {
122 filterInput = false;
123 } else {
124 iso_8859_1 = wholeText;
125 }
126 }
127 byte[] text = iso_8859_1.getBytes(ISO_8859_1);
128
129 CharsetDetector icu4j = new CharsetDetector(text.length);
130 icu4j.setText(text);
131
132 for (CharsetMatch match : icu4j.detectAll()) {
133 try {
134 Charset charset = EncodingUtils.forName(match.getName());
135
136
137
138
139 if (filterInput && !TAG_CHARS.equals(new String(TAG_BYTES, charset))) {
140 continue;
141 }
142
143 charset = EncodingUtils.correctVariant(stats, charset);
144 if (charset != null) {
145 return charset;
146 }
147 } catch (Exception e) {
148
149 }
150 }
151
152
153
154 return EncodingUtils.correctVariant(stats, ISO_8859_1);
155 }
156
157 @Override
158 public String guessEncoding(InputStream is, String contentType) throws IOException {
159 Charset charset = EncodingUtils.contentTypeCharset(contentType);
160 return guessEncoding(is, charset).name();
161 }
162
163
164
165
166
167 @FunctionalInterface
168 private interface InputStreamFunction<E> {
169 E compute(InputStream is) throws IOException;
170 }
171
172 private static <E> E computeAndReset(InputStream is, InputStreamFunction<E> function) throws IOException {
173 is.mark(Integer.MAX_VALUE);
174 try {
175 return function.compute(is);
176 } finally {
177 is.reset();
178 }
179 }
180
181 private static Document parseFragment(String html, ParseErrorList errors) {
182 Document doc = new Document("");
183 Node[] childNodes = Parser.parseFragment(html, null, "", errors).toArray(EMPTY_NODES);
184 for (Node node : childNodes) {
185 if (node.parentNode() != null) {
186 node.remove();
187 }
188 doc.appendChild(node);
189 }
190 return doc;
191 }
192
193 private static long countTags(Node node) {
194 long[] ret = { 0 };
195 NodeTraversor.traverse(new NodeVisitor() {
196 @Override
197 public void head(Node node, int depth) {
198 if (node instanceof Document || node instanceof PseudoTextElement) {
199
200 return;
201 }
202 if (node instanceof Element || node instanceof DocumentType || node instanceof Comment) {
203 ret[0] += node.childNodeSize() == 0 ? 1 : 2;
204 }
205 }
206
207 @Override
208 public void tail(Node node, int depth) {
209 }
210 }, node);
211 return ret[0];
212 }
213
214 private static String wholeText(Node node) {
215 StringBuilder sb = new StringBuilder();
216 NodeTraversor.traverse(new NodeVisitor() {
217 @Override
218 public void head(Node node, int depth) {
219 if (node instanceof TextNode) {
220 sb.append(((TextNode) node).getWholeText());
221 } else if (node instanceof DataNode) {
222 String data = ((DataNode) node).getWholeData();
223 do {
224
225
226 if ("script".equalsIgnoreCase(node.nodeName())) {
227 if (node.attr("type").toLowerCase(java.util.Locale.ROOT).contains("json")) {
228 sb.append(data);
229 }
230 break;
231 } else if ("style".equalsIgnoreCase(node.nodeName())) {
232 break;
233 }
234 node = node.parentNode();
235 } while (node != null);
236 } else if (node instanceof Comment) {
237 String data = ((Comment) node).getData();
238
239 if (!data.contains("<!") && !data.contains("<?")) {
240 sb.append(data);
241 }
242 } else if (node instanceof Element) {
243
244 sb.append(node.attr("content"));
245 }
246 }
247
248 @Override
249 public void tail(Node node, int depth) {
250 }
251 }, node);
252 return sb.toString();
253 }
254
255 }