1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.encoding;
19
20 import org.apache.tika.detect.TextStatistics;
21 import org.apache.tika.utils.CharsetUtils;
22 import org.jsoup.nodes.Element;
23 import org.jsoup.select.Evaluator;
24 import org.jsoup.select.QueryParser;
25 import org.jsoup.select.Selector;
26 import org.rypt.f8.Utf8Statistics;
27
28 import java.io.IOException;
29 import java.io.InputStream;
30 import java.nio.charset.Charset;
31 import java.nio.charset.StandardCharsets;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34
35
36
37
38 class EncodingUtils {
39
40
41
42
43 static String iso_8859_1(InputStream is) throws IOException {
44 StringBuilder chars = new StringBuilder(Math.max(is.available(), 8192));
45 byte[] buffer = new byte[8192];
46 int n;
47 while ((n = is.read(buffer)) != -1) {
48 for (int i = 0; i < n; i++) {
49 chars.append((char) (buffer[i] & 0xFF));
50 }
51 }
52 return chars.toString();
53 }
54
55
56 static Charset correctVariant(TextStatistics stats, Charset charset) {
57 if (charset == null) {
58 return null;
59 }
60 switch (charset.name()) {
61
62 case "ISO-8859-1":
63
64
65
66
67 if ((stats.count('\r') != 0 || hasC1Control(stats)) && hasNoneOf(stats, windows1252Illegals)) {
68 try {
69 return forName("windows-1252");
70 } catch (Exception e) {
71
72 }
73 }
74 return iso_8859_1_or_15(stats);
75 case "windows-1252":
76 return hasNoneOf(stats, windows1252Illegals) ? charset : iso_8859_1_or_15(stats);
77
78
79 case "ISO-8859-2":
80
81
82 if (hasC1Control(stats) && hasNoneOf(stats, windows1250Illegals)) {
83 try {
84 return forName("windows-1250");
85 } catch (Exception e) {
86
87 }
88 }
89 return charset;
90 case "windows-1250":
91 return hasNoneOf(stats, windows1250Illegals) ? charset : charset("ISO-8859-2");
92
93
94 case "ISO-8859-7":
95
96
97 if (hasC1Control(stats) && hasNoneOf(stats, windows1253Illegals)) {
98 try {
99 return forName("windows-1253");
100 } catch (Exception e) {
101
102 }
103 }
104 return hasNoneOf(stats, iso_8859_7Illegals) ? charset : null;
105 case "windows-1253":
106 return hasNoneOf(stats, windows1253Illegals) ? charset
107 : hasNoneOf(stats, iso_8859_7Illegals) ? charset("ISO-8859-7") : null;
108
109
110 case "ISO-8859-8":
111 case "ISO-8859-8-I":
112
113
114 if (hasC1Control(stats) && hasNoneOf(stats, windows1255Illegals)) {
115 try {
116 return forName("windows-1255");
117 } catch (Exception e) {
118
119 }
120 }
121 return hasNoneOf(stats, iso_8859_8Illegals) ? charset : null;
122 case "windows-1255":
123 return hasNoneOf(stats, windows1255Illegals) ? charset
124 : hasNoneOf(stats, iso_8859_8Illegals) ? charset("ISO-8859-8") : null;
125
126
127 case "ISO-8859-9":
128
129
130 if (hasC1Control(stats) && hasNoneOf(stats, windows1254Illegals)) {
131 try {
132 return forName("windows-1254");
133 } catch (Exception e) {
134
135 }
136 }
137 return charset;
138 case "windows-1254":
139 return hasNoneOf(stats, windows1254Illegals) ? charset : charset("ISO-8859-9");
140
141
142 case "windows-1251":
143 return hasNoneOf(stats, windows1251Illegals) ? charset : null;
144 case "ISO-8859-6":
145 return hasNoneOf(stats, iso_8859_6Illegals) ? charset : null;
146 default:
147 return charset;
148 }
149 }
150
151 private static Charset iso_8859_1_or_15(TextStatistics stats) {
152
153
154 if (stats.count(0xa4) != 0) {
155 try {
156 return forName("ISO-8859-15");
157 } catch (Exception e) {
158
159 }
160 }
161 return StandardCharsets.ISO_8859_1;
162 }
163
164 private static final int[] windows1252Illegals = { 0x81, 0x8D, 0x8F, 0x90, 0x9D };
165 private static final int[] windows1250Illegals = { 0x81, 0x83, 0x88, 0x90, 0x98 };
166 private static final int[] iso_8859_7Illegals = { 0xAE, 0xD2, 0xFF };
167 private static final int[] windows1253Illegals = { 0x81, 0x88, 0x8A, 0x8C, 0x8D, 0x8E, 0x8F, 0x90, 0x98, 0x9A, 0x9C,
168 0x9D, 0x9E, 0x9F, 0xAA, 0xD2, 0xFF };
169
170 private static final int[] windows1255Illegals = { 0x81, 0x8A, 0x8C, 0x8D, 0x8E, 0x8F, 0x90, 0x9A, 0x9C, 0x9D, 0x9E,
171 0x9F, 0xCA, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xFB, 0xFC, 0xFF };
172
173 private static final int[] iso_8859_8Illegals = { 0xA1, 0xBF, 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8,
174 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA,
175 0xDB, 0xDC, 0xDD, 0xDE, 0xFB, 0xFC, 0xFF };
176
177 private static final int[] windows1254Illegals = { 0x81, 0x8D, 0x8E, 0x8F, 0x90, 0x9D, 0x9E };
178
179 private static final int[] windows1251Illegals = { 0x98 };
180
181 private static final int[] iso_8859_6Illegals = { 0xA1, 0xA2, 0xA3, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAE,
182 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBC, 0xBD, 0xBE, 0xC0, 0xDB, 0xDC,
183 0xDD, 0xDE, 0xDF, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF };
184
185 private static boolean hasNoneOf(TextStatistics stats, int[] illegals) {
186 for (int i : illegals) {
187 if (stats.count(i) != 0) {
188 return false;
189 }
190 }
191 return true;
192 }
193
194 private static boolean hasC1Control(TextStatistics ts) {
195 for (int i = 0x80; i < 0xA0; i++) {
196 if (ts.count(i) != 0) {
197 return true;
198 }
199 }
200 return false;
201 }
202
203 private static class TextStatisticsOptimizedForUtf8 extends TextStatistics {
204
205 private final Utf8Statistics utf8Stats = new Utf8Statistics();
206
207 @Override
208 public void addData(byte[] buffer, int offset, int length) {
209 super.addData(buffer, offset, length);
210 utf8Stats.write(buffer, offset, length);
211 }
212
213 @Override
214 public boolean looksLikeUTF8() {
215 return utf8Stats.looksLikeUtf8();
216 }
217 }
218
219
220
221
222 static TextStatistics stats(InputStream stream) throws IOException {
223 TextStatisticsOptimizedForUtf8 stats = new TextStatisticsOptimizedForUtf8();
224 byte[] buffer = new byte[8192];
225 int n;
226 while ((n = stream.read(buffer)) != -1) {
227 stats.addData(buffer, 0, n);
228 }
229 return stats;
230 }
231
232 static Charset forName(String charset) throws Exception {
233 try {
234 return CharsetUtils.forName(charset);
235 } catch (Exception e) {
236
237
238
239
240
241
242 charset = charset.replaceAll("(?i)-I\\b", "");
243 try {
244 return CharsetUtils.forName(charset);
245 } catch (Exception ignored) {
246 throw e;
247 }
248 }
249 }
250
251 private static Charset charset(String charset) {
252 try {
253 return forName(charset);
254 } catch (Exception e) {
255 return null;
256 }
257 }
258
259 private static final Evaluator charsetMetas = QueryParser.parse("meta[http-equiv=content-type], meta[charset]");
260
261 static Charset htmlCharset(TextStatistics stats, Element root) {
262 for (Element meta : Selector.select(charsetMetas, root)) {
263 Charset foundCharset = correctVariant(stats, charset(meta.attr("charset")));
264 if (foundCharset != null) {
265 return foundCharset;
266 }
267 foundCharset = correctVariant(stats, contentTypeCharset(meta.attr("content")));
268 if (foundCharset != null) {
269 return foundCharset;
270 }
271 }
272 return null;
273 }
274
275 private static final Pattern contentTypeCharsetPattern = Pattern
276 .compile("(?i)\\bcharset\\s*=[\\s\"']*([^\\s,;\"']+)");
277
278 static Charset contentTypeCharset(CharSequence contentType) {
279 if (contentType == null)
280 return null;
281 Matcher m = contentTypeCharsetPattern.matcher(contentType);
282 if (m.find()) {
283 try {
284 return forName(m.group(1));
285 } catch (Exception e) {
286 return null;
287 }
288 }
289 return null;
290 }
291
292 private static final Pattern xmlEncoding = Pattern
293 .compile("(?is)\\A\\s*<\\?\\s*xml\\s+[^<>]*encoding\\s*=\\s*(?:['\"]\\s*)?([-_:.a-z0-9]+)");
294
295 static Charset xmlCharset(TextStatistics stats, CharSequence str) {
296 Matcher matcher = xmlEncoding.matcher(str);
297 if (matcher.find()) {
298 return correctVariant(stats, charset(matcher.group(1)));
299 } else {
300 return null;
301 }
302 }
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334 }