1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.mime;
19
20 import org.apache.any23.extractor.csv.CSVReaderBuilder;
21 import org.apache.any23.mime.purifier.Purifier;
22 import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
23 import org.apache.tika.Tika;
24 import org.apache.tika.config.TikaConfig;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.metadata.TikaCoreProperties;
27 import org.apache.tika.mime.MimeType;
28 import org.apache.tika.mime.MimeTypeException;
29 import org.apache.tika.mime.MimeTypes;
30 import org.eclipse.rdf4j.rio.RDFFormat;
31 import org.eclipse.rdf4j.rio.RDFParser;
32 import org.eclipse.rdf4j.rio.Rio;
33 import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
34
35 import java.io.BufferedReader;
36 import java.io.ByteArrayInputStream;
37 import java.io.IOException;
38 import java.io.InputStream;
39 import java.io.InputStreamReader;
40 import java.nio.charset.StandardCharsets;
41 import java.util.regex.Pattern;
42
43
44
45
46
47
48
49 public class TikaMIMETypeDetector implements MIMETypeDetector {
50
51 private Purifier purifier;
52
53 public static final String CSV_MIMETYPE = "text/csv";
54
55 public static final String RESOURCE_NAME = "/org/apache/any23/mime/tika-config.xml";
56
57
58
59
60 private static final Pattern[] N3_PATTERNS = { Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\."),
61 Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\."),
62 Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\."),
63 Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\.")
64 };
65
66
67
68
69 private static final Pattern[] NQUADS_PATTERNS = { Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\<\\S+>\\s*\\."),
70
71
72
73
74 Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\<\\S+>\\s*\\."),
75 Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\<\\S+>\\s*\\."),
76 Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\<\\S+>\\s*\\.")
77 };
78
79 private static volatile TikaConfig config;
80
81 private static volatile Tika tika;
82
83 private static volatile MimeTypes types;
84
85
86
87
88
89
90
91
92
93
94
95
96 public static boolean checkN3Format(InputStream is) throws IOException {
97 return findPattern(N3_PATTERNS, '.', is);
98 }
99
100
101
102
103
104
105
106
107
108
109
110
111 public static boolean checkNQuadsFormat(InputStream is) throws IOException {
112 return findPattern(NQUADS_PATTERNS, '.', is);
113 }
114
115
116
117
118
119
120
121
122
123
124
125
126 public static boolean checkTurtleFormat(InputStream is) throws IOException {
127 String sample = extractDataSample(is, '.');
128 RDFParser turtleParser = Rio.createParser(RDFFormat.TURTLE);
129 turtleParser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, true);
130 ByteArrayInputStream bais = new ByteArrayInputStream(sample.getBytes(StandardCharsets.UTF_8));
131 try {
132 turtleParser.parse(bais, "");
133 return true;
134 } catch (Exception e) {
135 return false;
136 }
137 }
138
139
140
141
142
143
144
145
146
147
148
149
150 public static boolean checkCSVFormat(InputStream is) throws IOException {
151 return CSVReaderBuilder.isCSV(is);
152 }
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169 private static boolean findPattern(Pattern[] patterns, char delimiterChar, InputStream is) throws IOException {
170 String sample = extractDataSample(is, delimiterChar);
171 for (Pattern pattern : patterns) {
172 if (pattern.matcher(sample).find()) {
173 return true;
174 }
175 }
176 return false;
177 }
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192 private static String extractDataSample(InputStream is, char breakChar) throws IOException {
193 BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
194 StringBuilder sb = new StringBuilder();
195 final int MAX_SIZE = 1024 * 2;
196 int c;
197 boolean insideBlock = false;
198 int read = 0;
199 br.mark(MAX_SIZE);
200 try {
201 while ((c = br.read()) != -1) {
202 read++;
203 if (read > MAX_SIZE) {
204 break;
205 }
206 if ('<' == c) {
207 insideBlock = true;
208 } else if ('>' == c) {
209 insideBlock = false;
210 } else if ('"' == c) {
211 insideBlock = !insideBlock;
212 }
213 sb.append((char) c);
214 if (!insideBlock && breakChar == c) {
215 break;
216 }
217 }
218 } finally {
219 is.reset();
220 br.reset();
221 }
222 return sb.toString();
223 }
224
/**
 * Creates a detector that applies the given purifier to every non-null
 * input before detection.
 *
 * <p>The first call that finds the shared Tika objects missing loads the
 * configuration from {@link #RESOURCE_NAME} and builds them, inside a block
 * synchronized on this class. The configuration stream is closed once it
 * has been parsed.
 *
 * @param purifier
 *            the purifier used to pre-process input streams.
 *
 * @throws RuntimeException
 *             if the Tika configuration cannot be found or loaded.
 */
public TikaMIMETypeDetector(Purifier purifier) {
    this.purifier = purifier;
    if (config == null || types == null || tika == null) {
        synchronized (TikaMIMETypeDetector.class) {
            if (config == null) {
                final TikaConfig loaded;
                try (InputStream is = getResourceAsStream()) {
                    if (is == null) {
                        // Fail with an explicit cause instead of an opaque parser error.
                        throw new IllegalStateException(
                                "Tika configuration resource not found: " + RESOURCE_NAME);
                    }
                    loaded = new TikaConfig(is);
                } catch (Exception e) {
                    throw new RuntimeException("Error while loading Tika configuration.", e);
                }
                // Published only after the stream was parsed and closed successfully.
                config = loaded;
            }
            if (types == null) {
                types = config.getMimeRepository();
            }
            if (tika == null) {
                tika = new Tika(config);
            }
        }
    }
}

/**
 * Creates a detector that uses a {@link WhiteSpacesPurifier} to pre-process
 * input streams.
 */
public TikaMIMETypeDetector() {
    this(new WhiteSpacesPurifier());
}
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266 public MIMEType guessMIMEType(String fileName, InputStream input, MIMEType mimeTypeFromMetadata) {
267 if (input != null) {
268 try {
269 this.purifier.purify(input);
270 } catch (IOException e) {
271 throw new RuntimeException("Error while purifying the provided input", e);
272 }
273 }
274
275 final Metadata meta = new Metadata();
276 if (mimeTypeFromMetadata != null)
277 meta.set(Metadata.CONTENT_TYPE, mimeTypeFromMetadata.getFullType());
278 if (fileName != null)
279 meta.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
280
281 String type;
282 try {
283 final String mt = guessMimeTypeByInputAndMeta(input, meta);
284 if (input == null || !MimeTypes.OCTET_STREAM.equals(mt)) {
285 type = mt;
286 } else {
287 if (checkN3Format(input)) {
288 type = RDFFormat.N3.getDefaultMIMEType();
289 } else if (checkNQuadsFormat(input)) {
290 type = RDFFormat.NQUADS.getDefaultMIMEType();
291 } else if (checkTurtleFormat(input)) {
292 type = RDFFormat.TURTLE.getDefaultMIMEType();
293 } else if (checkCSVFormat(input)) {
294 type = CSV_MIMETYPE;
295 } else {
296 type = MimeTypes.OCTET_STREAM;
297 }
298 }
299 } catch (IOException ioe) {
300 throw new RuntimeException("Error while retrieving mime type.", ioe);
301 }
302 return MIMEType.parse(type);
303 }
304
305
306
307
308
309
310 private InputStream getResourceAsStream() {
311 InputStream result;
312 result = TikaMIMETypeDetector.class.getResourceAsStream(RESOURCE_NAME);
313 if (result == null) {
314 try {
315 result = TikaMIMETypeDetector.class.getClassLoader().getResourceAsStream(RESOURCE_NAME);
316 } catch (SecurityException e) {
317
318 }
319 if (result == null) {
320 result = ClassLoader.getSystemResourceAsStream(RESOURCE_NAME);
321 }
322 }
323 return result;
324 }
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343 private String guessMimeTypeByInputAndMeta(InputStream stream, final Metadata metadata) throws IOException {
344 if (stream != null) {
345 final String type = tika.detect(stream);
346 if (type != null && !isGenericMIMEType(type)) {
347 return type;
348 }
349 }
350
351
352 final String contentType = metadata.get(Metadata.CONTENT_TYPE);
353 String candidateMIMEType = null;
354 if (contentType != null) {
355 try {
356 MimeType type = types.forName(contentType);
357 if (type != null) {
358 candidateMIMEType = type.getName();
359 if (!isPlainMIMEType(candidateMIMEType)) {
360 return candidateMIMEType;
361 }
362 }
363 } catch (MimeTypeException mte) {
364
365 }
366 }
367
368
369 final String resourceName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
370 if (resourceName != null) {
371 String type = tika.detect(resourceName);
372 if (type != null && !type.equals(MimeTypes.OCTET_STREAM)) {
373 return type;
374 }
375 }
376
377
378 if (candidateMIMEType != null) {
379 return candidateMIMEType;
380 } else {
381 return MimeTypes.OCTET_STREAM;
382 }
383 }
384
385 private boolean isPlainMIMEType(String type) {
386 return type.equals(MimeTypes.OCTET_STREAM) || type.equals(MimeTypes.PLAIN_TEXT);
387 }
388
389 private boolean isGenericMIMEType(String type) {
390 return isPlainMIMEType(type) || type.equals(MimeTypes.XML);
391 }
392
393 }