1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.mime;
19
20 import org.apache.any23.extractor.csv.CSVReaderBuilder;
21 import org.apache.any23.mime.purifier.Purifier;
22 import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
23 import org.apache.tika.Tika;
24 import org.apache.tika.config.TikaConfig;
25 import org.apache.tika.metadata.Metadata;
26 import org.apache.tika.mime.MimeType;
27 import org.apache.tika.mime.MimeTypeException;
28 import org.apache.tika.mime.MimeTypes;
29 import org.eclipse.rdf4j.rio.RDFFormat;
30 import org.eclipse.rdf4j.rio.RDFParser;
31 import org.eclipse.rdf4j.rio.Rio;
32 import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
33
34 import java.io.BufferedReader;
35 import java.io.ByteArrayInputStream;
36 import java.io.IOException;
37 import java.io.InputStream;
38 import java.io.InputStreamReader;
39 import java.nio.charset.StandardCharsets;
40 import java.util.regex.Pattern;
41
42
43
44
45
46
47
48 public class TikaMIMETypeDetector implements MIMETypeDetector {
49
50 private Purifier purifier;
51
52 public static final String CSV_MIMETYPE = "text/csv";
53
54 public static final String RESOURCE_NAME = "/org/apache/any23/mime/tika-config.xml";
55
56
57
58
59 private static final Pattern[] N3_PATTERNS = { Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\."),
60 Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\."),
61 Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\."),
62 Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\.")
63 };
64
65
66
67
68 private static final Pattern[] NQUADS_PATTERNS = { Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\<\\S+>\\s*\\."),
69
70
71
72
73 Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\<\\S+>\\s*\\."),
74 Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\<\\S+>\\s*\\."),
75 Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\<\\S+>\\s*\\.")
76 };
77
78 private static volatile TikaConfig config;
79
80 private static volatile Tika tika;
81
82 private static volatile MimeTypes types;
83
84
85
86
87
88
89
90
91
92
93
94
95 public static boolean checkN3Format(InputStream is) throws IOException {
96 return findPattern(N3_PATTERNS, '.', is);
97 }
98
99
100
101
102
103
104
105
106
107
108
109
110 public static boolean checkNQuadsFormat(InputStream is) throws IOException {
111 return findPattern(NQUADS_PATTERNS, '.', is);
112 }
113
114
115
116
117
118
119
120
121
122
123
124
125 public static boolean checkTurtleFormat(InputStream is) throws IOException {
126 String sample = extractDataSample(is, '.');
127 RDFParser turtleParser = Rio.createParser(RDFFormat.TURTLE);
128 turtleParser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, true);
129 ByteArrayInputStream bais = new ByteArrayInputStream(sample.getBytes(StandardCharsets.UTF_8));
130 try {
131 turtleParser.parse(bais, "");
132 return true;
133 } catch (Exception e) {
134 return false;
135 }
136 }
137
138
139
140
141
142
143
144
145
146
147
148
149 public static boolean checkCSVFormat(InputStream is) throws IOException {
150 return CSVReaderBuilder.isCSV(is);
151 }
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168 private static boolean findPattern(Pattern[] patterns, char delimiterChar, InputStream is) throws IOException {
169 String sample = extractDataSample(is, delimiterChar);
170 for (Pattern pattern : patterns) {
171 if (pattern.matcher(sample).find()) {
172 return true;
173 }
174 }
175 return false;
176 }
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191 private static String extractDataSample(InputStream is, char breakChar) throws IOException {
192 BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
193 StringBuilder sb = new StringBuilder();
194 final int MAX_SIZE = 1024 * 2;
195 int c;
196 boolean insideBlock = false;
197 int read = 0;
198 br.mark(MAX_SIZE);
199 try {
200 while ((c = br.read()) != -1) {
201 read++;
202 if (read > MAX_SIZE) {
203 break;
204 }
205 if ('<' == c) {
206 insideBlock = true;
207 } else if ('>' == c) {
208 insideBlock = false;
209 } else if ('"' == c) {
210 insideBlock = !insideBlock;
211 }
212 sb.append((char) c);
213 if (!insideBlock && breakChar == c) {
214 break;
215 }
216 }
217 } finally {
218 is.reset();
219 br.reset();
220 }
221 return sb.toString();
222 }
223
224 public TikaMIMETypeDetector(Purifier purifier) {
225 this.purifier = purifier;
226 if (config == null || types == null || tika == null) {
227 synchronized (TikaMIMETypeDetector.class) {
228 if (config == null) {
229 InputStream is = getResourceAsStream();
230 try {
231 config = new TikaConfig(is);
232 } catch (Exception e) {
233 throw new RuntimeException("Error while loading Tika configuration.", e);
234 }
235 }
236 if (types == null) {
237 types = config.getMimeRepository();
238 }
239 if (tika == null) {
240 tika = new Tika(config);
241 }
242 }
243 }
244 }
245
246 public TikaMIMETypeDetector() {
247 this(new WhiteSpacesPurifier());
248 }
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265 public MIMEType.html#MIMEType">MIMEType guessMIMEType(String fileName, InputStream input, MIMEType mimeTypeFromMetadata) {
266 if (input != null) {
267 try {
268 this.purifier.purify(input);
269 } catch (IOException e) {
270 throw new RuntimeException("Error while purifying the provided input", e);
271 }
272 }
273
274 final Metadata meta = new Metadata();
275 if (mimeTypeFromMetadata != null)
276 meta.set(Metadata.CONTENT_TYPE, mimeTypeFromMetadata.getFullType());
277 if (fileName != null)
278 meta.set(Metadata.RESOURCE_NAME_KEY, fileName);
279
280 String type;
281 try {
282 final String mt = guessMimeTypeByInputAndMeta(input, meta);
283 if (input == null || !MimeTypes.OCTET_STREAM.equals(mt)) {
284 type = mt;
285 } else {
286 if (checkN3Format(input)) {
287 type = RDFFormat.N3.getDefaultMIMEType();
288 } else if (checkNQuadsFormat(input)) {
289 type = RDFFormat.NQUADS.getDefaultMIMEType();
290 } else if (checkTurtleFormat(input)) {
291 type = RDFFormat.TURTLE.getDefaultMIMEType();
292 } else if (checkCSVFormat(input)) {
293 type = CSV_MIMETYPE;
294 } else {
295 type = MimeTypes.OCTET_STREAM;
296 }
297 }
298 } catch (IOException ioe) {
299 throw new RuntimeException("Error while retrieving mime type.", ioe);
300 }
301 return MIMEType.parse(type);
302 }
303
304
305
306
307
308
309 private InputStream getResourceAsStream() {
310 InputStream result;
311 result = TikaMIMETypeDetector.class.getResourceAsStream(RESOURCE_NAME);
312 if (result == null) {
313 try {
314 result = TikaMIMETypeDetector.class.getClassLoader().getResourceAsStream(RESOURCE_NAME);
315 } catch (SecurityException e) {
316
317 }
318 if (result == null) {
319 result = ClassLoader.getSystemResourceAsStream(RESOURCE_NAME);
320 }
321 }
322 return result;
323 }
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342 private String guessMimeTypeByInputAndMeta(InputStream stream, final Metadata metadata) throws IOException {
343 if (stream != null) {
344 final String type = tika.detect(stream);
345 if (type != null && !isGenericMIMEType(type)) {
346 return type;
347 }
348 }
349
350
351 final String contentType = metadata.get(Metadata.CONTENT_TYPE);
352 String candidateMIMEType = null;
353 if (contentType != null) {
354 try {
355 MimeType type = types.forName(contentType);
356 if (type != null) {
357 candidateMIMEType = type.getName();
358 if (!isPlainMIMEType(candidateMIMEType)) {
359 return candidateMIMEType;
360 }
361 }
362 } catch (MimeTypeException mte) {
363
364 }
365 }
366
367
368 final String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
369 if (resourceName != null) {
370 String type = tika.detect(resourceName);
371 if (type != null && !type.equals(MimeTypes.OCTET_STREAM)) {
372 return type;
373 }
374 }
375
376
377 if (candidateMIMEType != null) {
378 return candidateMIMEType;
379 } else {
380 return MimeTypes.OCTET_STREAM;
381 }
382 }
383
384 private boolean isPlainMIMEType(String type) {
385 return type.equals(MimeTypes.OCTET_STREAM) || type.equals(MimeTypes.PLAIN_TEXT);
386 }
387
388 private boolean isGenericMIMEType(String type) {
389 return isPlainMIMEType(type) || type.equals(MimeTypes.XML);
390 }
391
392 }