View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.mime;
19  
20  import org.apache.any23.extractor.csv.CSVReaderBuilder;
21  import org.apache.any23.mime.purifier.Purifier;
22  import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
23  import org.apache.tika.Tika;
24  import org.apache.tika.config.TikaConfig;
25  import org.apache.tika.metadata.Metadata;
26  import org.apache.tika.metadata.TikaCoreProperties;
27  import org.apache.tika.mime.MimeType;
28  import org.apache.tika.mime.MimeTypeException;
29  import org.apache.tika.mime.MimeTypes;
30  import org.eclipse.rdf4j.rio.RDFFormat;
31  import org.eclipse.rdf4j.rio.RDFParser;
32  import org.eclipse.rdf4j.rio.Rio;
33  import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
34  
35  import java.io.BufferedReader;
36  import java.io.ByteArrayInputStream;
37  import java.io.IOException;
38  import java.io.InputStream;
39  import java.io.InputStreamReader;
40  import java.nio.charset.StandardCharsets;
41  import java.util.regex.Pattern;
42  
43  /**
44   * Implementation of {@link MIMETypeDetector} based on <a href="http://tika.apache.org/">Apache Tika</a>.
45   *
46   * @author Michele Mostarda (michele.mostarda@gmail.com)
47   * @author Davide Palmisano (dpalmisano@gmail.com)
48   */
49  public class TikaMIMETypeDetector implements MIMETypeDetector {
50  
51      private Purifier purifier;
52  
53      public static final String CSV_MIMETYPE = "text/csv";
54  
55      public static final String RESOURCE_NAME = "/org/apache/any23/mime/tika-config.xml";
56  
57      /**
58       * N3 patterns.
59       */
60      private static final Pattern[] N3_PATTERNS = { Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\."), // * IRI IRI .
61              Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\."), // * IRI BNODE .
62              Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\."), // * IRI LLITERAL .
63              Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\.") // * IRI TLITERAL .
64      };
65  
66      /**
67       * N-Quads patterns.
68       */
69      private static final Pattern[] NQUADS_PATTERNS = { Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\<\\S+>\\s*\\."), // *
70                                                                                                                          // IRI
71                                                                                                                          // IRI
72                                                                                                                          // IRI
73                                                                                                                          // .
74              Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\<\\S+>\\s*\\."), // * IRI BNODE IRI .
75              Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\<\\S+>\\s*\\."), // * IRI LLITERAL IRI .
76              Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\<\\S+>\\s*\\.") // * IRI TLITERAL IRI .
77      };
78  
79      private static volatile TikaConfig config;
80  
81      private static volatile Tika tika;
82  
83      private static volatile MimeTypes types;
84  
85      /**
86       * Checks if the stream contains the <i>N3</i> triple patterns.
87       *
88       * @param is
89       *            input stream to be verified.
90       * 
91       * @return <code>true</code> if <i>N3</i> patterns are detected, <code>false</code> otherwise.
92       * 
93       * @throws IOException
94       *             if there is an error checking the {@link java.io.InputStream}
95       */
96      public static boolean checkN3Format(InputStream is) throws IOException {
97          return findPattern(N3_PATTERNS, '.', is);
98      }
99  
100     /**
101      * Checks if the stream contains the <i>NQuads</i> patterns.
102      *
103      * @param is
104      *            input stream to be verified.
105      * 
106      * @return <code>true</code> if <i>N3</i> patterns are detected, <code>false</code> otherwise.
107      * 
108      * @throws IOException
109      *             if there is an error checking the {@link java.io.InputStream}
110      */
111     public static boolean checkNQuadsFormat(InputStream is) throws IOException {
112         return findPattern(NQUADS_PATTERNS, '.', is);
113     }
114 
115     /**
116      * Checks if the stream contains <i>Turtle</i> triple patterns.
117      *
118      * @param is
119      *            input stream to be verified.
120      * 
121      * @return <code>true</code> if <i>Turtle</i> patterns are detected, <code>false</code> otherwise.
122      * 
123      * @throws IOException
124      *             if there is an error checking the {@link java.io.InputStream}
125      */
126     public static boolean checkTurtleFormat(InputStream is) throws IOException {
127         String sample = extractDataSample(is, '.');
128         RDFParser turtleParser = Rio.createParser(RDFFormat.TURTLE);
129         turtleParser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, true);
130         ByteArrayInputStream bais = new ByteArrayInputStream(sample.getBytes(StandardCharsets.UTF_8));
131         try {
132             turtleParser.parse(bais, "");
133             return true;
134         } catch (Exception e) {
135             return false;
136         }
137     }
138 
139     /**
140      * Checks if the stream contains a valid <i>CSV</i> pattern.
141      *
142      * @param is
143      *            input stream to be verified.
144      * 
145      * @return <code>true</code> if <i>CSV</i> patterns are detected, <code>false</code> otherwise.
146      * 
147      * @throws IOException
148      *             if there is an error checking the {@link java.io.InputStream}
149      */
150     public static boolean checkCSVFormat(InputStream is) throws IOException {
151         return CSVReaderBuilder.isCSV(is);
152     }
153 
154     /**
155      * Tries to apply one of the given patterns on a sample of the input stream.
156      *
157      * @param patterns
158      *            the patterns to apply.
159      * @param delimiterChar
160      *            the delimiter of the sample.
161      * @param is
162      *            the input stream to sample.
163      * 
164      * @return <code>true</code> if a pattern has been applied, <code>false</code> otherwise.
165      * 
166      * @throws IOException
167      *             if there is an error finding the pattern within the {@link java.io.InputStream}
168      */
169     private static boolean findPattern(Pattern[] patterns, char delimiterChar, InputStream is) throws IOException {
170         String sample = extractDataSample(is, delimiterChar);
171         for (Pattern pattern : patterns) {
172             if (pattern.matcher(sample).find()) {
173                 return true;
174             }
175         }
176         return false;
177     }
178 
179     /**
180      * Extracts a sample data from the input stream, from the current mark to the first <i>breakChar</i> char.
181      *
182      * @param is
183      *            the input stream to sample.
184      * @param breakChar
185      *            the char to break to sample.
186      * 
187      * @return the sample string.
188      * 
189      * @throws IOException
190      *             if an error occurs during sampling.
191      */
192     private static String extractDataSample(InputStream is, char breakChar) throws IOException {
193         BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
194         StringBuilder sb = new StringBuilder();
195         final int MAX_SIZE = 1024 * 2;
196         int c;
197         boolean insideBlock = false;
198         int read = 0;
199         br.mark(MAX_SIZE);
200         try {
201             while ((c = br.read()) != -1) {
202                 read++;
203                 if (read > MAX_SIZE) {
204                     break;
205                 }
206                 if ('<' == c) {
207                     insideBlock = true;
208                 } else if ('>' == c) {
209                     insideBlock = false;
210                 } else if ('"' == c) {
211                     insideBlock = !insideBlock;
212                 }
213                 sb.append((char) c);
214                 if (!insideBlock && breakChar == c) {
215                     break;
216                 }
217             }
218         } finally {
219             is.reset();
220             br.reset();
221         }
222         return sb.toString();
223     }
224 
225     public TikaMIMETypeDetector(Purifier purifier) {
226         this.purifier = purifier;
227         if (config == null || types == null || tika == null) {
228             synchronized (TikaMIMETypeDetector.class) {
229                 if (config == null) {
230                     InputStream is = getResourceAsStream();
231                     try {
232                         config = new TikaConfig(is);
233                     } catch (Exception e) {
234                         throw new RuntimeException("Error while loading Tika configuration.", e);
235                     }
236                 }
237                 if (types == null) {
238                     types = config.getMimeRepository();
239                 }
240                 if (tika == null) {
241                     tika = new Tika(config);
242                 }
243             }
244         }
245     }
246 
247     public TikaMIMETypeDetector() {
248         this(new WhiteSpacesPurifier());
249     }
250 
251     /**
252      * Estimates the <code>MIME</code> type of the content of input file. The <i>input</i> stream must be resettable.
253      *
254      * @param fileName
255      *            name of the data source.
256      * @param input
257      *            <code>null</code> or a <i>resettable</i> input stream containing data.
258      * @param mimeTypeFromMetadata
259      *            mimetype declared in metadata.
260      * 
261      * @return the supposed mime type or <code>null</code> if nothing appropriate found.
262      * 
263      * @throws IllegalArgumentException
264      *             if <i>input</i> is not <code>null</code> and is not resettable.
265      */
266     public MIMEType guessMIMEType(String fileName, InputStream input, MIMEType mimeTypeFromMetadata) {
267         if (input != null) {
268             try {
269                 this.purifier.purify(input);
270             } catch (IOException e) {
271                 throw new RuntimeException("Error while purifying the provided input", e);
272             }
273         }
274 
275         final Metadata meta = new Metadata();
276         if (mimeTypeFromMetadata != null)
277             meta.set(Metadata.CONTENT_TYPE, mimeTypeFromMetadata.getFullType());
278         if (fileName != null)
279             meta.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
280 
281         String type;
282         try {
283             final String mt = guessMimeTypeByInputAndMeta(input, meta);
284             if (input == null || !MimeTypes.OCTET_STREAM.equals(mt)) {
285                 type = mt;
286             } else {
287                 if (checkN3Format(input)) {
288                     type = RDFFormat.N3.getDefaultMIMEType();
289                 } else if (checkNQuadsFormat(input)) {
290                     type = RDFFormat.NQUADS.getDefaultMIMEType();
291                 } else if (checkTurtleFormat(input)) {
292                     type = RDFFormat.TURTLE.getDefaultMIMEType();
293                 } else if (checkCSVFormat(input)) {
294                     type = CSV_MIMETYPE;
295                 } else {
296                     type = MimeTypes.OCTET_STREAM;
297                 }
298             }
299         } catch (IOException ioe) {
300             throw new RuntimeException("Error while retrieving mime type.", ioe);
301         }
302         return MIMEType.parse(type);
303     }
304 
305     /**
306      * Loads the <code>Tika</code> configuration file.
307      *
308      * @return the input stream containing the configuration.
309      */
310     private InputStream getResourceAsStream() {
311         InputStream result;
312         result = TikaMIMETypeDetector.class.getResourceAsStream(RESOURCE_NAME);
313         if (result == null) {
314             try {
315                 result = TikaMIMETypeDetector.class.getClassLoader().getResourceAsStream(RESOURCE_NAME);
316             } catch (SecurityException e) {
317                 // fall through
318             }
319             if (result == null) {
320                 result = ClassLoader.getSystemResourceAsStream(RESOURCE_NAME);
321             }
322         }
323         return result;
324     }
325 
326     /**
327      * Automatically detects the MIME type of a document based on magic markers in the stream prefix and any given
328      * metadata hints.
329      * <p/>
330      * The given stream is expected to support marks, so that this method can reset the stream to the position it was in
331      * before this method was called.
332      *
333      * @param stream
334      *            document stream
335      * @param metadata
336      *            metadata hints
337      * 
338      * @return MIME type of the document
339      * 
340      * @throws IOException
341      *             if the document stream could not be read
342      */
343     private String guessMimeTypeByInputAndMeta(InputStream stream, final Metadata metadata) throws IOException {
344         if (stream != null) {
345             final String type = tika.detect(stream);
346             if (type != null && !isGenericMIMEType(type)) {
347                 return type;
348             }
349         }
350 
351         // Determines the MIMEType based on Content-Type hint if available.
352         final String contentType = metadata.get(Metadata.CONTENT_TYPE);
353         String candidateMIMEType = null;
354         if (contentType != null) {
355             try {
356                 MimeType type = types.forName(contentType);
357                 if (type != null) {
358                     candidateMIMEType = type.getName();
359                     if (!isPlainMIMEType(candidateMIMEType)) {
360                         return candidateMIMEType;
361                     }
362                 }
363             } catch (MimeTypeException mte) {
364                 // Malformed ocntent-type value, ignore.
365             }
366         }
367 
368         // Determines the MIMEType based on resource name hint if available.
369         final String resourceName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
370         if (resourceName != null) {
371             String type = tika.detect(resourceName);
372             if (type != null && !type.equals(MimeTypes.OCTET_STREAM)) {
373                 return type;
374             }
375         }
376 
377         // Finally, use the default type if no matches found
378         if (candidateMIMEType != null) {
379             return candidateMIMEType;
380         } else {
381             return MimeTypes.OCTET_STREAM;
382         }
383     }
384 
385     private boolean isPlainMIMEType(String type) {
386         return type.equals(MimeTypes.OCTET_STREAM) || type.equals(MimeTypes.PLAIN_TEXT);
387     }
388 
389     private boolean isGenericMIMEType(String type) {
390         return isPlainMIMEType(type) || type.equals(MimeTypes.XML);
391     }
392 
393 }