View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.rdf;
19  
20  import org.apache.any23.extractor.IssueReport;
21  import org.apache.any23.extractor.ExtractionContext;
22  import org.apache.any23.extractor.ExtractionResult;
23  import org.apache.any23.rdf.Any23ValueFactoryWrapper;
24  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
25  import org.eclipse.rdf4j.rio.ParseErrorListener;
26  import org.eclipse.rdf4j.rio.RDFFormat;
27  import org.eclipse.rdf4j.rio.RDFHandlerException;
28  import org.eclipse.rdf4j.rio.RDFParseException;
29  import org.eclipse.rdf4j.rio.RDFParser;
30  import org.eclipse.rdf4j.rio.Rio;
31  import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
32  import org.eclipse.rdf4j.rio.helpers.RDFaParserSettings;
33  import org.eclipse.rdf4j.rio.helpers.RDFaVersion;
34  import org.eclipse.rdf4j.rio.helpers.XMLParserSettings;
35  import org.eclipse.rdf4j.rio.turtle.TurtleParser;
36  import org.semanticweb.owlapi.rio.OWLAPIRDFFormat;
37  import org.slf4j.Logger;
38  import org.slf4j.LoggerFactory;
39  
40  import java.io.IOException;
41  import java.io.InputStream;
42  import java.io.Reader;
43  import java.util.Collections;
44  import java.util.HashSet;
45  
46  /**
 * This factory provides the common logic for creating and correctly configuring every <i>RDF</i> parser used within
 * the library.
49   *
50   * @author Michele Mostarda (mostarda@fbk.eu)
51   */
52  public class RDFParserFactory {
53  
54      private static final Logger logger = LoggerFactory.getLogger(RDFParserFactory.class);
55  
56      private static class InstanceHolder {
57          private static final RDFParserFactory instance = new RDFParserFactory();
58      }
59  
60      public static RDFParserFactory getInstance() {
61          return InstanceHolder.instance;
62      }
63  
64      /**
65       * Returns a new instance of a configured TurtleParser.
66       *
67       * @param verifyDataType
68       *            data verification enable if <code>true</code>.
69       * @param stopAtFirstError
70       *            the parser stops at first error if <code>true</code>.
71       * @param extractionContext
72       *            the extraction context where the parser is used.
73       * @param extractionResult
74       *            the output extraction result.
75       * 
76       * @return a new instance of a configured Turtle parser.
77       */
78      public RDFParser getTurtleParserInstance(final boolean verifyDataType, final boolean stopAtFirstError,
79              final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
80          if (extractionResult == null) {
81              throw new NullPointerException("extractionResult cannot be null.");
82          }
83          final TurtleParser parser = new ExtendedTurtleParser();
84          configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
85          return parser;
86      }
87  
88      /**
89       * Returns a new instance of a configured RDFaParser, set to RDFa-1.0 compatibility mode.
90       *
91       * @param verifyDataType
92       *            data verification enable if <code>true</code>.
93       * @param stopAtFirstError
94       *            the parser stops at first error if <code>true</code>.
95       * @param extractionContext
96       *            the extraction context where the parser is used.
97       * @param extractionResult
98       *            the output extraction result.
99       * 
100      * @return a new instance of a configured RDFXML parser.
101      */
102     public RDFParser getRDFa10Parser(final boolean verifyDataType, final boolean stopAtFirstError,
103             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
104         final RDFParser parser = Rio.createParser(RDFFormat.RDFA);
105         parser.getParserConfig().set(RDFaParserSettings.RDFA_COMPATIBILITY, RDFaVersion.RDFA_1_0);
106         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
107         return parser;
108     }
109 
110     /**
111      * Returns a new instance of a configured RDFaParser, set to RDFa-1.1 compatibility mode.
112      *
113      * @param verifyDataType
114      *            data verification enable if <code>true</code>.
115      * @param stopAtFirstError
116      *            the parser stops at first error if <code>true</code>.
117      * @param extractionContext
118      *            the extraction context where the parser is used.
119      * @param extractionResult
120      *            the output extraction result.
121      * 
122      * @return a new instance of a configured RDFXML parser.
123      */
124     public RDFParser getRDFa11Parser(final boolean verifyDataType, final boolean stopAtFirstError,
125             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
126         final RDFParser parser = Rio.createParser(RDFFormat.RDFA);
127         parser.getParserConfig().set(RDFaParserSettings.RDFA_COMPATIBILITY, RDFaVersion.RDFA_1_1);
128         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
129         return parser;
130     }
131 
132     /**
133      * Returns a new instance of a configured RDFXMLParser.
134      *
135      * @param verifyDataType
136      *            data verification enable if <code>true</code>.
137      * @param stopAtFirstError
138      *            the parser stops at first error if <code>true</code>.
139      * @param extractionContext
140      *            the extraction context where the parser is used.
141      * @param extractionResult
142      *            the output extraction result.
143      * 
144      * @return a new instance of a configured RDFXML parser.
145      */
146     public RDFParser getRDFXMLParser(final boolean verifyDataType, final boolean stopAtFirstError,
147             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
148         final RDFParser parser = Rio.createParser(RDFFormat.RDFXML);
149         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
150         return parser;
151     }
152 
153     /**
154      * Returns a new instance of a configured NTriplesParser.
155      *
156      * @param verifyDataType
157      *            data verification enable if <code>true</code>.
158      * @param stopAtFirstError
159      *            the parser stops at first error if <code>true</code>.
160      * @param extractionContext
161      *            the extraction context where the parser is used.
162      * @param extractionResult
163      *            the output extraction result.
164      * 
165      * @return a new instance of a configured NTriples parser.
166      */
167     public RDFParser getNTriplesParser(final boolean verifyDataType, final boolean stopAtFirstError,
168             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
169         final RDFParser parser = Rio.createParser(RDFFormat.NTRIPLES);
170         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
171         return parser;
172     }
173 
174     /**
175      * Returns a new instance of a configured NQuadsParser.
176      *
177      * @param verifyDataType
178      *            data verification enable if <code>true</code>.
179      * @param stopAtFirstError
180      *            the parser stops at first error if <code>true</code>.
181      * @param extractionContext
182      *            the extraction context where the parser is used.
183      * @param extractionResult
184      *            the output extraction result.
185      * 
186      * @return a new instance of a configured NQuads parser.
187      */
188     public RDFParser getNQuadsParser(final boolean verifyDataType, final boolean stopAtFirstError,
189             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
190         final RDFParser parser = Rio.createParser(RDFFormat.NQUADS);
191         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
192         return parser;
193     }
194 
195     /**
196      * Returns a new instance of a configured ManchesterSyntaxParser.
197      *
198      * @param verifyDataType
199      *            data verification enable if <code>true</code>.
200      * @param stopAtFirstError
201      *            the parser stops at first error if <code>true</code>.
202      * @param extractionContext
203      *            the extraction context where the parser is used.
204      * @param extractionResult
205      *            the output extraction result.
206      * 
207      * @return a new instance of a configured Manchester Syntax parser.
208      */
209     public RDFParser getManchesterSyntaxParser(final boolean verifyDataType, final boolean stopAtFirstError,
210             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
211         final RDFParser parser = Rio.createParser(OWLAPIRDFFormat.MANCHESTER_OWL);
212         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
213         return parser;
214     }
215 
216     /**
217      * Returns a new instance of a configured FunctionalSyntaxParser.
218      *
219      * @param verifyDataType
220      *            data verification enable if <code>true</code>.
221      * @param stopAtFirstError
222      *            the parser stops at first error if <code>true</code>.
223      * @param extractionContext
224      *            the extraction context where the parser is used.
225      * @param extractionResult
226      *            the output extraction result.
227      * 
228      * @return a new instance of a configured Functional Syntax parser.
229      */
230     public RDFParser getFunctionalSyntaxParser(final boolean verifyDataType, final boolean stopAtFirstError,
231             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
232         final RDFParser parser = Rio.createParser(OWLAPIRDFFormat.OWL_FUNCTIONAL);
233         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
234         return parser;
235     }
236 
237     /**
238      * Returns a new instance of a configured TriXParser.
239      *
240      * @param verifyDataType
241      *            data verification enable if <code>true</code>.
242      * @param stopAtFirstError
243      *            the parser stops at first error if <code>true</code>.
244      * @param extractionContext
245      *            the extraction context where the parser is used.
246      * @param extractionResult
247      *            the output extraction result.
248      * 
249      * @return a new instance of a configured TriX parser.
250      */
251     public RDFParser getTriXParser(final boolean verifyDataType, final boolean stopAtFirstError,
252             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
253         final RDFParser parser = Rio.createParser(RDFFormat.TRIX);
254         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
255         return parser;
256     }
257 
258     /**
259      * Returns a new instance of a configured <i>SesameJSONLDParser</i>.
260      * 
261      * @param verifyDataType
262      *            data verification enable if <code>true</code>.
263      * @param stopAtFirstError
264      *            the parser stops at first error if <code>true</code>.
265      * @param extractionContext
266      *            the extraction context where the parser is used.
267      * @param extractionResult
268      *            the output extraction result.
269      * 
270      * @return a new instance of a configured JSONLDParser parser.
271      */
272     public RDFParser getJSONLDParser(final boolean verifyDataType, final boolean stopAtFirstError,
273             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
274         final RDFParser parser = Rio.createParser(RDFFormat.JSONLD);
275         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
276         return parser;
277     }
278 
279     /**
280      * Configures the given parser on the specified extraction result setting the policies for data verification and
281      * error handling.
282      *
283      * @param parser
284      *            the parser to be configured.
285      * @param verifyDataType
286      *            enables the data verification.
287      * @param stopAtFirstError
288      *            enables the tolerant error handling.
289      * @param extractionContext
290      *            the extraction context in which the parser is used.
291      * @param extractionResult
292      *            the extraction result used to collect the parsed data.
293      */
294     // TODO: what about passing just default language and ErrorReport to configureParser() ?
295     private void configureParser(final RDFParser parser, final boolean verifyDataType, final boolean stopAtFirstError,
296             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
297         parser.getParserConfig().setNonFatalErrors(
298                 stopAtFirstError ? Collections.emptySet() : new HashSet<>(parser.getSupportedSettings()));
299         parser.getParserConfig().set(XMLParserSettings.LOAD_EXTERNAL_DTD, false);
300         parser.set(BasicParserSettings.FAIL_ON_UNKNOWN_DATATYPES, verifyDataType);
301         parser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, verifyDataType);
302 
303         parser.setParseErrorListener(new InternalParseErrorListener(extractionResult));
304         parser.setValueFactory(new Any23ValueFactoryWrapper(SimpleValueFactory.getInstance(), extractionResult,
305                 extractionContext.getDefaultLanguage()));
306         parser.setRDFHandler(new RDFHandlerAdapter(extractionResult));
307     }
308 
309     /**
310      * Internal listener used to trace <i>RDF</i> parse errors.
311      */
312     private static class InternalParseErrorListener implements ParseErrorListener {
313 
314         private final IssueReport extractionResult;
315 
316         public InternalParseErrorListener(IssueReport er) {
317             extractionResult = er;
318         }
319 
320         @Override
321         public void warning(String msg, long lineNo, long colNo) {
322             try {
323                 extractionResult.notifyIssue(IssueReport.IssueLevel.WARNING, msg, lineNo, colNo);
324             } catch (Exception e) {
325                 notifyExceptionInNotification(e);
326             }
327         }
328 
329         @Override
330         public void error(String msg, long lineNo, long colNo) {
331             try {
332                 extractionResult.notifyIssue(IssueReport.IssueLevel.ERROR, msg, lineNo, colNo);
333             } catch (Exception e) {
334                 notifyExceptionInNotification(e);
335             }
336         }
337 
338         @Override
339         public void fatalError(String msg, long lineNo, long colNo) {
340             try {
341                 extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, msg, lineNo, colNo);
342             } catch (Exception e) {
343                 notifyExceptionInNotification(e);
344             }
345         }
346 
347         private void notifyExceptionInNotification(Exception e) {
348             if (logger != null) {
349                 logger.error("An exception occurred while notifying an error.", e);
350             }
351         }
352     }
353 
354     /**
355      * This extended Turtle parser sets the default namespace to the base IRI before the parsing.
356      */
357     private static class ExtendedTurtleParser extends TurtleParser {
358         @Override
359         public void parse(Reader reader, String baseIRI) throws IOException, RDFParseException, RDFHandlerException {
360             setNamespace("", baseIRI);
361             super.parse(reader, baseIRI);
362         }
363 
364         @Override
365         public void parse(InputStream in, String baseIRI) throws IOException, RDFParseException, RDFHandlerException {
366             setNamespace("", baseIRI);
367             super.parse(in, baseIRI);
368         }
369     }
370 }