/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23.extractor;

import org.apache.any23.configuration.Configuration;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.encoding.EncodingDetector;
import org.apache.any23.encoding.TikaEncodingDetector;
import org.apache.any23.extractor.html.DocumentReport;
import org.apache.any23.extractor.html.HTMLDocument;
import org.apache.any23.extractor.html.MicroformatExtractor;
import org.apache.any23.extractor.html.TagSoupParser;
import org.apache.any23.mime.MIMEType;
import org.apache.any23.mime.MIMETypeDetector;
import org.apache.any23.rdf.Any23ValueFactoryWrapper;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.LocalCopyFactory;
import org.apache.any23.source.MemCopyFactory;
import org.apache.any23.validator.EmptyValidationReport;
import org.apache.any23.validator.ValidatorException;
import org.apache.any23.vocab.SINDICE;
import org.apache.any23.writer.CompositeTripleHandler;
import org.apache.any23.writer.CountingTripleHandler;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.any23.extractor.Extractor.BlindExtractor;
import org.apache.any23.extractor.Extractor.ContentExtractor;
import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
import org.apache.tika.mime.MimeTypes;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.UUID;
import java.util.stream.Collectors;

import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath;
import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot;

/**
 * This class acts as a facade where all extractors (for a given MIMEType) can be called on a single document.
 * Extractors are automatically filtered by MIMEType.
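 * <p>
 * Minimal usage sketch (the document source, extractor group and writer shown are illustrative choices; any
 * {@link org.apache.any23.source.DocumentSource} and {@link org.apache.any23.writer.TripleHandler} can be used, and
 * exception handling is elided):
 *
 * <pre>
 * {@code
 * DocumentSource source = new StringDocumentSource("<html>...</html>", "http://example.org/doc");
 * TripleHandler handler = new NTriplesWriter(System.out);
 * SingleDocumentExtraction extraction = new SingleDocumentExtraction(DefaultConfiguration.singleton(), source,
 *         ExtractorRegistryImpl.getInstance().getExtractorGroup(), handler);
 * SingleDocumentExtractionReport report = extraction.run();
 * handler.close();
 * }
 * </pre>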
 */
public class SingleDocumentExtraction {

    private static final SINDICE vSINDICE = SINDICE.getInstance();

    private static final Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);

    private final Configuration configuration;

    private final DocumentSource in;

    private IRI documentIRI;

    private final ExtractorGroup extractors;

    private final TripleHandler output;

    private final EncodingDetector encoderDetector;

    private LocalCopyFactory copyFactory = null;

    private DocumentSource localDocumentSource = null;

    private MIMETypeDetector detector = null;

    private ExtractorGroup matchingExtractors = null;

    private MIMEType detectedMIMEType = null;

    private DocumentReport documentReport = null;

    private ExtractionParameters tagSoupDOMRelatedParameters = null;

    private String parserEncoding = null;

    /**
     * Builds an extractor by the specification of document source, list of extractors and output triple handler.
     *
     * @param configuration
     *            configuration applied during extraction.
     * @param in
     *            input document source.
     * @param extractors
     *            list of extractors to be applied.
     * @param output
     *            output triple handler.
     */
    public SingleDocumentExtraction(Configuration configuration, DocumentSource in, ExtractorGroup extractors,
            TripleHandler output) {
        if (configuration == null)
            throw new NullPointerException("configuration cannot be null.");
        if (in == null)
            throw new NullPointerException("in cannot be null.");
        this.configuration = configuration;
        this.in = in;
        this.extractors = extractors;

        List<TripleHandler> tripleHandlers = new ArrayList<>();
        tripleHandlers.add(output);
        tripleHandlers.add(new CountingTripleHandler());
        this.output = new CompositeTripleHandler(tripleHandlers);
        this.encoderDetector = new TikaEncodingDetector();
    }

    /**
     * Builds an extractor by the specification of document source, extractors factory and output triple handler.
     *
     * @param configuration
     *            configuration applied during extraction.
     * @param in
     *            input document source.
     * @param factory
     *            the extractors factory.
     * @param output
     *            output triple handler.
     */
    public SingleDocumentExtraction(Configuration configuration, DocumentSource in, ExtractorFactory<?> factory,
            TripleHandler output) {
        this(configuration, in, new ExtractorGroup(Collections.<ExtractorFactory<?>> singletonList(factory)), output);
        this.setMIMETypeDetector(null);
    }

    /**
     * Builds an extractor by the specification of document source, extractors factory and output triple handler, using
     * the {@link org.apache.any23.configuration.DefaultConfiguration}.
     *
     * @param in
     *            input document source.
     * @param factory
     *            the extractors factory.
     * @param output
     *            output triple handler.
     */
    public SingleDocumentExtraction(DocumentSource in, ExtractorFactory<?> factory, TripleHandler output) {
        this(DefaultConfiguration.singleton(), in,
                new ExtractorGroup(Collections.<ExtractorFactory<?>> singletonList(factory)), output);
        this.setMIMETypeDetector(null);
    }

    /**
     * Sets the internal factory for generating the document local copy. If <code>null</code>, the
     * {@link org.apache.any23.source.MemCopyFactory} will be used.
     *
     * @param copyFactory
     *            local copy factory.
     *
     * @see org.apache.any23.source.DocumentSource
     */
    public void setLocalCopyFactory(LocalCopyFactory copyFactory) {
        this.copyFactory = copyFactory;
    }

    /**
     * Sets the internal MIME type detector. If <code>null</code>, MIME type detection will be skipped and all
     * extractors will be activated.
     *
     * @param detector
     *            detector instance.
     */
    public void setMIMETypeDetector(MIMETypeDetector detector) {
        this.detector = detector;
    }

    /**
     * Triggers the execution of all the {@link Extractor}s registered to this class using the specified extraction
     * parameters.
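     * <p>
     * Minimal invocation sketch (variable names illustrative; a <code>null</code> argument is equivalent to the
     * defaults shown here):
     *
     * <pre>
     * {@code
     * SingleDocumentExtractionReport report = extraction.run(ExtractionParameters.newDefault(configuration));
     * }
     * </pre>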
     *
     * @param extractionParameters
     *            the parameters applied to the run execution.
     *
     * @return the report generated by the extraction.
     *
     * @throws ExtractionException
     *             if an error occurred during the data extraction.
     * @throws IOException
     *             if an error occurred during the data access.
     */
    public SingleDocumentExtractionReport run(ExtractionParameters extractionParameters)
            throws ExtractionException, IOException {
        if (extractionParameters == null) {
            extractionParameters = ExtractionParameters.newDefault(configuration);
        }

        final String contextIRI = extractionParameters
                .getProperty(ExtractionParameters.EXTRACTION_CONTEXT_IRI_PROPERTY);
        ensureHasLocalCopy();
        try {
            this.documentIRI = new Any23ValueFactoryWrapper(SimpleValueFactory.getInstance())
                    .createIRI("?".equals(contextIRI) ? in.getDocumentIRI() : contextIRI);
        } catch (Exception ex) {
            throw new IllegalArgumentException("Invalid IRI: " + in.getDocumentIRI(), ex);
        }
        if (log.isDebugEnabled()) {
            log.debug("Processing " + this.documentIRI);
        }
        filterExtractorsByMIMEType();

        if (log.isDebugEnabled()) {
            StringBuilder sb = new StringBuilder("Extractors ");
            for (ExtractorFactory<?> factory : matchingExtractors) {
                sb.append(factory.getExtractorName());
                sb.append(' ');
            }
            sb.append("match ").append(documentIRI);
            log.debug(sb.toString());
        }

        final List<ResourceRoot> resourceRoots = new ArrayList<>();
        final List<PropertyPath> propertyPaths = new ArrayList<>();
        final Map<String, Collection<IssueReport.Issue>> extractorToIssues = new HashMap<>();

        // Invoke all extractors.
        try {
            output.startDocument(documentIRI);
        } catch (TripleHandlerException e) {
            log.error(String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI));
            throw new ExtractionException(
                    String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), e);
        }
        try {
            output.setContentLength(in.getContentLength());
            // Create the document context.
            final String documentLanguage;
            try {
                documentLanguage = extractDocumentLanguage(extractionParameters);
                ArrayList<ExtractorFactory<?>> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
                final boolean mimeTypeIsTooGeneric = isTooGeneric(detectedMIMEType);
                ArrayList<String> intersectionOfRdfMimetypes = null;
                for (ExtractorFactory<?> factory : matchingExtractors) {
                    final Extractor<?> extractor = factory.createExtractor();
                    final SingleExtractionReport er = runExtractor(extractionParameters, documentLanguage, extractor);
                    // Fix for ANY23-415:
                    if (mimeTypeIsTooGeneric) {
                        List<String> rdfMimetypes = factory.getSupportedMIMETypes().stream()
                                .filter(mt -> !isTooGeneric(mt)).map(MIMEType::getFullType)
                                .collect(Collectors.toList());
                        if (er.touched) {
                            // If the detected mimetype is too generic, but we find extractors matching
                            // this mimetype that are capable of producing RDF triples from this resource,
                            // and these extractors are also associated with more specific RDF mimetypes,
                            // then we can simply take the intersection of these more specific mimetypes
                            // to narrow down the generic, non-RDF mimetype to a specific RDF mimetype.
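                            // Illustrative example with hypothetical extractors: if one extractor
                            // supporting {text/plain, text/turtle} and another supporting
                            // {text/plain, text/turtle, application/n-triples} both produce triples from
                            // a document detected as text/plain, the running intersection is {text/turtle},
                            // which replaces the generic detection once the loop completes.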
                            if (intersectionOfRdfMimetypes == null) {
                                intersectionOfRdfMimetypes = new ArrayList<>(rdfMimetypes);
                            } else {
                                intersectionOfRdfMimetypes.retainAll(rdfMimetypes);
                            }
                        } else if (!rdfMimetypes.isEmpty()) {
                            // If the detected mimetype is too generic, and this extractor matches both the
                            // generic mimetype and a more specific mimetype, but did not produce any RDF
                            // triples, then we can safely assume that this extractor is not actually a
                            // match for the type of file we are parsing (e.g., a "humans.txt" file).
                            continue;
                        }
                    }
                    resourceRoots.addAll(er.resourceRoots);
                    propertyPaths.addAll(er.propertyPaths);
                    filteredList.add(factory);
                    extractorToIssues.put(factory.getExtractorName(), er.issues);
                }
                matchingExtractors = new ExtractorGroup(filteredList);
                if (intersectionOfRdfMimetypes != null && !intersectionOfRdfMimetypes.isEmpty()) {
                    // If the detected mimetype is a generic, non-RDF mimetype, and the intersection
                    // of specific RDF mimetypes across all triple-producing extractors is non-empty,
                    // simply replace the generic mimetype with a specific RDF mimetype in that intersection.
                    detectedMIMEType = MIMEType.parse(intersectionOfRdfMimetypes.get(0));
                }
            } catch (ValidatorException ve) {
                throw new ExtractionException("An error occurred during the validation phase.", ve);
            }

            // Resource consolidation.
            final boolean addDomainTriples = extractionParameters
                    .getFlag(ExtractionParameters.METADATA_DOMAIN_PER_ENTITY_FLAG);
            final ExtractionContext consolidationContext;
            if (extractionParameters.getFlag(ExtractionParameters.METADATA_NESTING_FLAG)) {
                // Consolidation with nesting.
                consolidationContext = consolidateResources(resourceRoots, propertyPaths, addDomainTriples, output,
                        documentLanguage);
            } else {
                consolidationContext = consolidateResources(resourceRoots, addDomainTriples, output, documentLanguage);
            }

            // Adding time/size meta triples.
            if (extractionParameters.getFlag(ExtractionParameters.METADATA_TIMESIZE_FLAG)) {
                try {
                    addExtractionTimeSizeMetaTriples(consolidationContext);
                } catch (TripleHandlerException e) {
                    throw new ExtractionException(
                            String.format(Locale.ROOT,
                                    "Error while adding extraction metadata triples to document with IRI %s",
                                    documentIRI),
                            e);
                }
            }
        } finally {
            try {
                output.endDocument(documentIRI);
            } catch (TripleHandlerException e) {
                log.error(String.format(Locale.ROOT, "Error ending document with IRI %s", documentIRI));
                throw new ExtractionException(
                        String.format(Locale.ROOT, "Error ending document with IRI %s", documentIRI), e);
            }
        }

        return new SingleDocumentExtractionReport(
                documentReport == null ? EmptyValidationReport.getInstance() : documentReport.getReport(),
                extractorToIssues);
    }

    private static boolean isTooGeneric(MIMEType type) {
        if (type == null || type.isAnySubtype()) {
            return true;
        }
        String mt = type.getFullType();
        return mt.equals(MimeTypes.PLAIN_TEXT) || mt.equals(MimeTypes.OCTET_STREAM) || mt.equals(MimeTypes.XML);
    }

    /**
     * Triggers the execution of all the {@link Extractor}s registered to this class using the <i>default</i>
     * extraction parameters.
     *
     * @return the extraction report.
     *
     * @throws IOException
     *             if there is an error reading input from the document source.
     * @throws ExtractionException
     *             if there is an error during extraction.
     */
    public SingleDocumentExtractionReport run() throws IOException, ExtractionException {
        return run(ExtractionParameters.newDefault(configuration));
    }

    /**
     * Returns the detected mimetype for the given {@link org.apache.any23.source.DocumentSource}.
     *
     * @return string containing the detected mimetype.
     *
     * @throws IOException
     *             if an error occurred while accessing the data.
     */
    public String getDetectedMIMEType() throws IOException {
        filterExtractorsByMIMEType();
        return detectedMIMEType == null ? null : detectedMIMEType.toString();
    }

    /**
     * Checks whether the content of the given {@link org.apache.any23.source.DocumentSource} activates at least one
     * extractor.
     *
     * @return <code>true</code> if at least one extractor is activated, <code>false</code> otherwise.
     *
     * @throws IOException
     *             if there is an error locating matching extractors.
     */
    public boolean hasMatchingExtractors() throws IOException {
        filterExtractorsByMIMEType();
        return !matchingExtractors.isEmpty();
    }

    /**
     * @return the list of all the activated extractors for the given {@link org.apache.any23.source.DocumentSource}.
     */
    @SuppressWarnings("rawtypes")
    public List<Extractor> getMatchingExtractors() {
        final List<Extractor> extractorsList = new ArrayList<>();
        for (ExtractorFactory extractorFactory : matchingExtractors) {
            extractorsList.add(extractorFactory.createExtractor());
        }
        return extractorsList;
    }

    /**
     * @return the configured parsing encoding.
     */
    public String getParserEncoding() {
        if (this.parserEncoding == null) {
            this.parserEncoding = detectEncoding();
        }
        return this.parserEncoding;
    }

    /**
     * Sets the document parser encoding.
     *
     * @param encoding
     *            parser encoding.
     */
    public void setParserEncoding(String encoding) {
        this.parserEncoding = encoding;
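        // Invalidate any cached DOM report so the document is re-parsed with the new encoding.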
        documentReport = null;
    }

    /**
     * Checks whether the given {@link org.apache.any23.source.DocumentSource} is an <b>HTML</b> document.
     *
     * @return <code>true</code> if the document source is an HTML document, <code>false</code> otherwise.
     *
     * @throws IOException
     *             if an error occurs while accessing data.
     */
    private boolean isHTMLDocument() throws IOException {
        filterExtractorsByMIMEType();
        return !matchingExtractors.filterByMIMEType(MIMEType.parse("text/html")).isEmpty();
    }

    /**
     * Extracts the document language where possible.
     *
     * @param extractionParameters
     *            extraction parameters to be applied to determine the document language.
     *
     * @return the document language if any, <code>null</code> otherwise.
     *
     * @throws java.io.IOException
     *             if an error occurs during the document analysis.
     * @throws org.apache.any23.validator.ValidatorException
     *             if an error occurs during the document validation.
     */
    private String extractDocumentLanguage(ExtractionParameters extractionParameters)
            throws IOException, ValidatorException {
        if (!isHTMLDocument()) {
            return null;
        }
        final HTMLDocument document;
        try {
            document = new HTMLDocument(getTagSoupDOM(extractionParameters).getDocument());
        } catch (IOException ioe) {
            log.debug("Cannot extract language from document.", ioe);
            return null;
        }
        return document.getDefaultLanguage();
    }

    /**
     * Generates the list of extractors that can be applied to the given document.
     *
     * @throws IOException
     *             if an error occurs while accessing the document data.
     */
    private void filterExtractorsByMIMEType() throws IOException {
        if (matchingExtractors != null)
            return; // has already been run.

        if (detector == null || extractors.allExtractorsSupportAllContentTypes()) {
            matchingExtractors = extractors;
            return;
        }
        ensureHasLocalCopy();
        // Detect the MIME type based on the real file IRI rather than on the given base namespace.
        detectedMIMEType = detector.guessMIMEType(java.net.URI.create(in.getDocumentIRI()).getPath(),
                localDocumentSource.openInputStream(), MIMEType.parse(localDocumentSource.getContentType()));
        log.debug("detected media type: " + detectedMIMEType);
        matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
    }

    /**
     * Triggers the execution of a specific {@link Extractor}.
     *
     * @param extractionParameters
     *            the parameters used for the extraction.
     * @param documentLanguage
     *            the detected document language, <code>null</code> if unknown.
     * @param extractor
     *            the {@link Extractor} to be executed.
     *
     * @return the report of the single extraction, including issues, resource roots and property paths.
     *
     * @throws ExtractionException
     *             if an error specific to an extractor happens.
     * @throws IOException
     *             if an IO error occurs during the extraction.
     * @throws org.apache.any23.validator.ValidatorException
     *             if an error occurs during validation.
     */
    private SingleExtractionReport runExtractor(final ExtractionParameters extractionParameters,
            final String documentLanguage, final Extractor<?> extractor)
            throws ExtractionException, IOException, ValidatorException {
        if (log.isDebugEnabled()) {
            log.debug("Running {} on {}", extractor.getDescription().getExtractorName(), documentIRI);
        }
        long startTime = System.currentTimeMillis();
        final ExtractionContext extractionContext = new ExtractionContext(extractor.getDescription().getExtractorName(),
                documentIRI, documentLanguage);
        final ExtractionResultImpl extractionResult = new ExtractionResultImpl(extractionContext, extractor, output);
        try {
            if (extractor instanceof BlindExtractor) {
                final BlindExtractor blindExtractor = (BlindExtractor) extractor;
                blindExtractor.run(extractionParameters, extractionContext, documentIRI, extractionResult);
            } else if (extractor instanceof ContentExtractor) {
                ensureHasLocalCopy();
                final ContentExtractor contentExtractor = (ContentExtractor) extractor;
                contentExtractor.run(extractionParameters, extractionContext, localDocumentSource.openInputStream(),
                        extractionResult);
            } else if (extractor instanceof TagSoupDOMExtractor) {
                final TagSoupDOMExtractor tagSoupDOMExtractor = (TagSoupDOMExtractor) extractor;
                final DocumentReport documentReport = getTagSoupDOM(extractionParameters);
                tagSoupDOMExtractor.run(extractionParameters, extractionContext, documentReport.getDocument(),
                        extractionResult);
            } else {
                throw new IllegalStateException("Extractor type not supported: " + extractor.getClass());
            }
            return new SingleExtractionReport(extractionResult.getIssues(),
                    new ArrayList<ResourceRoot>(extractionResult.getResourceRoots()),
                    new ArrayList<PropertyPath>(extractionResult.getPropertyPaths()), extractionResult.wasTouched());
        } catch (ExtractionException ex) {
            if (log.isDebugEnabled()) {
                log.debug(extractor.getDescription().getExtractorName() + ": " + ex.getMessage());
            }
            throw ex;
        } finally {
            // Logging result error report.
            if (log.isDebugEnabled() && extractionResult.hasIssues()) {
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                extractionResult.printReport(new PrintStream(baos, true, "UTF-8"));
                log.debug(baos.toString("UTF-8"));
            }
            extractionResult.close();

            long elapsed = System.currentTimeMillis() - startTime;
            if (log.isDebugEnabled()) {
                log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms");
            }
        }
    }

    /**
     * Forces the retrieval of the document data, creating a local copy if the source is not already local.
     *
     * @throws IOException
     *             if an error occurs while copying the document data.
     */
    private void ensureHasLocalCopy() throws IOException {
        if (localDocumentSource != null)
            return;
        if (in.isLocal()) {
            localDocumentSource = in;
            return;
        }
        if (copyFactory == null) {
            copyFactory = new MemCopyFactory();
        }
        localDocumentSource = copyFactory.createLocalCopy(in);
    }

    /**
     * Returns the DOM of the given document source (that must be an HTML stream) and the report of any fixes applied
     * to it.
     *
     * @param extractionParameters
     *            parameters to be used during extraction.
     *
     * @return document report.
     *
     * @throws IOException
     *             if an error occurs during data access.
     * @throws ValidatorException
     *             if an error occurs during validation.
     */
    private DocumentReport getTagSoupDOM(ExtractionParameters extractionParameters)
            throws IOException, ValidatorException {
        if (documentReport == null || !extractionParameters.equals(tagSoupDOMRelatedParameters)) {
            ensureHasLocalCopy();
            final InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
            is.mark(Integer.MAX_VALUE);
            final String candidateEncoding = getParserEncoding();
            is.reset();
            final TagSoupParser tagSoupParser = new TagSoupParser(is, documentIRI.stringValue(), candidateEncoding);
            if (extractionParameters.isValidate()) {
                documentReport = tagSoupParser.getValidatedDOM(extractionParameters.isFix());
            } else {
                documentReport = new DocumentReport(EmptyValidationReport.getInstance(), tagSoupParser.getDOM());
            }
            tagSoupDOMRelatedParameters = extractionParameters;
        }
        return documentReport;
    }

    /**
     * Detects the encoding of the local document source input stream.
     *
     * @return a valid encoding value.
     */
    private String detectEncoding() {
        try {
            ensureHasLocalCopy();
            // try-with-resources ensures the stream is closed even if encoding detection fails.
            try (InputStream is = new BufferedInputStream(localDocumentSource.openInputStream())) {
                return this.encoderDetector.guessEncoding(is, localDocumentSource.getContentType());
            }
        } catch (Exception e) {
            throw new RuntimeException("An error occurred while trying to detect the input encoding.", e);
        }
    }

    /**
     * Verifies whether the <i>candidateSub</i> list of strings is a prefix of <i>list</i>.
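     * <p>
     * For example, {@code subPath(new String[] { "html", "body", "div" }, new String[] { "html", "body" })} returns
     * <code>true</code>, while swapping the arguments returns <code>false</code>.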
     *
     * @param list
     *            a list of strings.
     * @param candidateSub
     *            a candidate prefix list of strings.
     *
     * @return <code>true</code> if <i>candidateSub</i> is a prefix of <i>list</i>, <code>false</code> otherwise.
     */
    private boolean subPath(String[] list, String[] candidateSub) {
        if (candidateSub.length > list.length) {
            return false;
        }
        for (int i = 0; i < candidateSub.length; i++) {
            if (!candidateSub[i].equals(list[i])) {
                return false;
            }
        }
        return true;
    }

    /**
     * Adds a page domain triple for every resource root node.
     *
     * @param resourceRoots
     *            list of resource roots.
     * @param context
     *            extraction context to produce triples.
     *
     * @throws ExtractionException
     *             if an error occurs while writing triples.
     */
    private void addDomainTriplesPerResourceRoots(List<ResourceRoot> resourceRoots, ExtractionContext context)
            throws ExtractionException {
        try {
            // Add source Web domains to every resource root.
            String domain;
            try {
                domain = new java.net.URI(in.getDocumentIRI()).getHost();
            } catch (URISyntaxException urise) {
                throw new IllegalArgumentException("An error occurred while extracting the host from the document IRI.",
                        urise);
            }
            if (domain != null) {
                for (ResourceRoot resourceRoot : resourceRoots) {
                    output.receiveTriple(resourceRoot.getRoot(), vSINDICE.getProperty(SINDICE.DOMAIN),
                            SimpleValueFactory.getInstance().createLiteral(domain), null, context);
                }
            }
        } catch (TripleHandlerException e) {
            throw new ExtractionException("Error while writing triple.", e);
        } finally {
            try {
                output.closeContext(context);
            } catch (TripleHandlerException e) {
                throw new ExtractionException("Error while closing context.", e);
            }
        }
    }

    /**
     * @param defaultLanguage
     *            the default language detected for the document, <code>null</code> if unknown.
     *
     * @return an extraction context specific for consolidation triples.
     */
    private ExtractionContext createExtractionContext(String defaultLanguage) {
        return new ExtractionContext("consolidation-extractor", documentIRI, defaultLanguage,
                UUID.randomUUID().toString());
    }

    /**
     * Detects the nesting relationships among different microformats and makes them explicit by adding connection
     * triples.
     *
     * @param resourceRoots
     *            list of resource roots.
     * @param propertyPaths
     *            list of property paths.
     * @param context
     *            extraction context to produce triples.
     *
     * @throws TripleHandlerException
     *             if an error occurs while writing the nesting triples.
     */
    private void addNestingRelationship(List<ResourceRoot> resourceRoots, List<PropertyPath> propertyPaths,
            ExtractionContext context) throws TripleHandlerException {
        ResourceRoot currentResourceRoot;
        PropertyPath currentPropertyPath;
        for (int r = 0; r < resourceRoots.size(); r++) {
            currentResourceRoot = resourceRoots.get(r);
            for (int p = 0; p < propertyPaths.size(); p++) {
                currentPropertyPath = propertyPaths.get(p);
                Class<? extends MicroformatExtractor> currentResourceRootExtractor = currentResourceRoot.getExtractor();
                Class<? extends MicroformatExtractor> currentPropertyPathExtractor = currentPropertyPath.getExtractor();
                // Avoid wrong nesting relationships.
                if (currentResourceRootExtractor.equals(currentPropertyPathExtractor)) {
                    continue;
                }
                // Avoid self declaring relationships.
                if (MicroformatExtractor.includes(currentPropertyPathExtractor, currentResourceRootExtractor)) {
                    continue;
                }
                if (subPath(currentResourceRoot.getPath(), currentPropertyPath.getPath())) {
                    createNestingRelationship(currentPropertyPath, currentResourceRoot, output, context);
                }
            }
        }
    }

    /**
     * This method consolidates the graphs extracted from the same document. In particular it adds:
     * <ul>
     * <li>for every microformat root node, a triple indicating the original Web page domain;</li>
     * <li>triples indicating the nesting relationship among a microformat root and property paths of other nested
     * microformats.</li>
     * </ul>
     *
     * @param resourceRoots
     *            list of RDF nodes representing roots of extracted microformat graphs and the corresponding HTML paths.
     * @param propertyPaths
     *            list of RDF nodes representing property subjects, property IRIs and the HTML paths from which such
     *            properties have been extracted.
     * @param addDomainTriples
     *            if <code>true</code>, a domain triple is added for every resource root.
     * @param output
     *            a triple handler event collector.
     * @param defaultLanguage
     *            the default language of the document.
     *
     * @return the extraction context used to consolidate the resources.
     *
     * @throws ExtractionException
     *             if an error occurs during the consolidation.
     */
    private ExtractionContext consolidateResources(List<ResourceRoot> resourceRoots, List<PropertyPath> propertyPaths,
            boolean addDomainTriples, TripleHandler output, String defaultLanguage) throws ExtractionException {
        final ExtractionContext context = createExtractionContext(defaultLanguage);

        try {
            output.openContext(context);
        } catch (TripleHandlerException e) {
            throw new ExtractionException(
                    String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), e);
        }

        try {
            if (addDomainTriples) {
                addDomainTriplesPerResourceRoots(resourceRoots, context);
            }
            addNestingRelationship(resourceRoots, propertyPaths, context);
        } catch (TripleHandlerException the) {
            throw new ExtractionException("Error while writing triple.", the);
        } finally {
            try {
                output.closeContext(context);
            } catch (TripleHandlerException e) {
                throw new ExtractionException("Error while closing context.", e);
            }
        }

        return context;
    }

    /**
     * This method consolidates the graphs extracted from the same document. In particular it adds:
     * <ul>
     * <li>for every microformat root node, a triple indicating the original Web page domain.</li>
     * </ul>
     *
     * @param resourceRoots
     *            list of RDF nodes representing roots of extracted microformat graphs and the corresponding HTML paths
     *            from which such roots have been extracted.
     * @param addDomainTriples
     *            if <code>true</code>, a domain triple is added for every resource root.
     * @param output
     *            a triple handler event collector.
     * @param defaultLanguage
     *            the default language of the document.
     *
     * @return the extraction context used to consolidate the resources.
     *
     * @throws ExtractionException
     *             if an error occurs during the consolidation.
     */
    private ExtractionContext consolidateResources(List<ResourceRoot> resourceRoots, boolean addDomainTriples,
            TripleHandler output, String defaultLanguage) throws ExtractionException {
        final ExtractionContext context = createExtractionContext(defaultLanguage);

        try {
            output.openContext(context);
        } catch (TripleHandlerException e) {
            throw new ExtractionException(
                    String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), e);
        }

        try {
            if (addDomainTriples) {
                addDomainTriplesPerResourceRoots(resourceRoots, context);
            }
        } finally {
            try {
                output.closeContext(context);
            } catch (TripleHandlerException the) {
                throw new ExtractionException("Error while closing context.", the);
            }
        }

        return context;
    }

    /**
     * Adds metadata triples containing the number of extracted triples and the extraction timestamp.
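     * <p>
     * For a document from which, say, 10 triples were extracted, the emitted triples have the following shape (the
     * IRI and literal values are illustrative, properties from the sindice vocabulary):
     *
     * <pre>
     * &lt;http://example.org/doc&gt; sindice:date "2021-01-01T00:00:00Z" .
     * &lt;http://example.org/doc&gt; sindice:size "11" .
     * </pre>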
     *
     * @param context
     *            the extraction context receiving the metadata triples.
     *
     * @throws TripleHandlerException
     *             if an error occurs while writing the metadata triples.
     */
    private void addExtractionTimeSizeMetaTriples(ExtractionContext context) throws TripleHandlerException {
        // Adding the extraction date.
        String xsdDateTimeNow = RDFUtils.toXSDDateTime(new Date());
        output.receiveTriple(SimpleValueFactory.getInstance().createIRI(documentIRI.toString()),
                vSINDICE.getProperty(SINDICE.DATE), SimpleValueFactory.getInstance().createLiteral(xsdDateTimeNow),
                null, context);

        // Adding the number of extracted triples.
        int numberOfTriples = 0;
        CompositeTripleHandler cth = (CompositeTripleHandler) output;
        for (TripleHandler th : cth.getChilds()) {
            if (th instanceof CountingTripleHandler) {
                numberOfTriples = ((CountingTripleHandler) th).getCount();
            }
        }
        // The reported size is the number of extracted triples plus the size triple itself.
        output.receiveTriple(SimpleValueFactory.getInstance().createIRI(documentIRI.toString()),
                vSINDICE.getProperty(SINDICE.SIZE), SimpleValueFactory.getInstance().createLiteral(numberOfTriples + 1),
                null, context);
    }

    /**
     * Creates a nesting relationship triple.
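     * <p>
     * The relationship is reified through a blank node <code>_:n</code>; the emitted triples have the following
     * shape (a sketch, with placeholders for the actual property, object and subject values):
     *
     * <pre>
     * _:n sindice:nesting_original &lt;from.property&gt; .
     * _:n sindice:nesting_structured &lt;from.object or to.root&gt; .
     * &lt;from.subject&gt; sindice:nesting _:n .
     * </pre>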
     *
     * @param from
     *            the property containing the nested microformat.
     * @param to
     *            the root of the nested microformat.
     * @param th
     *            the triple handler.
     * @param ec
     *            the extraction context used to add such information.
     *
     * @throws org.apache.any23.writer.TripleHandlerException
     *             if an error occurs while writing the nesting triples.
     */
    private void createNestingRelationship(PropertyPath from, ResourceRoot to, TripleHandler th, ExtractionContext ec)
            throws TripleHandlerException {
        final BNode fromObject = from.getObject();
        final String bNodeHash = from.getProperty().stringValue() + (fromObject == null ? "" : fromObject.getID());
        BNode bnode = RDFUtils.getBNode(bNodeHash);
        th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), from.getProperty(), null, ec);
        th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED),
                fromObject == null ? to.getRoot() : fromObject, null, ec);
        th.receiveTriple(from.getSubject(), vSINDICE.getProperty(SINDICE.NESTING), bnode, null, ec);
    }

    /**
     * Entity detection report.
     */
    private static class SingleExtractionReport {
        private final Collection<IssueReport.Issue> issues;
        private final List<ResourceRoot> resourceRoots;
        private final List<PropertyPath> propertyPaths;
        private final boolean touched;

        public SingleExtractionReport(Collection<IssueReport.Issue> issues, List<ResourceRoot> resourceRoots,
                List<PropertyPath> propertyPaths, boolean wasTouched) {
            this.issues = issues;
            this.resourceRoots = resourceRoots;
            this.propertyPaths = propertyPaths;
            this.touched = wasTouched;
        }
    }

}