1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor;
19
20 import org.apache.any23.configuration.Configuration;
21 import org.apache.any23.configuration.DefaultConfiguration;
22 import org.apache.any23.encoding.EncodingDetector;
23 import org.apache.any23.encoding.TikaEncodingDetector;
24 import org.apache.any23.extractor.html.DocumentReport;
25 import org.apache.any23.extractor.html.HTMLDocument;
26 import org.apache.any23.extractor.html.MicroformatExtractor;
27 import org.apache.any23.extractor.html.TagSoupParser;
28 import org.apache.any23.mime.MIMEType;
29 import org.apache.any23.mime.MIMETypeDetector;
30 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
31 import org.apache.any23.rdf.RDFUtils;
32 import org.apache.any23.source.DocumentSource;
33 import org.apache.any23.source.LocalCopyFactory;
34 import org.apache.any23.source.MemCopyFactory;
35 import org.apache.any23.validator.EmptyValidationReport;
36 import org.apache.any23.validator.ValidatorException;
37 import org.apache.any23.vocab.SINDICE;
38 import org.apache.any23.writer.CompositeTripleHandler;
39 import org.apache.any23.writer.CountingTripleHandler;
40 import org.apache.any23.writer.TripleHandler;
41 import org.apache.any23.writer.TripleHandlerException;
42 import org.apache.any23.extractor.Extractor.BlindExtractor;
43 import org.apache.any23.extractor.Extractor.ContentExtractor;
44 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
45 import org.apache.tika.mime.MimeTypes;
46 import org.eclipse.rdf4j.model.BNode;
47 import org.eclipse.rdf4j.model.IRI;
48 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
49 import org.slf4j.Logger;
50 import org.slf4j.LoggerFactory;
51
52 import java.io.BufferedInputStream;
53 import java.io.ByteArrayOutputStream;
54 import java.io.IOException;
55 import java.io.InputStream;
56 import java.io.PrintStream;
57 import java.net.URISyntaxException;
58 import java.nio.charset.StandardCharsets;
59 import java.util.ArrayList;
60 import java.util.Collection;
61 import java.util.Collections;
62 import java.util.Date;
63 import java.util.HashMap;
64 import java.util.List;
65 import java.util.Locale;
66 import java.util.Map;
67 import java.util.UUID;
68 import java.util.stream.Collectors;
69
70 import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath;
71 import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot;
72
73
74
75
76 public class SingleDocumentExtraction {
77
78 private static final SINDICE vSINDICE = SINDICE.getInstance();
79
80 private static final Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);
81
82 private final Configuration configuration;
83
84 private final DocumentSource in;
85
86 private IRI documentIRI;
87
88 private final ExtractorGroup extractors;
89
90 private final TripleHandler output;
91
92 private final EncodingDetector encoderDetector;
93
94 private LocalCopyFactory copyFactory = null;
95
96 private DocumentSource localDocumentSource = null;
97
98 private MIMETypeDetector detector = null;
99
100 private ExtractorGroup matchingExtractors = null;
101
102 private MIMEType detectedMIMEType = null;
103
104 private DocumentReport documentReport = null;
105
106 private ExtractionParameters tagSoupDOMRelatedParameters = null;
107
108 private String parserEncoding = null;
109
110
111
112
113
114
115
116
117
118
119
120
121
122 public SingleDocumentExtraction(Configuration configuration, DocumentSource in, ExtractorGroup extractors,
123 TripleHandler output) {
124 if (configuration == null)
125 throw new NullPointerException("configuration cannot be null.");
126 if (in == null)
127 throw new NullPointerException("in cannot be null.");
128 this.configuration = configuration;
129 this.in = in;
130 this.extractors = extractors;
131
132 List<TripleHandler> tripleHandlers = new ArrayList<>();
133 tripleHandlers.add(output);
134 tripleHandlers.add(new CountingTripleHandler());
135 this.output = new CompositeTripleHandler(tripleHandlers);
136 this.encoderDetector = new TikaEncodingDetector();
137 }
138
139
140
141
142
143
144
145
146
147
148
149
150
/**
 * Builds an extraction that runs a single extractor over the given document. The MIME type detector
 * is explicitly cleared, so the extractor is applied without MIME type filtering.
 *
 * @param configuration
 *            configuration applied during extraction; must not be {@code null}.
 * @param in
 *            the document to be processed; must not be {@code null}.
 * @param factory
 *            the factory of the only extractor to be applied.
 * @param output
 *            the handler receiving the produced triples.
 */
public SingleDocumentExtraction(Configuration configuration, DocumentSource in, ExtractorFactory<?> factory,
        TripleHandler output) {
    this(
            configuration,
            in,
            new ExtractorGroup(Collections.<ExtractorFactory<?>> singletonList(factory)),
            output);
    setMIMETypeDetector(null);
}
156
157
158
159
160
161
162
163
164
165
166
167
/**
 * Builds an extraction that runs a single extractor over the given document, using the default
 * configuration singleton. The MIME type detector is cleared, as in the configuration-taking variant.
 *
 * @param in
 *            the document to be processed; must not be {@code null}.
 * @param factory
 *            the factory of the only extractor to be applied.
 * @param output
 *            the handler receiving the produced triples.
 */
public SingleDocumentExtraction(DocumentSource in, ExtractorFactory<?> factory, TripleHandler output) {
    // The single-factory constructor wraps the factory in a one-element group and clears the
    // MIME type detector, which is exactly what this constructor needs.
    this(DefaultConfiguration.singleton(), in, factory, output);
}
173
174
175
176
177
178
179
180
181
182
183 public void setLocalCopyFactory(LocalCopyFactory copyFactory) {
184 this.copyFactory = copyFactory;
185 }
186
187
188
189
190
191
192
193
194 public void setMIMETypeDetector(MIMETypeDetector detector) {
195 this.detector = detector;
196 }
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212 public SingleDocumentExtractionReport run(ExtractionParameters extractionParameters)
213 throws ExtractionException, IOException {
214 if (extractionParameters == null) {
215 extractionParameters = ExtractionParameters.newDefault(configuration);
216 }
217
218 final String contextIRI = extractionParameters
219 .getProperty(ExtractionParameters.EXTRACTION_CONTEXT_IRI_PROPERTY);
220 ensureHasLocalCopy();
221 try {
222 this.documentIRI = new Any23ValueFactoryWrapper(SimpleValueFactory.getInstance())
223 .createIRI("?".equals(contextIRI) ? in.getDocumentIRI() : contextIRI);
224 } catch (Exception ex) {
225 throw new IllegalArgumentException("Invalid IRI: " + in.getDocumentIRI(), ex);
226 }
227 if (log.isDebugEnabled()) {
228 log.debug("Processing " + this.documentIRI);
229 }
230 filterExtractorsByMIMEType();
231
232 if (log.isDebugEnabled()) {
233 StringBuilder sb = new StringBuilder("Extractors ");
234 for (ExtractorFactory<?> factory : matchingExtractors) {
235 sb.append(factory.getExtractorName());
236 sb.append(' ');
237 }
238 sb.append("match ").append(documentIRI);
239 log.debug(sb.toString());
240 }
241
242 final List<ResourceRoot> resourceRoots = new ArrayList<>();
243 final List<PropertyPath> propertyPaths = new ArrayList<>();
244 final Map<String, Collection<IssueReport.Issue>> extractorToIssues = new HashMap<>();
245
246
247 try {
248 output.startDocument(documentIRI);
249 } catch (TripleHandlerException e) {
250 log.error(String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI));
251 throw new ExtractionException(
252 String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), e);
253 }
254 try {
255 output.setContentLength(in.getContentLength());
256
257 final String documentLanguage;
258 try {
259 documentLanguage = extractDocumentLanguage(extractionParameters);
260 ArrayList<ExtractorFactory<?>> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
261 final boolean mimeTypeIsTooGeneric = isTooGeneric(detectedMIMEType);
262 ArrayList<String> intersectionOfRdfMimetypes = null;
263 for (ExtractorFactory<?> factory : matchingExtractors) {
264 final Extractor<?> extractor = factory.createExtractor();
265 final SingleExtractionReport er = runExtractor(extractionParameters, documentLanguage, extractor);
266
267 if (mimeTypeIsTooGeneric) {
268 List<String> rdfMimetypes = factory.getSupportedMIMETypes().stream()
269 .filter(mt -> !isTooGeneric(mt)).map(MIMEType::getFullType)
270 .collect(Collectors.toList());
271 if (er.touched) {
272
273
274
275
276
277 if (intersectionOfRdfMimetypes == null) {
278 intersectionOfRdfMimetypes = new ArrayList<>(rdfMimetypes);
279 } else {
280 intersectionOfRdfMimetypes.retainAll(rdfMimetypes);
281 }
282 } else if (!rdfMimetypes.isEmpty()) {
283
284
285
286
287 continue;
288 }
289 }
290 resourceRoots.addAll(er.resourceRoots);
291 propertyPaths.addAll(er.propertyPaths);
292 filteredList.add(factory);
293 extractorToIssues.put(factory.getExtractorName(), er.issues);
294 }
295 matchingExtractors = new ExtractorGroup(filteredList);
296 if (intersectionOfRdfMimetypes != null && !intersectionOfRdfMimetypes.isEmpty()) {
297
298
299
300 detectedMIMEType = MIMEType.parse(intersectionOfRdfMimetypes.get(0));
301 }
302 } catch (ValidatorException ve) {
303 throw new ExtractionException("An error occurred during the validation phase.", ve);
304 }
305
306
307 final boolean addDomainTriples = extractionParameters
308 .getFlag(ExtractionParameters.METADATA_DOMAIN_PER_ENTITY_FLAG);
309 final ExtractionContext consolidationContext;
310 if (extractionParameters.getFlag(ExtractionParameters.METADATA_NESTING_FLAG)) {
311
312 consolidationContext = consolidateResources(resourceRoots, propertyPaths, addDomainTriples, output,
313 documentLanguage);
314 } else {
315 consolidationContext = consolidateResources(resourceRoots, addDomainTriples, output, documentLanguage);
316 }
317
318
319 if (extractionParameters.getFlag(ExtractionParameters.METADATA_TIMESIZE_FLAG)) {
320 try {
321 addExtractionTimeSizeMetaTriples(consolidationContext);
322 } catch (TripleHandlerException e) {
323 throw new ExtractionException(
324 String.format(Locale.ROOT,
325 "Error while adding extraction metadata triples document with IRI %s", documentIRI),
326 e);
327 }
328 }
329 } finally {
330 try {
331 output.endDocument(documentIRI);
332 } catch (TripleHandlerException e) {
333 log.error(String.format(Locale.ROOT, "Error ending document with IRI %s", documentIRI));
334 throw new ExtractionException(
335 String.format(Locale.ROOT, "Error ending document with IRI %s", documentIRI), e);
336 }
337 }
338
339 return new SingleDocumentExtractionReport(
340 documentReport == null ? EmptyValidationReport.getInstance() : documentReport.getReport(),
341 extractorToIssues);
342 }
343
344 private static boolean isTooGeneric(MIMEType type) {
345 if (type == null || type.isAnySubtype()) {
346 return true;
347 }
348 String mt = type.getFullType();
349 return mt.equals(MimeTypes.PLAIN_TEXT) || mt.equals(MimeTypes.OCTET_STREAM) || mt.equals(MimeTypes.XML);
350 }
351
352
353
354
355
356
357
358
359
360
361
362
363 public SingleDocumentExtractionReport run() throws IOException, ExtractionException {
364 return run(ExtractionParameters.newDefault(configuration));
365 }
366
367
368
369
370
371
372
373
374
375 public String getDetectedMIMEType() throws IOException {
376 filterExtractorsByMIMEType();
377 return detectedMIMEType == null ? null : detectedMIMEType.toString();
378 }
379
380
381
382
383
384
385
386
387
388
389 public boolean hasMatchingExtractors() throws IOException {
390 filterExtractorsByMIMEType();
391 return !matchingExtractors.isEmpty();
392 }
393
394
395
396
397 @SuppressWarnings("rawtypes")
398 public List<Extractor> getMatchingExtractors() {
399 final List<Extractor> extractorsList = new ArrayList<>();
400 for (ExtractorFactory extractorFactory : matchingExtractors) {
401 extractorsList.add(extractorFactory.createExtractor());
402 }
403 return extractorsList;
404 }
405
406
407
408
409 public String getParserEncoding() {
410 if (this.parserEncoding == null) {
411 this.parserEncoding = detectEncoding();
412 }
413 return this.parserEncoding;
414 }
415
416
417
418
419
420
421
422 public void setParserEncoding(String encoding) {
423 this.parserEncoding = encoding;
424 documentReport = null;
425 }
426
427
428
429
430
431
432
433
434
435 private boolean isHTMLDocument() throws IOException {
436 filterExtractorsByMIMEType();
437 return !matchingExtractors.filterByMIMEType(MIMEType.parse("text/html")).isEmpty();
438 }
439
440
441
442
443
444
445
446
447
448
449
450
451
452 private String extractDocumentLanguage(ExtractionParameters extractionParameters)
453 throws IOException, ValidatorException {
454 if (!isHTMLDocument()) {
455 return null;
456 }
457 final HTMLDocument document;
458 try {
459 document = new HTMLDocument(getTagSoupDOM(extractionParameters).getDocument());
460 } catch (IOException ioe) {
461 log.debug("Cannot extract language from document.", ioe);
462 return null;
463 }
464 return document.getDefaultLanguage();
465 }
466
467
468
469
470
471
472 private void filterExtractorsByMIMEType() throws IOException {
473 if (matchingExtractors != null)
474 return;
475
476 if (detector == null || extractors.allExtractorsSupportAllContentTypes()) {
477 matchingExtractors = extractors;
478 return;
479 }
480 ensureHasLocalCopy();
481
482 detectedMIMEType = detector.guessMIMEType(java.net.URI.create(in.getDocumentIRI()).getPath(),
483 localDocumentSource.openInputStream(), MIMEType.parse(localDocumentSource.getContentType()));
484 log.debug("detected media type: " + detectedMIMEType);
485 matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
486 }
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506 private SingleExtractionReport runExtractor(final ExtractionParameters extractionParameters,
507 final String documentLanguage, final Extractor<?> extractor)
508 throws ExtractionException, IOException, ValidatorException {
509 if (log.isDebugEnabled()) {
510 log.debug("Running {} on {}", extractor.getDescription().getExtractorName(), documentIRI);
511 }
512 long startTime = System.currentTimeMillis();
513 final ExtractionContextExtractionContext">ExtractionContext extractionContext = new ExtractionContext(extractor.getDescription().getExtractorName(),
514 documentIRI, documentLanguage);
515 final ExtractionResultImpl#ExtractionResultImpl">ExtractionResultImpl extractionResult = new ExtractionResultImpl(extractionContext, extractor, output);
516 try {
517 if (extractor instanceof BlindExtractor) {
518 final BlindExtractor blindExtractor = (BlindExtractor) extractor;
519 blindExtractor.run(extractionParameters, extractionContext, documentIRI, extractionResult);
520 } else if (extractor instanceof ContentExtractor) {
521 ensureHasLocalCopy();
522 final ContentExtractor contentExtractor = (ContentExtractor) extractor;
523 contentExtractor.run(extractionParameters, extractionContext, localDocumentSource.openInputStream(),
524 extractionResult);
525 } else if (extractor instanceof TagSoupDOMExtractor) {
526 final TagSoupDOMExtractor tagSoupDOMExtractor = (TagSoupDOMExtractor) extractor;
527 final DocumentReport documentReport = getTagSoupDOM(extractionParameters);
528 tagSoupDOMExtractor.run(extractionParameters, extractionContext, documentReport.getDocument(),
529 extractionResult);
530 } else {
531 throw new IllegalStateException("Extractor type not supported: " + extractor.getClass());
532 }
533 return new SingleExtractionReport(extractionResult.getIssues(),
534 new ArrayList<ResourceRoot>(extractionResult.getResourceRoots()),
535 new ArrayList<PropertyPath>(extractionResult.getPropertyPaths()), extractionResult.wasTouched());
536 } catch (ExtractionException ex) {
537 if (log.isDebugEnabled()) {
538 log.debug(extractor.getDescription().getExtractorName() + ": " + ex.getMessage());
539 }
540 throw ex;
541 } finally {
542
543 if (log.isDebugEnabled() && extractionResult.hasIssues()) {
544 ByteArrayOutputStream baos = new ByteArrayOutputStream();
545 extractionResult.printReport(new PrintStream(baos, true, "UTF-8"));
546 log.debug(baos.toString("UTF-8"));
547 }
548 extractionResult.close();
549
550 long elapsed = System.currentTimeMillis() - startTime;
551 if (log.isDebugEnabled()) {
552 log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms");
553 }
554 }
555 }
556
557
558
559
560
561
562 private void ensureHasLocalCopy() throws IOException {
563 if (localDocumentSource != null)
564 return;
565 if (in.isLocal()) {
566 localDocumentSource = in;
567 return;
568 }
569 if (copyFactory == null) {
570 copyFactory = new MemCopyFactory();
571 }
572 localDocumentSource = copyFactory.createLocalCopy(in);
573 }
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589 private DocumentReport getTagSoupDOM(ExtractionParameters extractionParameters)
590 throws IOException, ValidatorException {
591 if (documentReport == null || !extractionParameters.equals(tagSoupDOMRelatedParameters)) {
592 ensureHasLocalCopy();
593 final InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
594 is.mark(Integer.MAX_VALUE);
595 final String candidateEncoding = getParserEncoding();
596 is.reset();
597 final TagSoupParserser.html#TagSoupParser">TagSoupParser tagSoupParser = new TagSoupParser(is, documentIRI.stringValue(), candidateEncoding);
598 if (extractionParameters.isValidate()) {
599 documentReport = tagSoupParser.getValidatedDOM(extractionParameters.isFix());
600 } else {
601 documentReport = new DocumentReport(EmptyValidationReport.getInstance(), tagSoupParser.getDOM());
602 }
603 tagSoupDOMRelatedParameters = extractionParameters;
604 }
605 return documentReport;
606 }
607
608
609
610
611
612
613 private String detectEncoding() {
614 try {
615 ensureHasLocalCopy();
616 InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
617 String encoding = this.encoderDetector.guessEncoding(is, localDocumentSource.getContentType());
618 is.close();
619 return encoding;
620 } catch (Exception e) {
621 throw new RuntimeException("An error occurred while trying to detect the input encoding.", e);
622 }
623 }
624
625
626
627
628
629
630
631
632
633
634
635 private boolean subPath(String[] list, String[] candidateSub) {
636 if (candidateSub.length > list.length) {
637 return false;
638 }
639 for (int i = 0; i < candidateSub.length; i++) {
640 if (!candidateSub[i].equals(list[i])) {
641 return false;
642 }
643 }
644 return true;
645 }
646
647
648
649
650
651
652
653
654
655
656
657 private void addDomainTriplesPerResourceRoots(List<ResourceRoot> resourceRoots, ExtractionContext context)
658 throws ExtractionException {
659 try {
660
661 String domain;
662 try {
663 domain = new java.net.URI(in.getDocumentIRI()).getHost();
664 } catch (URISyntaxException urise) {
665 throw new IllegalArgumentException("An error occurred while extracting the host from the document IRI.",
666 urise);
667 }
668 if (domain != null) {
669 for (ResourceRoot resourceRoot : resourceRoots) {
670 output.receiveTriple(resourceRoot.getRoot(), vSINDICE.getProperty(SINDICE.DOMAIN),
671 SimpleValueFactory.getInstance().createLiteral(domain), null, context);
672 }
673 }
674 } catch (TripleHandlerException e) {
675 throw new ExtractionException("Error while writing triple triple.", e);
676 } finally {
677 try {
678 output.closeContext(context);
679 } catch (TripleHandlerException e) {
680 throw new ExtractionException("Error while closing context.", e);
681 }
682 }
683 }
684
685
686
687
688 private ExtractionContext createExtractionContext(String defaultLanguage) {
689 return new ExtractionContext("consolidation-extractor", documentIRI, defaultLanguage,
690 UUID.randomUUID().toString());
691 }
692
693
694
695
696
697
698
699
700
701
702 private void addNestingRelationship(List<ResourceRoot> resourceRoots, List<PropertyPath> propertyPaths,
703 ExtractionContext context) throws TripleHandlerException {
704 ResourceRoot currentResourceRoot;
705 PropertyPath currentPropertyPath;
706 for (int r = 0; r < resourceRoots.size(); r++) {
707 currentResourceRoot = resourceRoots.get(r);
708 for (int p = 0; p < propertyPaths.size(); p++) {
709 currentPropertyPath = propertyPaths.get(p);
710 Class<? extends MicroformatExtractor> currentResourceRootExtractor = currentResourceRoot.getExtractor();
711 Class<? extends MicroformatExtractor> currentPropertyPathExtractor = currentPropertyPath.getExtractor();
712
713 if (currentResourceRootExtractor.equals(currentPropertyPathExtractor)) {
714 continue;
715 }
716
717 if (MicroformatExtractor.includes(currentPropertyPathExtractor, currentResourceRootExtractor)) {
718 continue;
719 }
720 if (subPath(currentResourceRoot.getPath(), currentPropertyPath.getPath())) {
721 createNestingRelationship(currentPropertyPath, currentResourceRoot, output, context);
722 }
723 }
724 }
725 }
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748 private ExtractionContext consolidateResources(List<ResourceRoot> resourceRoots, List<PropertyPath> propertyPaths,
749 boolean addDomainTriples, TripleHandler output, String defaultLanguage) throws ExtractionException {
750 final ExtractionContext context = createExtractionContext(defaultLanguage);
751
752 try {
753 output.openContext(context);
754 } catch (TripleHandlerException e) {
755 throw new ExtractionException(
756 String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), e);
757 }
758
759 try {
760 if (addDomainTriples) {
761 addDomainTriplesPerResourceRoots(resourceRoots, context);
762 }
763 addNestingRelationship(resourceRoots, propertyPaths, context);
764 } catch (TripleHandlerException the) {
765 throw new ExtractionException("Error while writing triple triple.", the);
766 } finally {
767 try {
768 output.closeContext(context);
769 } catch (TripleHandlerException e) {
770 throw new ExtractionException("Error while closing context.", e);
771 }
772 }
773
774 return context;
775 }
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794 private ExtractionContext consolidateResources(List<ResourceRoot> resourceRoots, boolean addDomainTriples,
795 TripleHandler output, String defaultLanguage) throws ExtractionException {
796 final ExtractionContext context = createExtractionContext(defaultLanguage);
797
798 try {
799 output.openContext(context);
800 } catch (TripleHandlerException e) {
801 throw new ExtractionException(
802 String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), e);
803 }
804
805 try {
806 if (addDomainTriples) {
807 addDomainTriplesPerResourceRoots(resourceRoots, context);
808 }
809 } finally {
810 try {
811 output.closeContext(context);
812 } catch (TripleHandlerException the) {
813 throw new ExtractionException("Error while closing context.", the);
814 }
815 }
816
817 return context;
818 }
819
820
821
822
823
824
825
826
827 private void addExtractionTimeSizeMetaTriples(ExtractionContext context) throws TripleHandlerException {
828
829 String xsdDateTimeNow = RDFUtils.toXSDDateTime(new Date());
830 output.receiveTriple(SimpleValueFactory.getInstance().createIRI(documentIRI.toString()),
831 vSINDICE.getProperty(SINDICE.DATE), SimpleValueFactory.getInstance().createLiteral(xsdDateTimeNow),
832 null, context);
833
834
835 int numberOfTriples = 0;
836 CompositeTripleHandlerrg/apache/any23/writer/CompositeTripleHandler.html#CompositeTripleHandler">CompositeTripleHandler cth = (CompositeTripleHandler) output;
837 for (TripleHandler th : cth.getChilds()) {
838 if (th instanceof CountingTripleHandler) {
839 numberOfTriples = ((CountingTripleHandler) th).getCount();
840 }
841 }
842 output.receiveTriple(SimpleValueFactory.getInstance().createIRI(documentIRI.toString()),
843 vSINDICE.getProperty(SINDICE.SIZE), SimpleValueFactory.getInstance().createLiteral(numberOfTriples + 1),
844
845
846
847
848
849 null, context);
850 }
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866 private void createNestingRelationship(PropertyPath from, ResourceRoot to, TripleHandler th, ExtractionContext ec)
867 throws TripleHandlerException {
868 final BNode fromObject = from.getObject();
869 final String bNodeHash = from.getProperty().stringValue() + (fromObject == null ? "" : fromObject.getID());
870 BNode bnode = RDFUtils.getBNode(bNodeHash);
871 th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), from.getProperty(), null, ec);
872 th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED),
873 from.getObject() == null ? to.getRoot() : from.getObject(), null, ec);
874 th.receiveTriple(from.getSubject(), vSINDICE.getProperty(SINDICE.NESTING), bnode, null, ec);
875 }
876
877
878
879
880 private static class SingleExtractionReport {
881 private final Collection<IssueReport.Issue> issues;
882 private final List<ResourceRoot> resourceRoots;
883 private final List<PropertyPath> propertyPaths;
884 private final boolean touched;
885
886 public SingleExtractionReport(Collection<IssueReport.Issue> issues, List<ResourceRoot> resourceRoots,
887 List<PropertyPath> propertyPaths, boolean wasTouched) {
888 this.issues = issues;
889 this.resourceRoots = resourceRoots;
890 this.propertyPaths = propertyPaths;
891 this.touched = wasTouched;
892 }
893 }
894
895 }