1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.rdfa;
19
20 import java.io.IOException;
21 import java.net.MalformedURLException;
22 import java.net.URISyntaxException;
23 import java.net.URL;
24 import java.util.ArrayList;
25 import java.util.HashMap;
26 import java.util.List;
27 import java.util.Locale;
28 import java.util.Map;
29 import java.util.Stack;
30 import javax.xml.transform.TransformerException;
31 import org.apache.any23.extractor.ExtractionResult;
32 import org.apache.any23.extractor.IssueReport;
33 import org.apache.any23.extractor.html.DomUtils;
34 import org.apache.any23.rdf.RDFUtils;
35 import org.eclipse.rdf4j.model.IRI;
36 import org.eclipse.rdf4j.model.Literal;
37 import org.eclipse.rdf4j.model.Resource;
38 import org.eclipse.rdf4j.model.Value;
39 import org.eclipse.rdf4j.model.vocabulary.RDF;
40 import org.slf4j.Logger;
41 import org.slf4j.LoggerFactory;
42 import org.w3c.dom.Document;
43 import org.w3c.dom.NamedNodeMap;
44 import org.w3c.dom.Node;
45 import org.w3c.dom.NodeList;
46
47
48
49
50
51
52
53
54
55
56
57 @Deprecated
58 public class RDFa11Parser {
59
60 private static final Logger logger = LoggerFactory.getLogger(RDFa11Parser.class);
61
62 public static final String CURIE_SEPARATOR = ":";
63 public static final char IRI_PREFIX_SEPARATOR = ':';
64 public static final String IRI_SCHEMA_SEPARATOR = "://";
65 public static final String IRI_PATH_SEPARATOR = "/";
66
67 public static final String HEAD_TAG = "HEAD";
68 public static final String BODY_TAG = "BODY";
69
70 public static final String XMLNS_ATTRIBUTE = "xmlns";
71 public static final String XML_LANG_ATTRIBUTE = "xml:lang";
72
73 public static final String REL_ATTRIBUTE = "rel";
74 public static final String REV_ATTRIBUTE = "rev";
75
76 public static final String ABOUT_ATTRIBUTE = "about";
77 public static final String RESOURCE_ATTRIBUTE = "resource";
78 public static final String SRC_ATTRIBUTE = "src";
79 public static final String HREF_ATTRIBUTE = "href";
80
81 public static final String TYPE_ATTRIBUTE = "type";
82 public static final String ATTRIBUTE_CSS = "text/css";
83
84 public static final String[] SUBJECT_ATTRIBUTES = { ABOUT_ATTRIBUTE, SRC_ATTRIBUTE, RESOURCE_ATTRIBUTE,
85 HREF_ATTRIBUTE };
86
87 public static final String PREFIX_ATTRIBUTE = "prefix";
88 public static final String TYPEOF_ATTRIBUTE = "typeof";
89 public static final String PROPERTY_ATTRIBUTE = "property";
90 public static final String DATATYPE_ATTRIBUTE = "datatype";
91 public static final String CONTENT_ATTRIBUTE = "content";
92 public static final String VOCAB_ATTRIBUTE = "vocab";
93
94 public static final String PROFILE_ATTRIBUTE = "profile";
95
96 public static final String XML_LITERAL_DATATYPE = "rdf:XMLLiteral";
97
98 public static final String XMLNS_DEFAULT = "http://www.w3.org/1999/xhtml";
99
100 private IssueReport issueReport;
101
102 private URL documentBase;
103
104 private final Stack<IRIMapping> IRIMappingStack = new Stack<>();
105
106 private final Stack<Vocabulary> vocabularyStack = new Stack<>();
107
108 private final List<IncompleteTriple> listOfIncompleteTriples = new ArrayList<>();
109
110 private final Stack<EvaluationContext> evaluationContextStack = new Stack<>();
111
112 public RDFa11Parser() {
113
114 }
115
116 protected static URL getDocumentBase(URL documentURL, Document document) throws MalformedURLException {
117 String base;
118 base = DomUtils.find(document, "/HTML/HEAD/BASE/@href");
119 if (!"".equals(base))
120 return new URL(base);
121 base = DomUtils.find(document, "//*/h:head/h:base[position()=1]/@href"); // XHTML documents.
122 if (!"".equals(base))
123 return new URL(base);
124 return documentURL;
125 }
126
127
128
129
130
131
132
133
134
135
136 protected static String[] extractPrefixSections(String prefixesDeclaration) {
137 final String[] parts = prefixesDeclaration.split("\\s");
138 final List<String> out = new ArrayList<>();
139 int i = 0;
140 while (i < parts.length) {
141 final String part = parts[i];
142 if (part.length() == 0) {
143 i++;
144 continue;
145 }
146 if (part.charAt(part.length() - 1) == IRI_PREFIX_SEPARATOR) {
147 i++;
148 while (i < parts.length && parts[i].length() == 0)
149 i++;
150 out.add(part + (i < parts.length ? parts[i] : ""));
151 i++;
152 } else {
153 out.add(parts[i]);
154 i++;
155 }
156 }
157 return out.toArray(new String[out.size()]);
158 }
159
160 protected static boolean isAbsoluteIRI(String iri) {
161 return iri.contains(IRI_SCHEMA_SEPARATOR);
162 }
163
164 protected static boolean isCURIE(String curie) {
165 if (curie == null) {
166 throw new NullPointerException("curie string cannot be null.");
167 }
168 if (curie.trim().length() == 0)
169 return false;
170
171
172 if (curie.charAt(0) != '[' || curie.charAt(curie.length() - 1) != ']')
173 return false;
174 int separatorIndex = curie.indexOf(CURIE_SEPARATOR);
175 return separatorIndex > 0 && curie.indexOf(CURIE_SEPARATOR, separatorIndex + 1) == -1;
176 }
177
178 protected static boolean isCURIEBNode(String curie) {
179 return isCURIE(curie) && curie.substring(1, curie.length() - 1).split(CURIE_SEPARATOR)[0].equals("_");
180 }
181
182 protected static boolean isRelativeNode(Node node) {
183 if (ATTRIBUTE_CSS.equals(DomUtils.readAttribute(node, TYPE_ATTRIBUTE)))
184 return false;
185 return DomUtils.hasAttribute(node, REL_ATTRIBUTE) || DomUtils.hasAttribute(node, REV_ATTRIBUTE);
186 }
187
188
189 protected static Literal getAsPlainLiteral(Node node, String currentLanguage) {
190 final String content = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null);
191 if (content != null)
192 return RDFUtils.literal(content, currentLanguage);
193
194 if (!node.hasChildNodes())
195 return RDFUtils.literal("", currentLanguage);
196
197 final String nodeTextContent = node.getTextContent();
198 return nodeTextContent == null ? null : RDFUtils.literal(nodeTextContent.trim(), currentLanguage);
199 }
200
201 protected static Literal getAsXMLLiteral(Node node) throws IOException, TransformerException {
202 final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null);
203 if (!XML_LITERAL_DATATYPE.equals(datatype))
204 return null;
205
206 final String xmlSerializedNode = DomUtils.serializeToXML(node, false);
207 return RDFUtils.literal(xmlSerializedNode, RDF.XMLLITERAL);
208 }
209
210 protected static boolean isXMLNSDeclared(Document document) {
211 final String attributeValue = document.getDocumentElement().getAttribute(XMLNS_ATTRIBUTE);
212 if (attributeValue.length() == 0)
213 return false;
214 return XMLNS_DEFAULT.equals(attributeValue);
215 }
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230 public void processDocument(URL documentURL, Document document, ExtractionResult extractionResult)
231 throws RDFa11ParserException {
232 try {
233 this.issueReport = extractionResult;
234
235
236 if (!isXMLNSDeclared(document)) {
237 reportError(document.getDocumentElement(),
238 String.format(Locale.ROOT,
239 "The default %s namespace is expected to be declared and equal to '%s' .",
240 XMLNS_ATTRIBUTE, XMLNS_DEFAULT));
241 }
242
243 try {
244 documentBase = getDocumentBase(documentURL, document);
245 } catch (MalformedURLException murle) {
246 throw new RDFa11ParserException("Invalid document base URL.", murle);
247 }
248
249
250 pushContext(document, new EvaluationContext(documentBase));
251
252 depthFirstNode(document, extractionResult);
253
254 assert listOfIncompleteTriples
255 .isEmpty() : "The list of incomplete triples is expected to be empty at the end of processing.";
256 } finally {
257 reset();
258 }
259 }
260
261
262
263
264 public void reset() {
265 issueReport = null;
266 documentBase = null;
267 IRIMappingStack.clear();
268 listOfIncompleteTriples.clear();
269 evaluationContextStack.clear();
270 }
271
272
273
274
275
276
277
278 protected void updateVocabulary(Node currentNode) {
279 final String vocabularyStr = DomUtils.readAttribute(currentNode, VOCAB_ATTRIBUTE, null);
280 if (vocabularyStr == null)
281 return;
282 try {
283 pushVocabulary(currentNode, RDFUtils.iri(vocabularyStr));
284 } catch (Exception e) {
285 reportError(currentNode,
286 String.format(Locale.ROOT, "Invalid vocabulary [%s], must be a IRI.", vocabularyStr));
287 }
288 }
289
290
291
292
293
294
295
296 protected void updateIRIMapping(Node node) {
297 final NamedNodeMap attributes = node.getAttributes();
298 if (null == attributes)
299 return;
300
301 Node attribute;
302 final List<PrefixMap> prefixMapList = new ArrayList<PrefixMap>();
303 final String namespacePrefix = XMLNS_ATTRIBUTE + IRI_PREFIX_SEPARATOR;
304 for (int a = 0; a < attributes.getLength(); a++) {
305 attribute = attributes.item(a);
306 if (attribute.getNodeName().startsWith(namespacePrefix)) {
307 prefixMapList.add(new PrefixMap(attribute.getNodeName().substring(namespacePrefix.length()),
308 resolveIRI(attribute.getNodeValue())));
309 }
310 }
311
312 extractPrefixes(node, prefixMapList);
313
314 if (prefixMapList.size() == 0)
315 return;
316 pushMappings(node, prefixMapList);
317 }
318
319
320
321
322
323
324
325
326
327 protected IRI getMapping(String prefix) {
328 for (IRIMapping IRIMapping : IRIMappingStack) {
329 final IRI mapping = IRIMapping.map.get(prefix);
330 if (mapping != null) {
331 return mapping;
332 }
333 }
334 return null;
335 }
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352 protected IRI[] resolveCIRIeOrIRIList(Node n, String curieOrIRIList, boolean termAllowed)
353 throws URISyntaxException {
354 if (curieOrIRIList == null || curieOrIRIList.trim().length() == 0)
355 return new IRI[0];
356
357 final String[] curieOrIRIListParts = curieOrIRIList.split("\\s");
358 final List<IRI> result = new ArrayList<>();
359 Resource curieOrIRI;
360 for (String curieORIRIListPart : curieOrIRIListParts) {
361 curieOrIRI = resolveCURIEOrIRI(curieORIRIListPart, termAllowed);
362 if (curieOrIRI != null && curieOrIRI instanceof IRI) {
363 result.add((IRI) curieOrIRI);
364 } else {
365 reportError(n, String.format(Locale.ROOT, "Invalid CURIE '%s' : expected IRI, found BNode.",
366 curieORIRIListPart));
367 }
368 }
369 return result.toArray(new IRI[result.size()]);
370 }
371
372
373
374
375
376
377
378
379
380 protected IRI resolveIRI(String iriStr) {
381 return isAbsoluteIRI(iriStr) ? RDFUtils.iri(iriStr) : RDFUtils.iri(this.documentBase.toExternalForm(), iriStr);
382 }
383
384
385
386
387
388
389
390
391
392
393
394 protected Resource resolveCURIEOrIRI(String curieOrIRI, boolean termAllowed) {
395 if (isCURIE(curieOrIRI)) {
396 return resolveNamespacedIRI(curieOrIRI.substring(1, curieOrIRI.length() - 1), ResolutionPolicy.NSRequired);
397 }
398 if (isAbsoluteIRI(curieOrIRI))
399 return resolveIRI(curieOrIRI);
400 return resolveNamespacedIRI(curieOrIRI,
401 termAllowed ? ResolutionPolicy.TermAllowed : ResolutionPolicy.NSNotRequired);
402 }
403
404
405
406
407
408
409
410 private void pushContext(Node current, EvaluationContext ec) {
411 ec.node = current;
412 evaluationContextStack.push(ec);
413 }
414
415
416
417
418 private EvaluationContext getContext() {
419 return evaluationContextStack.peek();
420 }
421
422
423
424
425
426
427
428 private void popContext(Node current) {
429 final Node peekNode = evaluationContextStack.peek().node;
430 if (DomUtils.isAncestorOf(peekNode, current)) {
431 evaluationContextStack.pop();
432 }
433 }
434
435
436
437
438
439
440
441
442
443 private void pushVocabulary(Node currentNode, IRI vocab) {
444 vocabularyStack.push(new Vocabulary(currentNode, vocab));
445 }
446
447
448
449
450 private IRI getVocabulary() {
451 if (vocabularyStack.isEmpty())
452 return null;
453 return vocabularyStack.peek().prefix;
454 }
455
456
457
458
459
460
461 private void popVocabulary(Node current) {
462 if (vocabularyStack.isEmpty())
463 return;
464 if (DomUtils.isAncestorOf(current, vocabularyStack.peek().originatingNode)) {
465 vocabularyStack.pop();
466 }
467 }
468
469
470
471
472
473
474 private void purgeIncompleteTriples(Node current) {
475 final List<IncompleteTriple> toBePurged = new ArrayList<>();
476 for (IncompleteTriple incompleteTriple : listOfIncompleteTriples) {
477 if (DomUtils.isAncestorOf(current, incompleteTriple.originatingNode, true)) {
478 toBePurged.add(incompleteTriple);
479 }
480 }
481 listOfIncompleteTriples.removeAll(toBePurged);
482 toBePurged.clear();
483 }
484
485
486
487
488
489
490
491
492
493 private void reportError(Node n, String msg) {
494 final String errorMsg = String.format(Locale.ROOT, "Error while processing node [%s] : '%s'",
495 DomUtils.getXPathForNode(n), msg);
496 final int[] errorLocation = DomUtils.getNodeLocation(n);
497 this.issueReport.notifyIssue(IssueReport.IssueLevel.WARNING, errorMsg,
498 errorLocation == null ? -1 : errorLocation[0], errorLocation == null ? -1 : errorLocation[1]);
499 }
500
501
502
503
504
505
506
507
508 private void depthFirstNode(Node node, ExtractionResult extractionResult) {
509 try {
510 processNode(node, extractionResult);
511 } catch (Exception e) {
512 if (logger.isDebugEnabled())
513 logger.debug("Error while processing node.", e);
514 reportError(node, e.getMessage());
515 }
516 depthFirstChildren(node.getChildNodes(), extractionResult);
517 purgeIncompleteTriples(node);
518 }
519
520
521
522
523
524
525
526 private void depthFirstChildren(NodeList nodeList, ExtractionResult extractionResult) {
527 for (int i = 0; i < nodeList.getLength(); i++) {
528 final Node child = nodeList.item(i);
529 depthFirstNode(child, extractionResult);
530 popMappings(child);
531 popVocabulary(child);
532 popContext(child);
533 }
534 }
535
536
537
538
539
540
541
542
543
544 private void writeTriple(Resource s, IRI p, Value o, ExtractionResult extractionResult) {
545 assert s != null : "subject is null.";
546 assert p != null : "predicate is null.";
547 assert o != null : "object is null.";
548 extractionResult.writeTriple(s, p, o);
549 }
550
551
552
553
554
555
556
557
558
559
560
561
562 private void processNode(Node currentElement, ExtractionResult extractionResult) throws Exception {
563 final EvaluationContext currentEvaluationContext = getContext();
564 try {
565 if (currentElement.getNodeType() != Node.DOCUMENT_NODE && currentElement.getNodeType() != Node.ELEMENT_NODE)
566 return;
567
568
569 updateVocabulary(currentElement);
570
571
572
573 updateIRIMapping(currentElement);
574
575
576 updateLanguage(currentElement, currentEvaluationContext);
577
578 if (!isRelativeNode(currentElement)) {
579
580 establishNewSubject(currentElement, currentEvaluationContext);
581 } else {
582
583 establishNewSubjectCurrentObjectResource(currentElement, currentEvaluationContext);
584 }
585
586
587
588
589
590
591 if (currentEvaluationContext.newSubject == null)
592 return;
593 if (logger.isDebugEnabled())
594 logger.debug("newSubject: " + currentEvaluationContext.newSubject);
595
596
597 final IRI[] types = getTypes(currentElement);
598 for (IRI type : types) {
599 writeTriple(currentEvaluationContext.newSubject, RDF.TYPE, type, extractionResult);
600 }
601
602
603 final IRI[] rels = getRels(currentElement);
604 final IRI[] revs = getRevs(currentElement);
605 if (currentEvaluationContext.currentObjectResource != null) {
606 for (IRI rel : rels) {
607 writeTriple(currentEvaluationContext.newSubject, rel,
608 currentEvaluationContext.currentObjectResource, extractionResult);
609 }
610 for (IRI rev : revs) {
611 writeTriple(currentEvaluationContext.currentObjectResource, rev,
612 currentEvaluationContext.newSubject, extractionResult);
613 }
614 } else {
615 for (IRI rel : rels) {
616 listOfIncompleteTriples.add(new IncompleteTriple(currentElement,
617 currentEvaluationContext.newSubject, rel, IncompleteTripleDirection.Forward));
618 }
619 for (IRI rev : revs) {
620 listOfIncompleteTriples.add(new IncompleteTriple(currentElement,
621 currentEvaluationContext.newSubject, rev, IncompleteTripleDirection.Reverse));
622 }
623 }
624
625
626 final Value currentObject = getCurrentObject(currentElement);
627 final IRI[] predicates = getPredicate(currentElement);
628 if (currentObject != null && predicates != null) {
629 for (IRI predicate : predicates) {
630 writeTriple(currentEvaluationContext.newSubject, predicate, currentObject, extractionResult);
631 }
632 }
633
634
635 if (!currentEvaluationContext.skipElem && currentEvaluationContext.newSubject != null) {
636 for (IncompleteTriple incompleteTriple : listOfIncompleteTriples) {
637 incompleteTriple.produceTriple(currentElement, currentEvaluationContext.newSubject,
638 extractionResult);
639 }
640 }
641 } catch (Exception e) {
642 throw e;
643 } finally {
644
645 if (currentEvaluationContext.recourse) {
646 EvaluationContext newEvaluationContext = new EvaluationContext(currentEvaluationContext.base);
647 if (currentEvaluationContext.skipElem) {
648 newEvaluationContext.language = currentEvaluationContext.language;
649 } else {
650 newEvaluationContext.base = currentEvaluationContext.base;
651
652 if (currentEvaluationContext.newSubject != null) {
653 newEvaluationContext.parentSubject = currentEvaluationContext.newSubject;
654 } else {
655 newEvaluationContext.parentSubject = currentEvaluationContext.parentSubject;
656 }
657
658 if (currentEvaluationContext.currentObjectResource != null) {
659 newEvaluationContext.parentObject = currentEvaluationContext.currentObjectResource;
660 } else if (currentEvaluationContext.newSubject != null) {
661 newEvaluationContext.parentObject = currentEvaluationContext.newSubject;
662 } else {
663 newEvaluationContext.parentObject = currentEvaluationContext.parentSubject;
664 }
665
666 newEvaluationContext.language = currentEvaluationContext.language;
667 }
668 pushContext(currentElement, newEvaluationContext);
669 }
670 }
671 }
672
673
674
675
676
677
678
679 private void extractPrefixes(Node node, List<PrefixMap> prefixMapList) {
680 final String prefixAttribute = DomUtils.readAttribute(node, PREFIX_ATTRIBUTE, null);
681 if (prefixAttribute == null)
682 return;
683 final String[] prefixParts = extractPrefixSections(prefixAttribute);
684 for (String prefixPart : prefixParts) {
685 int splitPoint = prefixPart.indexOf(IRI_PREFIX_SEPARATOR);
686 final String prefix = prefixPart.substring(0, splitPoint);
687 if (prefix.length() == 0) {
688 reportError(node,
689 String.format(Locale.ROOT, "Invalid prefix length in prefix attribute '%s'", prefixAttribute));
690 continue;
691 }
692 final IRI iri;
693 final String iriStr = prefixPart.substring(splitPoint + 1);
694 try {
695 iri = resolveIRI(iriStr);
696 } catch (Exception e) {
697 reportError(node, String.format(Locale.ROOT, "Resolution of prefix '%s' defines an invalid IRI: '%s'",
698 prefixAttribute, iriStr));
699 continue;
700 }
701 prefixMapList.add(new PrefixMap(prefix, iri));
702 }
703 }
704
705
706
707
708
709
710
711 private void updateLanguage(Node node, EvaluationContext currentEvaluationContext) {
712 final String candidateLanguage = DomUtils.readAttribute(node, XML_LANG_ATTRIBUTE, null);
713 if (candidateLanguage != null)
714 currentEvaluationContext.language = candidateLanguage;
715 }
716
717
718
719
720
721
722
723
724
725
726 private void establishNewSubject(Node node, EvaluationContext currentEvaluationContext) throws URISyntaxException {
727 String candidateIRIOrCURIE;
728 for (String subjectAttribute : SUBJECT_ATTRIBUTES) {
729 candidateIRIOrCURIE = DomUtils.readAttribute(node, subjectAttribute, null);
730 if (candidateIRIOrCURIE != null) {
731 currentEvaluationContext.newSubject = resolveCURIEOrIRI(candidateIRIOrCURIE, false);
732 return;
733 }
734 }
735
736 if (node.getNodeName().equalsIgnoreCase(HEAD_TAG) || node.getNodeName().equalsIgnoreCase(BODY_TAG)) {
737 currentEvaluationContext.newSubject = resolveIRI(currentEvaluationContext.base.toString());
738 return;
739 }
740
741 if (DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) {
742 currentEvaluationContext.newSubject = RDFUtils.bnode();
743 return;
744 }
745
746 if (DomUtils.hasAttribute(node, PROPERTY_ATTRIBUTE)) {
747 currentEvaluationContext.skipElem = true;
748 }
749 if (currentEvaluationContext.parentObject != null) {
750 currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject;
751 return;
752 }
753
754 currentEvaluationContext.newSubject = null;
755 }
756
757
758
759
760
761
762
763
764
765
766
767 private void establishNewSubjectCurrentObjectResource(Node node, EvaluationContext currentEvaluationContext)
768 throws URISyntaxException {
769
770 String candidateIRIOrCURIE;
771 candidateIRIOrCURIE = DomUtils.readAttribute(node, ABOUT_ATTRIBUTE, null);
772 if (candidateIRIOrCURIE != null) {
773 currentEvaluationContext.newSubject = resolveCURIEOrIRI(candidateIRIOrCURIE, false);
774 } else {
775 candidateIRIOrCURIE = DomUtils.readAttribute(node, SRC_ATTRIBUTE, null);
776 if (candidateIRIOrCURIE != null) {
777 currentEvaluationContext.newSubject = resolveIRI(candidateIRIOrCURIE);
778 } else {
779 if (node.getNodeName().equalsIgnoreCase(HEAD_TAG) || node.getNodeName().equalsIgnoreCase(BODY_TAG)) {
780 currentEvaluationContext.newSubject = resolveIRI(currentEvaluationContext.base.toString());
781 } else {
782 if (DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) {
783 currentEvaluationContext.newSubject = RDFUtils.bnode();
784 } else {
785 if (currentEvaluationContext.parentObject != null) {
786 currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject;
787 }
788 }
789 }
790 }
791 }
792
793
794 candidateIRIOrCURIE = DomUtils.readAttribute(node, RESOURCE_ATTRIBUTE, null);
795 if (candidateIRIOrCURIE != null) {
796 currentEvaluationContext.currentObjectResource = resolveCURIEOrIRI(candidateIRIOrCURIE, false);
797 return;
798 }
799
800 candidateIRIOrCURIE = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null);
801 if (candidateIRIOrCURIE != null) {
802 currentEvaluationContext.currentObjectResource = resolveIRI(candidateIRIOrCURIE);
803 return;
804 }
805 currentEvaluationContext.currentObjectResource = null;
806 }
807
808 private IRI[] getTypes(Node node) throws URISyntaxException {
809 final String typeOf = DomUtils.readAttribute(node, TYPEOF_ATTRIBUTE, null);
810 return resolveCIRIeOrIRIList(node, typeOf, true);
811 }
812
813 private IRI[] getRels(Node node) throws URISyntaxException {
814 final String rel = DomUtils.readAttribute(node, REL_ATTRIBUTE, null);
815 return resolveCIRIeOrIRIList(node, rel, true);
816 }
817
818 private IRI[] getRevs(Node node) throws URISyntaxException {
819 final String rev = DomUtils.readAttribute(node, REV_ATTRIBUTE, null);
820 return resolveCIRIeOrIRIList(node, rev, true);
821 }
822
823 private IRI[] getPredicate(Node node) throws URISyntaxException {
824 final String candidateIRI = DomUtils.readAttribute(node, PROPERTY_ATTRIBUTE, null);
825 if (candidateIRI == null)
826 return null;
827 return resolveCIRIeOrIRIList(node, candidateIRI, true);
828 }
829
830
831
832
833
834
835
836
837
838
839
840
841
842 private Value getCurrentObject(Node node) throws URISyntaxException, IOException, TransformerException {
843 final String candidateObject = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null);
844 if (candidateObject != null) {
845 return resolveIRI(candidateObject);
846 } else {
847 return gerCurrentObjectLiteral(node);
848 }
849 }
850
851 private Literal gerCurrentObjectLiteral(Node node) throws URISyntaxException, IOException, TransformerException {
852 final EvaluationContext currentEvaluationContext = getContext();
853 Literal literal;
854
855 literal = getAsTypedLiteral(node);
856 if (literal != null)
857 return literal;
858
859 literal = getAsXMLLiteral(node);
860 if (literal != null) {
861 currentEvaluationContext.recourse = false;
862 return literal;
863 }
864
865 literal = getAsPlainLiteral(node, currentEvaluationContext.language);
866 if (literal != null)
867 return literal;
868
869 return null;
870 }
871
872 private static String getNodeContent(Node node) {
873 final String candidateContent = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null);
874 if (candidateContent != null)
875 return candidateContent;
876 return node.getTextContent();
877 }
878
879
880
881
882
883
884
885
886
887
888 private Literal getAsTypedLiteral(Node node) throws URISyntaxException {
889 final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null);
890 if (datatype == null || datatype.trim().length() == 0 || XML_LITERAL_DATATYPE.equals(datatype.trim())) {
891 return null;
892 }
893 final Resource curieOrIRI = resolveCURIEOrIRI(datatype, true);
894 return RDFUtils.literal(getNodeContent(node), curieOrIRI instanceof IRI ? (IRI) curieOrIRI : null);
895 }
896
897 private void pushMappings(Node sourceNode, List<PrefixMap> prefixMapList) {
898
899 final Map<String, IRI> mapping = new HashMap<>();
900 for (PrefixMap prefixMap : prefixMapList) {
901 mapping.put(prefixMap.prefix, prefixMap.IRI);
902 }
903 IRIMappingStack.push(new IRIMapping(sourceNode, mapping));
904 }
905
906 private void popMappings(Node node) {
907 if (IRIMappingStack.isEmpty())
908 return;
909 final IRIMapping peek = IRIMappingStack.peek();
910 if (!DomUtils.isAncestorOf(peek.sourceNode, node)) {
911 IRIMappingStack.pop();
912 }
913 }
914
915
916
917
918
919
920
921
922
923
924 private Resource resolveNamespacedIRI(String mapping, ResolutionPolicy resolutionPolicy) {
925 if (mapping.indexOf(IRI_PATH_SEPARATOR) == 0) {
926 mapping = mapping.substring(1);
927 }
928
929 final int prefixSeparatorIndex = mapping.indexOf(':');
930 if (prefixSeparatorIndex == -1) {
931 if (resolutionPolicy == ResolutionPolicy.NSRequired) {
932 throw new IllegalArgumentException(
933 String.format(Locale.ROOT, "Invalid mapping string [%s], must declare a prefix.", mapping));
934 }
935 if (resolutionPolicy == ResolutionPolicy.TermAllowed) {
936 final IRI currentVocabulary = getVocabulary();
937
938 if (currentVocabulary != null) {
939 return resolveIRI(currentVocabulary.toString() + mapping);
940 }
941 }
942 return resolveIRI(documentBase.toString() + mapping);
943 }
944
945 final String prefix = mapping.substring(0, prefixSeparatorIndex);
946 final IRI curieMapping = getMapping(prefix);
947 if (curieMapping == null) {
948 throw new IllegalArgumentException(String.format(Locale.ROOT, "Cannot map prefix '%s'", prefix));
949 }
950 final String candidateCURIEStr = curieMapping.toString() + mapping.substring(prefixSeparatorIndex + 1);
951 final java.net.URI candidateCURIE;
952 try {
953 candidateCURIE = new java.net.URI(candidateCURIEStr);
954 } catch (URISyntaxException IRIse) {
955 throw new IllegalArgumentException(String.format(Locale.ROOT, "Invalid CURIE '%s'", candidateCURIEStr));
956 }
957 return resolveIRI(candidateCURIE.isAbsolute() ? candidateCURIE.toString()
958 : documentBase.toString() + candidateCURIE.toString());
959 }
960
961
962
963
/** How {@code resolveNamespacedIRI} treats a string that carries no prefix. */
enum ResolutionPolicy {
    /** A prefix-less string is resolved against the document base. */
    NSNotRequired,
    /** A prefix-less string is rejected with an {@code IllegalArgumentException}. */
    NSRequired,
    /** A prefix-less string is resolved against the current vocabulary when one is declared. */
    TermAllowed
}
967
968
969
970
971 private class EvaluationContext {
972 private Node node;
973 private URL base;
974 private Resource parentSubject;
975 private Value parentObject;
976 private String language;
977 private boolean recourse;
978 private boolean skipElem;
979 private Resource newSubject;
980 private Resource currentObjectResource;
981
982
983
984
985
986
987 EvaluationContext(URL base) {
988 this.base = base;
989 this.parentSubject = resolveIRI(base.toExternalForm());
990 this.parentObject = null;
991 this.language = null;
992 this.recourse = true;
993 this.skipElem = false;
994 this.newSubject = null;
995 this.currentObjectResource = null;
996 }
997 }
998
999
1000
1001
1002 private static class PrefixMap {
1003 final String prefix;
1004 final IRI IRI;
1005
1006 public PrefixMap(String prefix, IRI IRI) {
1007 this.prefix = prefix;
1008 this.IRI = IRI;
1009 }
1010 }
1011
1012
1013
1014
1015 private static class IRIMapping {
1016 final Node sourceNode;
1017 final Map<String, IRI> map;
1018
1019 public IRIMapping(Node sourceNode, Map<String, IRI> map) {
1020 this.sourceNode = sourceNode;
1021 this.map = map;
1022 }
1023 }
1024
1025
1026
1027
1028 private enum IncompleteTripleDirection {
1029 Forward, Reverse
1030 }
1031
1032
1033
1034
1035 private static class IncompleteTriple {
1036 final Node originatingNode;
1037 final Resource subject;
1038 final IRI predicate;
1039 final IncompleteTripleDirection direction;
1040
1041 public IncompleteTriple(Node originatingNode, Resource subject, IRI predicate,
1042 IncompleteTripleDirection direction) {
1043 if (originatingNode == null || subject == null || predicate == null || direction == null)
1044 throw new IllegalArgumentException();
1045
1046 this.originatingNode = originatingNode;
1047 this.subject = subject;
1048 this.predicate = predicate;
1049 this.direction = direction;
1050 }
1051
1052 public boolean produceTriple(Node resourceNode, Resource r, ExtractionResult extractionResult) {
1053 if (!DomUtils.isAncestorOf(originatingNode, resourceNode, true))
1054 return false;
1055
1056 if (r == null)
1057 throw new IllegalArgumentException();
1058 switch (direction) {
1059 case Forward:
1060 extractionResult.writeTriple(subject, predicate, r);
1061 break;
1062 case Reverse:
1063 extractionResult.writeTriple(r, predicate, subject);
1064 break;
1065 default:
1066 throw new IllegalStateException();
1067 }
1068 return true;
1069 }
1070 }
1071
1072
1073
1074
1075 private static class Vocabulary {
1076 final Node originatingNode;
1077 final IRI prefix;
1078
1079 public Vocabulary(Node originatingNode, IRI prefix) {
1080 this.originatingNode = originatingNode;
1081 this.prefix = prefix;
1082 }
1083 }
1084
1085 }