1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.any23.extractor.microdata;
18
19 import org.apache.any23.extractor.html.DomUtils;
20 import org.apache.any23.rdf.RDFUtils;
21 import org.apache.commons.lang3.StringUtils;
22 import org.eclipse.rdf4j.model.IRI;
23 import org.eclipse.rdf4j.model.Literal;
24 import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil;
25 import org.eclipse.rdf4j.model.vocabulary.XSD;
26 import org.jsoup.parser.Tag;
27 import org.w3c.dom.Document;
28 import org.w3c.dom.Element;
29 import org.w3c.dom.NamedNodeMap;
30 import org.w3c.dom.Node;
31 import org.w3c.dom.NodeList;
32 import org.w3c.dom.traversal.DocumentTraversal;
33 import org.w3c.dom.traversal.NodeFilter;
34 import org.w3c.dom.traversal.TreeWalker;
35
36 import java.io.PrintStream;
37 import java.util.ArrayList;
38 import java.util.Arrays;
39 import java.util.Collections;
40 import java.util.HashMap;
41 import java.util.HashSet;
42 import java.util.LinkedHashSet;
43 import java.util.List;
44 import java.util.Locale;
45 import java.util.Map;
46 import java.util.Set;
47 import java.util.stream.Collectors;
48
49
50
51
52
53
54
55 public class MicrodataParser {
56
/**
 * Strategies the parser can follow when it meets an error.
 */
enum ErrorMode {
    /** The first reported error is thrown to the caller. */
    STOP_AT_FIRST_ERROR,
    /** Reported errors are collected and parsing goes on. */
    FULL_REPORT
}
63
64 private final Document document;
65
66
67
68
69
70
71
72 private final Set<String> loopDetectorSet = new HashSet<>();
73
74
75
76
77 private final Map<Node, ItemScope> itemScopes = new HashMap<>();
78
79
80
81
82 private final Map<Node, ItemPropValue> itemPropValues = new HashMap<>();
83
84
85
86
87
88 private int dereferenceRecursionCounter = 0;
89
90
91
92
93 private ErrorMode errorMode = ErrorMode.FULL_REPORT;
94
95
96
97
98 private final List<MicrodataParserException> errors = new ArrayList<>();
99
100 public static final String ITEMSCOPE_ATTRIBUTE = "itemscope";
101 public static final String ITEMPROP_ATTRIBUTE = "itemprop";
102 private static final String REVERSE_ITEMPROP_ATTRIBUTE = "itemprop-reverse";
103
104
105
106
107 public static final Set<String> SRC_TAGS = Collections.unmodifiableSet(new HashSet<String>(Arrays.asList("audio",
108 "embed", "frame", "iframe", "img", "source", "track", "video", "input", "layer", "script", "textarea")));
109
110
111
112
113 public static final Set<String> HREF_TAGS = Collections
114 .unmodifiableSet(new HashSet<String>(Arrays.asList("a", "area", "link")));
115
/**
 * Creates a parser bound to the given document.
 *
 * @param document
 *            the document to parse, must not be <code>null</code>.
 *
 * @throws NullPointerException
 *             if <code>document</code> is <code>null</code>.
 */
public MicrodataParser(Document document) {
    this.document = java.util.Objects.requireNonNull(document, "Document cannot be null.");
}
122
123
124
125
126
127
128
129
130
131 public static List<Node> getItemScopeNodes(Node node) {
132 return DomUtils.findAllByAttributeName(node, ITEMSCOPE_ATTRIBUTE);
133 }
134
135
136
137
138
139
140
141
142
143 public static boolean isItemScope(Node node) {
144 return DomUtils.readAttribute(node, ITEMSCOPE_ATTRIBUTE, null) != null;
145 }
146
147
148
149
150
151
152
153
154
155 public static List<Node> getItemPropNodes(Node node) {
156 return DomUtils.findAllByAttributeName(node, ITEMPROP_ATTRIBUTE);
157 }
158
159
160
161
162
163
164
165
166
167 public static boolean isItemProp(Node node) {
168 return DomUtils.readAttribute(node, ITEMPROP_ATTRIBUTE, null) != null;
169 }
170
171 private static boolean isContainedInItemScope(Node node) {
172 for (Node p = node.getParentNode(); p != null; p = p.getParentNode()) {
173 NamedNodeMap attrs = p.getAttributes();
174 if (attrs != null && attrs.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) {
175 return true;
176 }
177 }
178 return false;
179 }
180
181 private static boolean isContainedInId(Node node, Set<String> ids) {
182 do {
183 String id = DomUtils.readAttribute(node, "id", null);
184 if (id != null && ids.contains(id)) {
185 return true;
186 }
187 node = node.getParentNode();
188 } while (node != null);
189 return false;
190 }
191
192
193
194
195
196
197
198
199
200 public static List<Node> getTopLevelItemScopeNodes(Node node) {
201 final List<Node> itemScopes = getItemScopeNodes(node);
202 final List<Node> topLevelItemScopes = new ArrayList<>();
203 final List<Node> possibles = new ArrayList<>();
204 for (Node itemScope : itemScopes) {
205 if (!isItemProp(itemScope) && DomUtils.readAttribute(itemScope, REVERSE_ITEMPROP_ATTRIBUTE, null) == null) {
206 topLevelItemScopes.add(itemScope);
207 } else if (!isContainedInItemScope(itemScope)) {
208 possibles.add(itemScope);
209 }
210 }
211
212 if (!possibles.isEmpty()) {
213 Set<String> refIds = itemScopes.stream().flatMap(n -> Arrays.stream(itemrefIds(n)))
214 .collect(Collectors.toSet());
215
216 for (Node itemScope : possibles) {
217 if (!isContainedInId(itemScope, refIds)) {
218 topLevelItemScopes.add(itemScope);
219 }
220 }
221 }
222
223 return topLevelItemScopes;
224 }
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241 public static MicrodataParserReport getMicrodata(Document document, ErrorMode errorMode)
242 throws MicrodataParserException {
243 final List<Node> itemNodes = getTopLevelItemScopeNodes(document);
244 final List<ItemScope> items = new ArrayList<>();
245 final MicrodataParser microdataParser = new MicrodataParser(document);
246 microdataParser.setErrorMode(errorMode);
247 for (Node itemNode : itemNodes) {
248 items.add(microdataParser.getItemScope(itemNode));
249 }
250 return new MicrodataParserReport(items.toArray(new ItemScope[items.size()]), microdataParser.getErrors());
251 }
252
253
254
255
256
257
258
259
260
261
262 public static MicrodataParserReport getMicrodata(Document document) {
263 try {
264 return getMicrodata(document, ErrorMode.FULL_REPORT);
265 } catch (MicrodataParserException mpe) {
266 throw new IllegalStateException("Unexpected exception.", mpe);
267 }
268 }
269
270
271
272
273
274
275
276
277
278
279 public static void getMicrodataAsJSON(Document document, PrintStream ps) {
280 final MicrodataParserReport report = getMicrodata(document);
281 final ItemScope[] itemScopes = report.getDetectedItemScopes();
282 final MicrodataParserException[] errors = report.getErrors();
283
284 ps.append("{ ");
285
286
287 ps.append("\"result\" : [");
288 for (int i = 0; i < itemScopes.length; i++) {
289 if (i > 0) {
290 ps.print(", ");
291 }
292 ps.print(itemScopes[i].toJSON());
293 }
294 ps.append("] ");
295
296
297 if (errors != null && errors.length > 0) {
298 ps.append(", ");
299 ps.append("\"errors\" : [");
300 for (int i = 0; i < errors.length; i++) {
301 if (i > 0) {
302 ps.print(", ");
303 }
304 ps.print(errors[i].toJSON());
305 }
306 ps.append("] ");
307 }
308
309 ps.append("}");
310 }
311
312 public void setErrorMode(ErrorMode errorMode) {
313 if (errorMode == null)
314 throw new IllegalArgumentException("errorMode must be not null.");
315 this.errorMode = errorMode;
316 }
317
318 public ErrorMode getErrorMode() {
319 return this.errorMode;
320 }
321
322 public MicrodataParserException[] getErrors() {
323 return errors == null ? new MicrodataParserException[0]
324 : errors.toArray(new MicrodataParserException[errors.size()]);
325 }
326
327
328
329
330
331
332
333
334
335
336
337
338 public ItemPropValue getPropertyValue(Node node) throws MicrodataParserException {
339 final ItemPropValue itemPropValue = itemPropValues.get(node);
340 if (itemPropValue != null)
341 return itemPropValue;
342
343 if (isItemScope(node)) {
344 return new ItemPropValue(getItemScope(node), ItemPropValue.Type.Nested);
345 }
346
347 final String nodeName = node.getNodeName().toLowerCase(Locale.ROOT);
348
349
350 if ("data".equals(nodeName) || "meter".equals(nodeName)) {
351 String value = value(node, "value");
352 Literal l;
353 if (XMLDatatypeUtil.isValidInteger(value)) {
354 l = RDFUtils.literal(value, XSD.INTEGER);
355 } else if (XMLDatatypeUtil.isValidDouble(value)) {
356 l = RDFUtils.literal(value, XSD.DOUBLE);
357 } else {
358 l = RDFUtils.literal(value);
359 }
360 return new ItemPropValue(l);
361 }
362 if ("time".equals(nodeName)) {
363 String dateTimeStr = value(node, "datetime");
364 Literal l;
365 if (XMLDatatypeUtil.isValidDate(dateTimeStr)) {
366 l = RDFUtils.literal(dateTimeStr, XSD.DATE);
367 } else if (XMLDatatypeUtil.isValidTime(dateTimeStr)) {
368 l = RDFUtils.literal(dateTimeStr, XSD.TIME);
369 } else if (XMLDatatypeUtil.isValidDateTime(dateTimeStr)) {
370 l = RDFUtils.literal(dateTimeStr, XSD.DATETIME);
371 } else if (XMLDatatypeUtil.isValidGYearMonth(dateTimeStr)) {
372 l = RDFUtils.literal(dateTimeStr, XSD.GYEARMONTH);
373 } else if (XMLDatatypeUtil.isValidGYear(dateTimeStr)) {
374 l = RDFUtils.literal(dateTimeStr, XSD.GYEAR);
375 } else if (XMLDatatypeUtil.isValidDuration(dateTimeStr)) {
376 l = RDFUtils.literal(dateTimeStr, XSD.DURATION);
377 } else {
378 l = RDFUtils.literal(dateTimeStr, getLanguage(node));
379 }
380 return new ItemPropValue(l);
381 }
382
383 if (SRC_TAGS.contains(nodeName)) {
384 return link(node, "src");
385 }
386 if (HREF_TAGS.contains(nodeName)) {
387 return link(node, "href");
388 }
389
390 if ("object".equals(nodeName)) {
391 return link(node, "data");
392 }
393
394 String val = DomUtils.readAttribute(node, "content", null);
395 if (val != null) {
396 return new ItemPropValue(RDFUtils.literal(val, getLanguage(node)));
397 }
398
399 Literal l = RDFUtils.literal(textContent(node), getLanguage(node));
400 final ItemPropValue newItemPropValue = new ItemPropValue(l);
401 itemPropValues.put(node, newItemPropValue);
402 return newItemPropValue;
403 }
404
405 private static String textContent(Node node) {
406 StringBuilder content = new StringBuilder();
407 appendFormatted(node, content, false);
408 return content.toString();
409 }
410
411 private static boolean shouldSeparateWithNewline(CharSequence s0, CharSequence s1) {
412 for (int i = 0, len = s1.length(); i < len; i++) {
413 char ch = s1.charAt(i);
414 if (ch == '\n' || ch == '\r') {
415 return false;
416 }
417 if (!Character.isWhitespace(ch)) {
418 break;
419 }
420 }
421 for (int i = s0.length() - 1; i >= 0; i--) {
422 char ch = s0.charAt(i);
423 if (ch == '\n' || ch == '\r') {
424 return false;
425 }
426 if (!Character.isWhitespace(ch)) {
427 return true;
428 }
429 }
430 return false;
431 }
432
433 private static boolean appendFormatted(Node node, StringBuilder sb, boolean needsNewline) {
434 switch (node.getNodeType()) {
435 case Node.TEXT_NODE:
436 String text = node.getTextContent();
437 if (text.isEmpty()) {
438 return needsNewline;
439 }
440 if (needsNewline && shouldSeparateWithNewline(sb, text)) {
441 sb.append('\n');
442 }
443 sb.append(text);
444 return false;
445 case Node.ELEMENT_NODE:
446 final String nodeName = node.getNodeName().toLowerCase(Locale.ENGLISH);
447 final boolean thisNeedsNewline = "br".equals(nodeName) || Tag.valueOf(nodeName).isBlock();
448 final NodeList children = node.getChildNodes();
449 boolean prevChildNeedsNewline = needsNewline || thisNeedsNewline;
450 for (int i = 0, len = children.getLength(); i < len; i++) {
451 prevChildNeedsNewline = appendFormatted(children.item(i), sb, prevChildNeedsNewline);
452 }
453 return prevChildNeedsNewline || thisNeedsNewline;
454 default:
455 return needsNewline;
456 }
457 }
458
459 private static String content(Node node, String attrName) {
460 NamedNodeMap attributes = node.getAttributes();
461 if (attributes != null) {
462 Node attr = attributes.getNamedItem("content");
463 if (attr != null) {
464 return attr.getNodeValue();
465 }
466 attr = attributes.getNamedItem(attrName);
467 if (attr != null) {
468 return attr.getNodeValue();
469 }
470 }
471 return null;
472 }
473
474 private static String value(Node node, String attrName) {
475 String content = content(node, attrName);
476 return StringUtils.stripToEmpty(content != null ? content : node.getTextContent());
477 }
478
479 private static ItemPropValue link(Node node, String attrName) {
480 String content = content(node, attrName);
481 return content == null ? new ItemPropValue(RDFUtils.literal(""))
482 : new ItemPropValue(content, ItemPropValue.Type.Link);
483 }
484
485
486 private static String getLanguage(Node node) {
487 String lang;
488 do {
489 lang = DomUtils.readAttribute(node, "xml:lang", null);
490 if (StringUtils.isNotBlank(lang)) {
491 return lang.trim();
492 }
493 lang = DomUtils.readAttribute(node, "lang", null);
494 if (StringUtils.isNotBlank(lang)) {
495 return lang.trim();
496 }
497 node = node.getParentNode();
498 } while (node != null);
499 return null;
500 }
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516 public List<ItemProp> getItemProps(final Node scopeNode, boolean skipRoot) throws MicrodataParserException {
517 final Set<Node> accepted = new LinkedHashSet<>();
518
519 boolean skipRootChildren = false;
520 if (!skipRoot) {
521 NamedNodeMap attributes = scopeNode.getAttributes();
522 if (attributes.getNamedItem(ITEMPROP_ATTRIBUTE) != null
523 || attributes.getNamedItem(REVERSE_ITEMPROP_ATTRIBUTE) != null) {
524 accepted.add(scopeNode);
525 }
526 if (attributes.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) {
527 skipRootChildren = true;
528 }
529 }
530
531 if (!skipRootChildren) {
532
533 TreeWalker treeWalker = ((DocumentTraversal) scopeNode.getOwnerDocument()).createTreeWalker(scopeNode,
534 NodeFilter.SHOW_ELEMENT, new NodeFilter() {
535 @Override
536 public short acceptNode(Node node) {
537 if (node.getNodeType() == Node.ELEMENT_NODE) {
538 NamedNodeMap attributes = node.getAttributes();
539 if ((attributes.getNamedItem(ITEMPROP_ATTRIBUTE) != null
540 || attributes.getNamedItem(REVERSE_ITEMPROP_ATTRIBUTE) != null)
541 && scopeNode != node) {
542 accepted.add(node);
543 }
544
545 if (attributes.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) {
546
547 return FILTER_REJECT;
548 }
549 }
550 return FILTER_ACCEPT;
551 }
552 }, false);
553
554
555 while (treeWalker.nextNode() != null)
556 ;
557 }
558
559 final List<ItemProp> result = new ArrayList<>();
560 for (Node itemPropNode : accepted) {
561 final String itemProp = DomUtils.readAttribute(itemPropNode, ITEMPROP_ATTRIBUTE, null);
562 final String reverseProp = DomUtils.readAttribute(itemPropNode, REVERSE_ITEMPROP_ATTRIBUTE, null);
563
564 boolean hasItemProp = StringUtils.isNotBlank(itemProp);
565 boolean hasReverseProp = StringUtils.isNotBlank(reverseProp);
566
567 if (!hasItemProp && !hasReverseProp) {
568 manageError(new MicrodataParserException("invalid property name '" + itemProp + "'", itemPropNode));
569 continue;
570 }
571
572 ItemPropValue itemPropValue;
573 try {
574 itemPropValue = getPropertyValue(itemPropNode);
575 } catch (MicrodataParserException mpe) {
576 manageError(mpe);
577 continue;
578 }
579 if (hasItemProp) {
580 for (String propertyName : itemProp.trim().split("\\s+")) {
581 result.add(
582 new ItemProp(DomUtils.getXPathForNode(itemPropNode), propertyName, itemPropValue, false));
583 }
584 }
585 if (hasReverseProp) {
586 if (itemPropValue.literal != null) {
587 manageError(new MicrodataParserException(REVERSE_ITEMPROP_ATTRIBUTE + " cannot point to a literal",
588 itemPropNode));
589 continue;
590 }
591 for (String propertyName : reverseProp.trim().split("\\s+")) {
592 result.add(new ItemProp(DomUtils.getXPathForNode(itemPropNode), propertyName, itemPropValue, true));
593 }
594 }
595 }
596 return result;
597 }
598
599
600
601
602
603
604
605
606
607
608
609
610 public ItemProp[] deferProperties(String... refs) throws MicrodataParserException {
611 Document document = this.document;
612 dereferenceRecursionCounter++;
613 final List<ItemProp> result = new ArrayList<>();
614 try {
615 for (String ref : refs) {
616 if (loopDetectorSet.contains(ref)) {
617 throw new MicrodataParserException(String.format(Locale.ROOT,
618 "Loop detected with depth %d while dereferencing itemProp '%s' .",
619 dereferenceRecursionCounter - 1, ref), null);
620 }
621 loopDetectorSet.add(ref);
622 Element element = document.getElementById(ref);
623 if (element == null) {
624 manageError(new MicrodataParserException(
625 String.format(Locale.ROOT, "Unknown itemProp id '%s'", ref), null));
626 continue;
627 }
628 result.addAll(getItemProps(element, false));
629 }
630 } catch (MicrodataParserException mpe) {
631 if (dereferenceRecursionCounter == 1)
632 manageError(mpe);
633 else
634 throw mpe;
635 } finally {
636 dereferenceRecursionCounter--;
637 if (dereferenceRecursionCounter == 0) {
638 loopDetectorSet.clear();
639 }
640 }
641 return result.toArray(new ItemProp[result.size()]);
642 }
643
644 private static final String[] EMPTY_STRINGS = new String[0];
645
646 private static String[] itemrefIds(Node node) {
647 String itemref = DomUtils.readAttribute(node, "itemref", null);
648 return StringUtils.isBlank(itemref) ? EMPTY_STRINGS : itemref.trim().split("\\s+");
649 }
650
651
652
653
654
655
656
657
658
659
660
661
662 public ItemScope getItemScope(Node node) throws MicrodataParserException {
663 final ItemScope itemScope = itemScopes.get(node);
664 if (itemScope != null)
665 return itemScope;
666
667 final String id = DomUtils.readAttribute(node, "id", null);
668 final String itemType = DomUtils.readAttribute(node, "itemtype", null);
669 final String itemId = DomUtils.readAttribute(node, "itemid", null);
670
671 final List<ItemProp> itemProps = getItemProps(node, true);
672 final String[] itemrefIDs = itemrefIds(node);
673 final ItemProp[] deferredProperties;
674 try {
675 deferredProperties = deferProperties(itemrefIDs);
676 } catch (MicrodataParserException mpe) {
677 mpe.setErrorNode(node);
678 throw mpe;
679 }
680 for (ItemProp deferredProperty : deferredProperties) {
681 if (itemProps.contains(deferredProperty)) {
682 manageError(new MicrodataParserException(
683 String.format(Locale.ROOT, "Duplicated deferred itemProp '%s'.", deferredProperty.getName()),
684 node));
685 continue;
686 }
687 itemProps.add(deferredProperty);
688 }
689
690 List<IRI> types;
691 if (itemType == null) {
692 types = Collections.emptyList();
693 } else {
694 types = new ArrayList<>();
695 boolean canConcatWithPrev = false;
696 for (String s : itemType.trim().split("\\s+")) {
697 try {
698 canConcatWithPrev = types.addAll(ItemScope.stringToSingletonIRI(s));
699 } catch (RuntimeException e) {
700 if (canConcatWithPrev) {
701 int lastInd = types.size() - 1;
702 try {
703 List<IRI> secondTry = ItemScope
704 .stringToSingletonIRI(types.get(lastInd).stringValue() + " " + s);
705 types.remove(lastInd);
706 canConcatWithPrev = types.addAll(secondTry);
707 } catch (RuntimeException e2) {
708 manageError(new MicrodataParserException(e.getMessage(), node));
709 canConcatWithPrev = false;
710 }
711 } else {
712 manageError(new MicrodataParserException(e.getMessage(), node));
713 }
714 }
715 }
716 }
717
718 final ItemScope newItemScope = new ItemScope(DomUtils.getXPathForNode(node),
719 itemProps.toArray(new ItemProp[itemProps.size()]), id, itemrefIDs, types, itemId);
720 itemScopes.put(node, newItemScope);
721 return newItemScope;
722 }
723
724 private void manageError(MicrodataParserException mpe) throws MicrodataParserException {
725 switch (errorMode) {
726 case FULL_REPORT:
727 errors.add(mpe);
728 break;
729 case STOP_AT_FIRST_ERROR:
730 throw mpe;
731 default:
732 throw new IllegalStateException("Unsupported mode " + errorMode);
733 }
734 }
735
736 }