1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.w3c.dom.Document;
21 import org.w3c.dom.NamedNodeMap;
22 import org.w3c.dom.Node;
23 import org.w3c.dom.NodeList;
24 import org.w3c.dom.traversal.DocumentTraversal;
25 import org.w3c.dom.traversal.NodeFilter;
26 import org.w3c.dom.traversal.NodeIterator;
27
28 import javax.xml.transform.OutputKeys;
29 import javax.xml.transform.Result;
30 import javax.xml.transform.Transformer;
31 import javax.xml.transform.TransformerConfigurationException;
32 import javax.xml.transform.TransformerException;
33 import javax.xml.transform.TransformerFactory;
34 import javax.xml.transform.TransformerFactoryConfigurationError;
35 import javax.xml.transform.dom.DOMSource;
36 import javax.xml.transform.stream.StreamResult;
37 import javax.xml.xpath.XPath;
38 import javax.xml.xpath.XPathConstants;
39 import javax.xml.xpath.XPathExpressionException;
40 import javax.xml.xpath.XPathFactory;
41
42 import java.io.ByteArrayInputStream;
43 import java.io.ByteArrayOutputStream;
44 import java.io.IOException;
45 import java.io.InputStream;
46 import java.io.StringWriter;
47 import java.io.UnsupportedEncodingException;
48 import java.util.ArrayList;
49 import java.util.List;
50 import java.util.Locale;
51 import java.util.regex.Pattern;
52
53
54
55
56
57
58
59
60
61 public class DomUtils {
62
/** Shared zero-length array, handed back by {@code getXPathListForNode} for a {@code null} node. */
private static final String[] EMPTY_STRING_ARRAY = new String[0];

/**
 * XPath evaluator shared by the XPath-based lookup helpers of this class.
 * NOTE(review): the {@code javax.xml.xpath.XPath} contract says instances are not
 * thread-safe; confirm that these helpers are never invoked concurrently.
 */
private static final XPath xPathEngine = XPathFactory.newInstance().newXPath();

/** Hidden constructor: this class only exposes static helpers. */
private DomUtils() {
    // intentionally empty
}
69
70
71
72
73
74
75
76
77
78
79 public static int getIndexInParent(Node n) {
80 Node parent = n.getParentNode();
81 if (parent == null) {
82 return 0;
83 }
84 NodeList nodes = parent.getChildNodes();
85 int counter = -1;
86 for (int i = 0; i < nodes.getLength(); i++) {
87 Node current = nodes.item(i);
88 if (current.getNodeType() == n.getNodeType() && current.getNodeName().equals(n.getNodeName())) {
89 counter++;
90 }
91 if (current.equals(n)) {
92 return counter;
93 }
94 }
95 throw new IllegalStateException("Cannot find a child within its parent node list.");
96 }
97
98
99
100
101
102
103
104
105
106
107 public static String getXPathForNode(Node node) {
108 final StringBuilder sb = new StringBuilder();
109 Node parent = node;
110 while (parent != null && parent.getNodeType() != Node.DOCUMENT_NODE) {
111 sb.insert(0, "]");
112 sb.insert(0, getIndexInParent(parent) + 1);
113 sb.insert(0, "[");
114 sb.insert(0, parent.getNodeName());
115 sb.insert(0, "/");
116 parent = parent.getParentNode();
117 }
118 return sb.toString();
119 }
120
121
122
123
124
125
126
127
128
129 public static String[] getXPathListForNode(Node n) {
130 if (n == null) {
131 return EMPTY_STRING_ARRAY;
132 }
133 List<String> ancestors = new ArrayList<String>();
134 ancestors.add(String.format(Locale.ROOT, "%s[%s]", n.getNodeName(), getIndexInParent(n)));
135 Node parent = n.getParentNode();
136 while (parent != null) {
137 ancestors.add(0, String.format(Locale.ROOT, "%s[%s]", parent.getNodeName(), getIndexInParent(parent)));
138 parent = parent.getParentNode();
139 }
140 return ancestors.toArray(new String[ancestors.size()]);
141 }
142
143
144
145
146
147
148
149
150
151
152
153 public static int[] getNodeLocation(Node n) {
154 if (n == null)
155 throw new NullPointerException("node cannot be null.");
156 final TagSoupParser.ElementLocation elementLocation = (TagSoupParser.ElementLocation) n
157 .getUserData(TagSoupParser.ELEMENT_LOCATION);
158 if (elementLocation == null)
159 return null;
160 return new int[] { elementLocation.getBeginLineNumber(), elementLocation.getBeginColumnNumber(),
161 elementLocation.getEndLineNumber(), elementLocation.getEndColumnNumber() };
162 }
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177 public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling, boolean strict) {
178 if (candidateAncestor == null)
179 throw new NullPointerException("candidate ancestor cannot be null null.");
180 if (candidateSibling == null)
181 throw new NullPointerException("candidate sibling cannot be null null.");
182 if (strict && candidateAncestor.equals(candidateSibling))
183 return false;
184 Node parent = candidateSibling;
185 while (parent != null) {
186 if (parent.equals(candidateAncestor))
187 return true;
188 parent = parent.getParentNode();
189 }
190 return false;
191 }
192
193
194
195
196
197
198
199
200
201
202
203
204
205 public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling) {
206 return isAncestorOf(candidateAncestor, candidateSibling, false);
207 }
208
209
210
211
212
213
214
215
216
217
218
219
220 public static List<Node> findAllByClassName(Node root, String className) {
221 return findAllBy(root, null, "class", className.toLowerCase(Locale.ROOT));
222 }
223
224
225
226
227
228
229
230
231
232
233
234
235 public static List<Node> findAllByAttributeName(Node root, String attrName) {
236 return findAllBy(root, null, attrName, null);
237 }
238
239 public static List<Node> findAllByAttributeContains(Node node, String attrName, String attrContains) {
240 return findAllBy(node, null, attrName, attrContains);
241 }
242
243 public static List<Node> findAllByTag(Node root, String tagName) {
244 return findAllBy(root, tagName, null, null);
245 }
246
247 public static List<Node> findAllByTagAndClassName(Node root, final String tagName, final String className) {
248 return findAllBy(root, tagName, "class", className);
249 }
250
251
252
253
254
255
256
257
258
259
260
261 public static Node findNodeById(Node root, String id) {
262 Node node;
263 try {
264 String xpath = "//*[@id='" + id + "']";
265 node = (Node) xPathEngine.evaluate(xpath, root, XPathConstants.NODE);
266 } catch (XPathExpressionException ex) {
267 throw new RuntimeException("Should not happen", ex);
268 }
269 return node;
270 }
271
272
273
274
275
276
277
278
279
280
281
282 public static List<Node> findAll(Node node, String xpath) {
283 if (node == null) {
284 throw new NullPointerException("node cannot be null.");
285 }
286 try {
287 NodeList nodes = (NodeList) xPathEngine.evaluate(xpath, node, XPathConstants.NODESET);
288 List<Node> result = new ArrayList<Node>(nodes.getLength());
289 for (int i = 0; i < nodes.getLength(); i++) {
290 result.add(nodes.item(i));
291 }
292 return result;
293 } catch (XPathExpressionException ex) {
294 throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex);
295 }
296 }
297
298
299
300
301
302
303
304
305
306
307
308 public static String find(Node node, String xpath) {
309 try {
310 String val = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
311 if (null == val)
312 return "";
313 return val;
314 } catch (XPathExpressionException ex) {
315 throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex);
316 }
317 }
318
319
320
321
322
323
324
325
326
327
328
329
330 public static boolean hasClassName(Node node, String className) {
331 return hasAttribute(node, "class", className);
332 }
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347 public static boolean hasAttribute(Node node, String attributeName, String className) {
348
349
350 String attr = readAttribute(node, attributeName);
351 for (String c : attr.split("\\s+"))
352 if (c.equalsIgnoreCase(className))
353 return true;
354 return false;
355 }
356
357
358
359
360
361
362
363
364
365
366
367 public static boolean hasAttribute(Node node, String attributeName) {
368 return readAttribute(node, attributeName, null) != null;
369 }
370
371
372
373
374
375
376
377
378
379 public static boolean isElementNode(Node target) {
380 return Node.ELEMENT_NODE == target.getNodeType();
381 }
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396 public static String readAttribute(Node node, String attribute, String defaultValue) {
397 NamedNodeMap attributes = node.getAttributes();
398 if (null == attributes)
399 return defaultValue;
400 Node attr = attributes.getNamedItem(attribute);
401 if (null == attr)
402 return defaultValue;
403 return attr.getNodeValue();
404 }
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419 public static String readAttributeWithPrefix(Node node, String attributePrefix, String defaultValue) {
420 final NamedNodeMap attributes = node.getAttributes();
421 if (null == attributes) {
422 return defaultValue;
423 }
424 Node attribute;
425 for (int a = 0; a < attributes.getLength(); a++) {
426 attribute = attributes.item(a);
427 if (attribute.getNodeName().startsWith(attributePrefix)) {
428 return attribute.getNodeValue();
429 }
430 }
431 return defaultValue;
432 }
433
434
435
436
437
438
439
440
441
442
443
444 public static String readAttribute(Node node, String attribute) {
445 return readAttribute(node, attribute, "");
446 }
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463 public static String serializeToXML(Node node, boolean indent) throws TransformerException, IOException {
464 final DOMSource domSource = new DOMSource(node);
465 final Transformer transformer = TransformerFactory.newInstance().newTransformer();
466 transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
467 transformer.setOutputProperty(OutputKeys.METHOD, "xml");
468 transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
469 if (indent) {
470 transformer.setOutputProperty(OutputKeys.INDENT, "yes");
471 transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
472 }
473 final StringWriter sw = new StringWriter();
474 final StreamResult sr = new StreamResult(sw);
475 transformer.transform(domSource, sr);
476 sw.close();
477 return sw.toString();
478 }
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494 private static List<Node> findAllBy(Node root, final String tagName, final String attrName, String attrContains) {
495 DocumentTraversal documentTraversal = (DocumentTraversal) root.getOwnerDocument();
496 if (documentTraversal == null) {
497 documentTraversal = (DocumentTraversal) root;
498 }
499
500 final Pattern attrContainsPattern;
501 if (attrContains != null && !attrContains.equals("*")) {
502 attrContainsPattern = Pattern.compile("(^|\\s)" + attrContains + "(\\s|$)", Pattern.CASE_INSENSITIVE);
503 } else {
504 attrContainsPattern = null;
505 }
506
507 final List<Node> result = new ArrayList<Node>();
508 NodeIterator nodeIterator = documentTraversal.createNodeIterator(root, NodeFilter.SHOW_ELEMENT,
509 new NodeFilter() {
510 @Override
511 public short acceptNode(Node node) {
512 if (node.getNodeType() == Node.ELEMENT_NODE) {
513 if (tagName != null && !tagName.equals("*") && !tagName.equals(node.getNodeName())) {
514
515 return FILTER_ACCEPT;
516 }
517
518 if (attrName != null) {
519 Node attrNameNode = node.getAttributes().getNamedItem(attrName);
520 if (attrNameNode == null) {
521
522 return FILTER_ACCEPT;
523 }
524
525 if (attrContainsPattern != null
526 && !attrContainsPattern.matcher(attrNameNode.getNodeValue()).find()) {
527
528 return FILTER_ACCEPT;
529 }
530 }
531 result.add(node);
532 }
533 return FILTER_ACCEPT;
534 }
535 }, false);
536
537
538 while (nodeIterator.nextNode() != null)
539 ;
540
541
542 nodeIterator.detach();
543
544 return result;
545 }
546
547
548
549
550
551
552
553
554
555 public static InputStream documentToInputStream(Document doc) {
556 DOMSource source = new DOMSource(doc);
557 StringWriter xmlAsWriter = new StringWriter();
558 StreamResult result = new StreamResult(xmlAsWriter);
559 try {
560 TransformerFactory.newInstance().newTransformer().transform(source, result);
561 } catch (TransformerConfigurationException e) {
562 throw new RuntimeException("Error within Document to InputStream transformation configuration!");
563 } catch (TransformerException e) {
564 throw new RuntimeException("Error whilst transforming the Document to InputStream!");
565 } catch (TransformerFactoryConfigurationError e) {
566 throw new RuntimeException("Error within Document to InputStream transformation configuration factory!");
567 }
568
569 InputStream is = null;
570 try {
571 is = new ByteArrayInputStream(xmlAsWriter.toString().getBytes("UTF-8"));
572 } catch (UnsupportedEncodingException e) {
573 throw new RuntimeException("Error obtaining data with \"UTF-8\" encoding!", e);
574 }
575 return is;
576 }
577
578
579
580
581
582
583
584
585
586 public static InputStream nodeToInputStream(Node node) {
587 ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
588 Result outputTarget = new StreamResult(outputStream);
589 Transformer t = null;
590 try {
591 t = TransformerFactory.newInstance().newTransformer();
592 } catch (TransformerConfigurationException e) {
593 throw new RuntimeException("Serious configuration error.", e);
594 } catch (TransformerFactoryConfigurationError e) {
595 throw new RuntimeException("Serious configuration error.", e);
596 }
597 t.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
598 try {
599 t.transform(new DOMSource(node), outputTarget);
600 } catch (TransformerException e) {
601 throw new RuntimeException("Error whilst transforming the Node to InputStream!");
602 }
603 return new ByteArrayInputStream(outputStream.toByteArray());
604 }
605
606 }