1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
22 import org.apache.any23.rdf.RDFUtils;
23 import org.eclipse.rdf4j.model.IRI;
24 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
25 import org.slf4j.Logger;
26 import org.slf4j.LoggerFactory;
27 import org.w3c.dom.Document;
28 import org.w3c.dom.NamedNodeMap;
29 import org.w3c.dom.Node;
30 import org.w3c.dom.NodeList;
31 import org.w3c.dom.Text;
32
33 import javax.xml.xpath.XPath;
34 import javax.xml.xpath.XPathConstants;
35 import javax.xml.xpath.XPathExpressionException;
36 import javax.xml.xpath.XPathFactory;
37 import java.net.URISyntaxException;
38 import java.util.ArrayList;
39 import java.util.List;
40
41
42
43
44
45
46
47
48 public class HTMLDocument {
49
50 private final static XPath xPathEngine = XPathFactory.newInstance().newXPath();
51 private final static Logger log = LoggerFactory.getLogger(HTMLDocument.class);
52
53 private Node document;
54 private java.net.URI baseIRI;
55
56 private final Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(
57 SimpleValueFactory.getInstance());
58
59
60
61
62
63
64
65
66
67 public static TextField readTextField(Node node) {
68 TextField result;
69 final String name = node.getNodeName();
70 final NamedNodeMap attributes = node.getAttributes();
71
72 if (attributes == null) {
73 return new TextField(node.getTextContent(), node);
74 }
75
76 List<Node> values = DomUtils.findAllByClassName(node, "value");
77 if (!values.isEmpty()) {
78 StringBuilder val = new StringBuilder();
79 for (Node n : values)
80 val.append(n.getTextContent());
81 return new TextField(val.toString().trim(), node);
82 }
83 if ("ABBR".equals(name) && (null != attributes.getNamedItem("title"))) {
84 result = new TextField(attributes.getNamedItem("title").getNodeValue(), node);
85 } else if ("A".equals(name)) {
86 if (DomUtils.hasAttribute(node, "rel", "tag")) {
87 String href = extractRelTag(attributes);
88 result = new TextField(href, node);
89 } else
90 result = new TextField(node.getTextContent(), node);
91 } else if (("IMG".equals(name) || "AREA".equals(name)) && (null != attributes.getNamedItem("alt"))) {
92 result = new TextField(attributes.getNamedItem("alt").getNodeValue(), node);
93 } else {
94 result = new TextField(node.getTextContent(), node);
95 }
96 return result;
97 }
98
99
100
101
102
103
104
105
106
107 public static void readUrlField(List<TextField> res, Node node) {
108 String name = node.getNodeName();
109 NamedNodeMap attributes = node.getAttributes();
110 if (null == attributes) {
111 res.add(new TextField(node.getTextContent(), node));
112 return;
113 }
114 if ("A".equals(name) || "AREA".equals(name)) {
115 Node n = attributes.getNamedItem("href");
116 if (n != null) {
117 res.add(new TextField(n.getNodeValue(), n));
118 }
119 } else if ("ABBR".equals(name)) {
120 Node n = attributes.getNamedItem("title");
121 if (n != null) {
122 res.add(new TextField(n.getNodeValue(), n));
123 }
124 } else if ("IMG".equals(name)) {
125 Node n = attributes.getNamedItem("src");
126 if (n != null) {
127 res.add(new TextField(n.getNodeValue(), n));
128 } else {
129 n = attributes.getNamedItem("srcset");
130 if (n != null) {
131 res.add(new TextField(n.getNodeValue().split("[\\s,]+")[0], n));
132 }
133 }
134 } else if ("OBJECT".equals(name)) {
135 Node n = attributes.getNamedItem("data");
136 if (n != null) {
137 res.add(new TextField(n.getNodeValue(), n));
138 }
139 } else {
140 res.add(new TextField(extractHCardTextContent(node), node));
141 }
142 }
143
144 private static String extractHCardTextContent(Node node) {
145 StringBuilder sb = new StringBuilder();
146 NodeList nodes = node.getChildNodes();
147
148 if (extractTextInValue(nodes, sb) == 0) {
149
150 extractTextNotInType(nodes, sb);
151 }
152 return sb.toString();
153 }
154
155 private static int extractTextInValue(NodeList nodes, StringBuilder b) {
156 int count = 0;
157 for (int i = 0, len = nodes.getLength(); i < len; i++) {
158 Node n = nodes.item(i);
159 if (DomUtils.hasClassName(n, "value")) {
160 count++;
161 b.append(n.getTextContent().trim());
162 } else {
163 count += extractTextInValue(n.getChildNodes(), b);
164 }
165 }
166 return count;
167 }
168
169 private static void extractTextNotInType(NodeList nodes, StringBuilder b) {
170 for (int i = 0, len = nodes.getLength(); i < len; i++) {
171 Node n = nodes.item(i);
172 if (n.getNodeType() == Node.TEXT_NODE) {
173 b.append(n.getNodeValue().trim());
174 } else if (!DomUtils.hasClassName(n, "type")) {
175 extractTextNotInType(n.getChildNodes(), b);
176 }
177 }
178 }
179
180
181
182
183
184
185
186
187
188
189 public static String extractRelTag(String hrefAttributeContent) {
190 String[] all = hrefAttributeContent.split("[#?]");
191
192 String path = all[0];
193 int pathLenghtMin1 = path.length() - 1;
194 if ('/' == path.charAt(pathLenghtMin1)) {
195 path = path.substring(0, pathLenghtMin1);
196 }
197 return path;
198 }
199
200
201
202
203
204
205
206
207
208
209 public static String extractRelTag(NamedNodeMap attributes) {
210 return extractRelTag(attributes.getNamedItem("href").getNodeValue());
211 }
212
213
214
215
216
217
218
219
220
221
222
223
224 public static String readNodeContent(Node node, boolean prettify) {
225 final String content = node.getTextContent();
226 return prettify ? content.trim().replaceAll("\\n", " ").replaceAll(" +", " ") : content;
227 }
228
229
230
231
232
233
234
235 public HTMLDocument(Node document) {
236 if (null == document)
237 throw new IllegalArgumentException("node cannot be null when constructing an HTMLDocument");
238 this.document = document;
239 }
240
241
242
243
244
245
246
247
248
249
250 public IRI resolveIRI(String uri) throws ExtractionException {
251 return valueFactory.resolveIRI(uri, getBaseIRI());
252 }
253
254 public String find(String xpath) {
255 return DomUtils.find(getDocument(), xpath);
256 }
257
258 public Node findNodeById(String id) {
259 return DomUtils.findNodeById(getDocument(), id);
260 }
261
262 public List<Node> findAll(String xpath) {
263 return DomUtils.findAll(getDocument(), xpath);
264 }
265
266 public String findMicroformattedValue(String objectTag, String object, String fieldTag, String field, String key) {
267 Node node = findMicroformattedObjectNode(objectTag, object);
268 if (null == node)
269 return "";
270
271 if (DomUtils.hasClassName(node, field))
272 return node.getTextContent();
273
274
275 try {
276 String xpath = ".//" + fieldTag + "[contains(@class, '" + field + "')]/" + key;
277 String value = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
278 if (null == value) {
279 return "";
280 }
281 return value;
282 } catch (XPathExpressionException ex) {
283 throw new RuntimeException("Should not happen, XPath expression is built locally", ex);
284 }
285
286 }
287
288 public Node getDocument() {
289 return document;
290 }
291
292
293
294
295
296
297
298
299
300
301 public TextField getSingularTextField(String className) {
302 TextField[] res = getPluralTextField(className);
303 if (res.length == 0)
304 return new TextField("", null);
305 return res[0];
306 }
307
308
309
310
311
312
313
314
315
316 public TextField[] getPluralTextField(String className) {
317 List<TextField> res = new ArrayList<TextField>();
318 List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
319 for (Node node : nodes) {
320 res.add(readTextField(node));
321 }
322 return res.toArray(new TextField[res.size()]);
323 }
324
325
326
327
328
329
330
331
332
333
334 public TextField getSingularUrlField(String className) {
335 TextField[] res = getPluralUrlField(className);
336 if (res.length < 1)
337 return new TextField("", null);
338 return res[0];
339 }
340
341
342
343
344
345
346
347
348
349 public TextField[] getPluralUrlField(String className) {
350 List<TextField> res = new ArrayList<TextField>();
351 List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
352 for (Node node : nodes)
353 readUrlField(res, node);
354 return res.toArray(new TextField[res.size()]);
355 }
356
357 public Node findMicroformattedObjectNode(String objectTag, String name) {
358 List<Node> nodes = DomUtils.findAllByTagAndClassName(getDocument(), objectTag, name);
359 if (nodes.isEmpty())
360 return null;
361 return nodes.get(0);
362 }
363
364
365
366
367
368
369
370
371
372 public String readAttribute(String attribute) {
373 return DomUtils.readAttribute(getDocument(), attribute);
374 }
375
376
377
378
379
380
381
382
383
384 public List<Node> findAllByClassName(String clazz) {
385 return DomUtils.findAllByClassName(getDocument(), clazz);
386 }
387
388
389
390
391
392
393 public String getText() {
394 NodeList children = getDocument().getChildNodes();
395 if (children.getLength() == 1 && children.item(0) instanceof Text) {
396 return children.item(0).getTextContent();
397 }
398 return null;
399 }
400
401
402
403
404
405
406 public String getDefaultLanguage() {
407 final String xpathLanguageSelector = "/HTML";
408 Node html;
409 try {
410 html = (Node) xPathEngine.evaluate(xpathLanguageSelector, document, XPathConstants.NODE);
411 } catch (XPathExpressionException xpeee) {
412 throw new IllegalStateException();
413 }
414 if (html == null) {
415 return null;
416 }
417 Node langAttribute = html.getAttributes().getNamedItem("xml:lang");
418 return langAttribute == null ? null : langAttribute.getTextContent();
419 }
420
421
422
423
424
425
426 public String[] getPathToLocalRoot() {
427 return DomUtils.getXPathListForNode(document);
428 }
429
430
431
432
433
434
435 public TextField[] extractRelTagNodes() {
436 final List<Node> relTagNodes = DomUtils.findAllByAttributeName(getDocument(), "rel");
437 final List<TextField> result = new ArrayList<TextField>();
438 for (Node relTagNode : relTagNodes) {
439 readUrlField(result, relTagNode);
440 }
441 return result.toArray(new TextField[result.size()]);
442 }
443
444 private java.net.URI getBaseIRI() throws ExtractionException {
445 if (baseIRI == null) {
446
447
448
449
450
451
452
453
454 Document doc = document instanceof Document ? (Document) document : document.getOwnerDocument();
455
456 if (doc == null) {
457 throw new ExtractionException(
458 "Node " + document.getNodeName() + " was not associated with a document.");
459 }
460
461 String uri = doc.getDocumentURI();
462
463 if (uri == null) {
464 throw new ExtractionException("document URI is null, this should not happen");
465 }
466
467 try {
468 baseIRI = new java.net.URI(RDFUtils.fixAbsoluteIRI(uri));
469 } catch (IllegalArgumentException ex) {
470 throw new ExtractionException("Error in base IRI: " + uri, ex);
471 } catch (URISyntaxException ex) {
472 throw new ExtractionException("Error in base IRI: " + uri, ex);
473 }
474 }
475 return baseIRI;
476 }
477
478
479
480
481
482 public static class TextField {
483 private String value;
484 private Node source;
485
486 public TextField(String value, Node source) {
487 this.value = value;
488 this.source = source;
489 }
490
491 public String value() {
492 return value;
493 }
494
495 public Node source() {
496 return source;
497 }
498 }
499
500 }