1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionResult;
21 import org.apache.any23.extractor.ExtractorDescription;
22 import org.apache.any23.extractor.TagSoupExtractionResult;
23 import org.apache.any23.vocab.VCard;
24 import org.eclipse.rdf4j.model.BNode;
25 import org.eclipse.rdf4j.model.vocabulary.RDF;
26 import org.w3c.dom.Node;
27
28
29
30
31
32
33 public class AdrExtractor extends EntityBasedMicroformatExtractor {
34
35 private static final VCard vVCARD = VCard.getInstance();
36
37 private static final String[] addressFields = { "post-office-box", "extended-address", "street-address", "locality",
38 "region", "country-name", "postal-code" };
39
40 protected String getBaseClassName() {
41 return "adr";
42 }
43
44 @Override
45 protected void resetExtractor() {
46
47 }
48
49 protected boolean extractEntity(Node node, ExtractionResult out) {
50 if (null == node)
51 return false;
52
53 final HTMLDocumentTMLDocument.html#HTMLDocument">HTMLDocument document = new HTMLDocument(node);
54 BNode adr = getBlankNodeFor(node);
55 out.writeTriple(adr, RDF.TYPE, vVCARD.Address);
56 final String extractorName = getDescription().getExtractorName();
57 for (String field : addressFields) {
58 HTMLDocument.TextField[] values = document.getPluralTextField(field);
59 for (HTMLDocument.TextField val : values) {
60 conditionallyAddStringProperty(val.source(), adr, vVCARD.getProperty(field), val.value());
61 }
62 }
63 HTMLDocument.TextField[] types = document.getPluralTextField("type");
64 for (HTMLDocument.TextField val : types) {
65 conditionallyAddStringProperty(val.source(), adr, vVCARD.addressType, val.value());
66 }
67
68 final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
69 tser.addResourceRoot(document.getPathToLocalRoot(), adr, this.getClass());
70
71 return true;
72 }
73
74 @Override
75 public ExtractorDescription getDescription() {
76 return AdrExtractorFactory.getDescriptionInstance();
77 }
78
79 }