1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html.microformats2;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractionResult;
22 import org.apache.any23.extractor.ExtractorDescription;
23 import org.apache.any23.extractor.TagSoupExtractionResult;
24 import org.apache.any23.extractor.html.microformats2.annotations.Includes;
25 import org.apache.any23.vocab.VCard;
26 import org.eclipse.rdf4j.model.BNode;
27 import org.eclipse.rdf4j.model.Resource;
28 import org.eclipse.rdf4j.model.vocabulary.RDF;
29 import org.w3c.dom.Node;
30 import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
31 import org.apache.any23.extractor.html.HTMLDocument;
32
33
34
35
36
37
38 @Includes(extractors = HGeoExtractor.class)
39 public class HAdrExtractor extends EntityBasedMicroformatExtractor {
40
41 private static final VCard vVCARD = VCard.getInstance();
42
43 private static final String[] addressFields = { "street-address", "extended-address", "locality", "region",
44 "postal-code", "country-name", "geo" };
45
46 private static final String[] geoFields = { "latitude", "longitude", "altitude" };
47
48 protected String getBaseClassName() {
49 return Microformats2Prefixes.CLASS_PREFIX + "adr";
50 }
51
52 @Override
53 protected void resetExtractor() {
54
55 }
56
57 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
58 if (null == node)
59 return false;
60 final HTMLDocument document = new HTMLDocument(node);
61 BNode adr = getBlankNodeFor(node);
62 out.writeTriple(adr, RDF.TYPE, vVCARD.Address);
63 final String extractorName = getDescription().getExtractorName();
64 for (String field : addressFields) {
65 HTMLDocument.TextField[] values = document
66 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + field);
67 for (HTMLDocument.TextField val : values) {
68 if (!field.equals("geo")) {
69 conditionallyAddStringProperty(val.source(), adr, vVCARD.getProperty(field), val.value());
70 } else {
71 String[] composed = val.value().split(";");
72 for (int counter = 0; counter < composed.length; counter++) {
73 conditionallyAddStringProperty(val.source(), adr, vVCARD.getProperty(geoFields[counter]),
74 composed[counter]);
75
76 }
77 }
78 }
79 }
80 addGeoAsUrlResource(adr, document);
81 final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
82 tser.addResourceRoot(document.getPathToLocalRoot(), adr, this.getClass());
83 return true;
84 }
85
86 private void addGeoAsUrlResource(Resource card, HTMLDocument document) throws ExtractionException {
87 HTMLDocument.TextField[] links = document.getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + "geo");
88 for (HTMLDocument.TextField link : links) {
89 conditionallyAddResourceProperty(card, vVCARD.geo, getHTMLDocument().resolveIRI(link.value()));
90 }
91 }
92
93 @Override
94 public ExtractorDescription getDescription() {
95 return HAdrExtractorFactory.getDescriptionInstance();
96 }
97
98 }