1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import java.util.Locale;
21
22 import org.apache.any23.extractor.ExtractionException;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.ExtractorDescription;
25 import org.apache.any23.extractor.TagSoupExtractionResult;
26 import org.apache.any23.vocab.WO;
27 import org.eclipse.rdf4j.model.BNode;
28 import org.eclipse.rdf4j.model.Resource;
29 import org.eclipse.rdf4j.model.IRI;
30 import org.eclipse.rdf4j.model.vocabulary.RDF;
31 import org.w3c.dom.Node;
32
33
34
35
36
37
38
39
40
41 public class SpeciesExtractor extends EntityBasedMicroformatExtractor {
42
43 private static final WO vWO = WO.getInstance();
44
45 private static final String[] classes = { "kingdom", "phylum", "order", "family", "genus", "species", "class", };
46
47
48
49
50
51
52 @Override
53 public ExtractorDescription getDescription() {
54 return SpeciesExtractorFactory.getDescriptionInstance();
55 }
56
57
58
59
60
61
62 @Override
63 protected String getBaseClassName() {
64 return "biota";
65 }
66
67
68
69
70 @Override
71 protected void resetExtractor() {
72
73 }
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89 @Override
90 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
91 BNode biota = getBlankNodeFor(node);
92 conditionallyAddResourceProperty(biota, RDF.TYPE, vWO.species);
93
94 final HTMLDocument fragment = new HTMLDocument(node);
95 addNames(fragment, biota);
96 addClasses(fragment, biota);
97
98 final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
99 tser.addResourceRoot(DomUtils.getXPathListForNode(node), biota, this.getClass());
100
101 return true;
102 }
103
104 private void addNames(HTMLDocument doc, Resource biota) throws ExtractionException {
105 HTMLDocument.TextField binomial = doc.getSingularTextField("binomial");
106 conditionallyAddStringProperty(binomial.source(), biota, vWO.scientificName, binomial.value());
107 HTMLDocument.TextField vernacular = doc.getSingularTextField("vernacular");
108 conditionallyAddStringProperty(vernacular.source(), biota, vWO.speciesName, vernacular.value());
109 }
110
111 private void addClassesName(HTMLDocument doc, Resource biota) throws ExtractionException {
112 for (String clazz : classes) {
113 HTMLDocument.TextField classTextField = doc.getSingularTextField(clazz);
114 conditionallyAddStringProperty(classTextField.source(), biota, resolvePropertyName(clazz),
115 classTextField.value());
116 }
117 }
118
119 private void addClasses(HTMLDocument doc, Resource biota) throws ExtractionException {
120 for (String clazz : classes) {
121 HTMLDocument.TextField classTextField = doc.getSingularUrlField(clazz);
122 if (classTextField.source() != null) {
123 BNode classBNode = getBlankNodeFor(classTextField.source());
124 addBNodeProperty(biota, vWO.getProperty(clazz), classBNode);
125 conditionallyAddResourceProperty(classBNode, RDF.TYPE, resolveClassName(clazz));
126 HTMLDocument fragment = new HTMLDocument(classTextField.source());
127 addClassesName(fragment, classBNode);
128 }
129 }
130 }
131
132 private IRI resolvePropertyName(String clazz) {
133 return vWO.getProperty(String.format(Locale.ROOT, "%sName", clazz));
134 }
135
136 private IRI resolveClassName(String clazz) {
137 String upperCaseClass = clazz.substring(0, 1);
138 return vWO.getClass(
139 String.format(Locale.ROOT, "%s%s", upperCaseClass.toUpperCase(Locale.ROOT), clazz.substring(1)));
140 }
141 }