1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html.microformats2;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractionResult;
22 import org.apache.any23.extractor.ExtractorDescription;
23 import org.apache.any23.extractor.TagSoupExtractionResult;
24 import org.apache.any23.extractor.html.HTMLDocument;
25 import org.apache.any23.vocab.HCard;
26 import org.eclipse.rdf4j.model.BNode;
27 import org.eclipse.rdf4j.model.Resource;
28 import org.eclipse.rdf4j.model.IRI;
29 import org.eclipse.rdf4j.model.vocabulary.RDF;
30 import org.w3c.dom.Node;
31 import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
32 import org.apache.any23.extractor.html.DomUtils;
33
34 import java.util.List;
35
36
37
38
39
40
41 public class HCardExtractor extends EntityBasedMicroformatExtractor {
42
43 private static final HCard vCARD = HCard.getInstance();
44
45 private static final String[] cardFields = { "name", "honorific-prefix", "given-name", "additional-name",
46 "family-name", "sort-string", "honorific-suffix", "nickname", "email", "logo", "photo", "url", "uid",
47 "category", "tel", "note", "bday", "key", "org", "job-title", "role", "impp", "sex", "gender-identity",
48 "anniversary", "adr", "geo" };
49
50 private static final String[] addressFields = { "street-address", "extended-address", "locality", "region",
51 "postal-code", "country-name", "geo" };
52
53 private static final String[] geoFields = { "latitude", "longitude", "altitude" };
54
55 @Override
56 public ExtractorDescription getDescription() {
57 return HCardExtractorFactory.getDescriptionInstance();
58 }
59
60 @Override
61 protected String getBaseClassName() {
62 return Microformats2Prefixes.CLASS_PREFIX + "card";
63 }
64
65 @Override
66 protected void resetExtractor() {
67
68 }
69
70 @Override
71 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
72 final BNode card = getBlankNodeFor(node);
73 conditionallyAddResourceProperty(card, RDF.TYPE, vCARD.Card);
74 final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
75 addName(fragment, card);
76 addHonorificPrefix(fragment, card);
77 addGivenName(fragment, card);
78 addAdditionalName(fragment, card);
79 addFamilyName(fragment, card);
80 addSortString(fragment, card);
81 addHonorificSuffix(fragment, card);
82 addNickname(fragment, card);
83 addEmails(fragment, card);
84 addLogo(fragment, card);
85 addPhoto(fragment, card);
86 addURLs(fragment, card);
87 addUID(fragment, card);
88 addCategories(fragment, card);
89 addTelephones(fragment, card);
90 addNotes(fragment, card);
91 addBday(fragment, card);
92 addKey(fragment, card);
93 addOrg(fragment, card);
94 addJobTitle(fragment, card);
95 addRole(fragment, card);
96 addImpp(fragment, card);
97 addSex(fragment, card);
98 addGenderIdentity(fragment, card);
99 addAnniversary(fragment, card);
100 addGeo(fragment, card);
101 addAdr(fragment, card);
102 final TagSoupExtractionResult../org/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
103 tser.addResourceRoot(DomUtils.getXPathListForNode(node), card, this.getClass());
104 return true;
105 }
106
107 public Resource extractEntityAsEmbeddedProperty(HTMLDocument fragment, BNode card, ExtractionResult out)
108 throws ExtractionException {
109 this.setCurrentExtractionResult(out);
110 addName(fragment, card);
111 addHonorificPrefix(fragment, card);
112 addGivenName(fragment, card);
113 addAdditionalName(fragment, card);
114 addFamilyName(fragment, card);
115 addSortString(fragment, card);
116 addHonorificSuffix(fragment, card);
117 addNickname(fragment, card);
118 addEmails(fragment, card);
119 addLogo(fragment, card);
120 addPhoto(fragment, card);
121 addURLs(fragment, card);
122 addUID(fragment, card);
123 addCategories(fragment, card);
124 addTelephones(fragment, card);
125 addNotes(fragment, card);
126 addBday(fragment, card);
127 addKey(fragment, card);
128 addOrg(fragment, card);
129 addJobTitle(fragment, card);
130 addRole(fragment, card);
131 addImpp(fragment, card);
132 addSex(fragment, card);
133 addGenderIdentity(fragment, card);
134 addAnniversary(fragment, card);
135 addGeo(fragment, card);
136 addAdr(fragment, card);
137 return card;
138 }
139
140 private void mapFieldWithProperty(HTMLDocument fragment, BNode card, String fieldClass, IRI property) {
141 HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
142 conditionallyAddStringProperty(title.source(), card, property, title.value());
143 }
144
145 private void addName(HTMLDocument fragment, BNode card) {
146 mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[0], vCARD.name);
147 }
148
149 private void addHonorificPrefix(HTMLDocument fragment, BNode card) {
150 mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[1],
151 vCARD.honorific_prefix);
152 }
153
154 private void addGivenName(HTMLDocument fragment, BNode card) {
155 mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[2], vCARD.given_name);
156 }
157
158 private void addAdditionalName(HTMLDocument fragment, BNode card) {
159 mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[3],
160 vCARD.additional_name);
161 }
162
163 private void addFamilyName(HTMLDocument fragment, BNode card) {
164 mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[4], vCARD.family_name);
165 }
166
167 private void addSortString(HTMLDocument fragment, BNode card) {
168 mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[5], vCARD.sort_string);
169 }
170
171 private void addHonorificSuffix(HTMLDocument fragment, BNode card) {
172 mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[6],
173 vCARD.honorific_suffix);
174 }
175
176 private void addNickname(HTMLDocument fragment, BNode card) {
177 mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[7], vCARD.nickname);
178 }
179
180 private void addEmails(HTMLDocument fragment, BNode card) throws ExtractionException {
181 final HTMLDocument.TextField[] emails = fragment
182 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[8]);
183 for (HTMLDocument.TextField email : emails) {
184 addIRIProperty(card, vCARD.email, fragment.resolveIRI(email.value()));
185
186 }
187 }
188
189 private void addLogo(HTMLDocument fragment, BNode card) throws ExtractionException {
190 final HTMLDocument.TextField logo = fragment
191 .getSingularUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[9]);
192 if (logo.source() == null)
193 return;
194 addIRIProperty(card, vCARD.logo, fragment.resolveIRI(logo.value()));
195 }
196
197 private void addPhoto(HTMLDocument fragment, BNode card) throws ExtractionException {
198 final HTMLDocument.TextField photo = fragment
199 .getSingularUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[10]);
200 if (photo.source() == null)
201 return;
202 addIRIProperty(card, vCARD.photo, fragment.resolveIRI(photo.value()));
203 }
204
205 private void addURLs(HTMLDocument fragment, BNode card) throws ExtractionException {
206 final HTMLDocument.TextField[] urls = fragment
207 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[11]);
208 for (HTMLDocument.TextField url : urls) {
209 addIRIProperty(card, vCARD.url, fragment.resolveIRI(url.value()));
210
211 }
212 }
213
214 private void addUID(HTMLDocument fragment, BNode card) throws ExtractionException {
215 final HTMLDocument.TextField uid = fragment
216 .getSingularUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[12]);
217 if (uid.source() == null)
218 return;
219 addIRIProperty(card, vCARD.uid, fragment.resolveIRI(uid.value()));
220 }
221
222 private void addCategories(HTMLDocument fragment, BNode entry) {
223 final HTMLDocument.TextField[] categories = fragment
224 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[13]);
225 for (HTMLDocument.TextField category : categories) {
226 conditionallyAddStringProperty(category.source(), entry, vCARD.category, category.value());
227 }
228 }
229
230 private void addTelephones(HTMLDocument fragment, BNode card) {
231 final HTMLDocument.TextField[] telephones = fragment
232 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[14]);
233 for (HTMLDocument.TextField tel : telephones) {
234 Node attribute = tel.source().getAttributes().getNamedItem("value");
235 if (attribute == null) {
236 conditionallyAddStringProperty(tel.source(), card, vCARD.tel, tel.value());
237 } else {
238 conditionallyAddStringProperty(tel.source(), card, vCARD.tel, attribute.getNodeValue());
239 }
240 }
241 }
242
243 private void addNotes(HTMLDocument fragment, BNode entry) {
244 final HTMLDocument.TextField[] categories = fragment
245 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[15]);
246 for (HTMLDocument.TextField category : categories) {
247 conditionallyAddStringProperty(category.source(), entry, vCARD.note, category.value());
248 }
249 }
250
251 private void addBday(HTMLDocument fragment, BNode card) {
252 final HTMLDocument.TextField bday = fragment
253 .getSingularTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + cardFields[16]);
254 if (bday.source() == null)
255 return;
256
257 Node attribute = bday.source().getAttributes().getNamedItem("datetime");
258 if (attribute == null) {
259 conditionallyAddStringProperty(bday.source(), card, vCARD.bday, bday.value());
260 } else {
261 conditionallyAddStringProperty(bday.source(), card, vCARD.bday, attribute.getNodeValue());
262
263 }
264 }
265
266 private void addKey(HTMLDocument fragment, BNode card) throws ExtractionException {
267 final HTMLDocument.TextField uid = fragment
268 .getSingularTextField(Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[17]);
269 if (uid.source() == null)
270 return;
271 addIRIProperty(card, vCARD.key, fragment.resolveIRI(uid.value()));
272 }
273
274 private void addOrg(HTMLDocument fragment, BNode card) {
275 mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[18], vCARD.org);
276 }
277
278 private void addJobTitle(HTMLDocument fragment, BNode card) {
279 mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[19], vCARD.job_title);
280 }
281
282 private void addRole(HTMLDocument fragment, BNode card) {
283 mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[20], vCARD.role);
284 }
285
286 private void addImpp(HTMLDocument fragment, BNode card) throws ExtractionException {
287 final HTMLDocument.TextField impp = fragment
288 .getSingularTextField(Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[21]);
289 if (impp.source() == null)
290 return;
291 addIRIProperty(card, vCARD.impp, fragment.resolveIRI(impp.value()));
292 }
293
294 private void addSex(HTMLDocument fragment, BNode card) {
295 mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[22], vCARD.sex);
296 }
297
298 private void addGenderIdentity(HTMLDocument fragment, BNode card) {
299 mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[23],
300 vCARD.gender_identity);
301 }
302
303 private void addAnniversary(HTMLDocument fragment, BNode card) {
304 final HTMLDocument.TextField anniversary = fragment
305 .getSingularTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + cardFields[24]);
306 if (anniversary.source() == null)
307 return;
308
309 Node attribute = anniversary.source().getAttributes().getNamedItem("datetime");
310 if (attribute == null) {
311 conditionallyAddStringProperty(anniversary.source(), card, vCARD.bday, anniversary.value());
312 } else {
313 conditionallyAddStringProperty(anniversary.source(), card, vCARD.bday, attribute.getNodeValue());
314
315 }
316 }
317
318 private void addAdr(HTMLDocument doc, Resource card) throws ExtractionException {
319 List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[25]
320 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + cardFields[25]);
321 if (nodes.isEmpty())
322 return;
323 for (Node node : nodes) {
324 BNode location = valueFactory.createBNode();
325 addIRIProperty(location, RDF.TYPE, vCARD.Address);
326 HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
327 for (String field : addressFields) {
328 HTMLDocument.TextField[] values = fragment
329 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + field);
330 for (HTMLDocument.TextField val : values) {
331 if (!field.equals("geo")) {
332 conditionallyAddStringProperty(val.source(), location, vCARD.getProperty(field), val.value());
333 } else {
334 addGeo(new HTMLDocument(node), card);
335 }
336 }
337 }
338 }
339 }
340
341 private void addGeo(HTMLDocument doc, Resource card) throws ExtractionException {
342 List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[26]
343 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + cardFields[26]);
344 if (nodes.isEmpty())
345 return;
346 for (Node node : nodes) {
347 BNode location = valueFactory.createBNode();
348 addIRIProperty(location, RDF.TYPE, vCARD.Geo);
349 HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
350 for (String field : geoFields) {
351 HTMLDocument.TextField[] values = fragment
352 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + field);
353 for (HTMLDocument.TextField val : values) {
354 Node attribute = val.source().getAttributes().getNamedItem("title");
355 if (attribute == null) {
356 conditionallyAddStringProperty(val.source(), location, vCARD.getProperty(field), val.value());
357 } else {
358 conditionallyAddStringProperty(val.source(), location, vCARD.getProperty(field),
359 attribute.getNodeValue());
360 }
361 }
362 }
363 }
364 }
365
366 }