1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractionResult;
22 import org.apache.any23.extractor.ExtractorDescription;
23 import org.apache.any23.extractor.IssueReport;
24 import org.apache.any23.extractor.TagSoupExtractionResult;
25 import org.apache.any23.extractor.html.annotations.Includes;
26 import org.apache.any23.vocab.VCard;
27 import org.apache.commons.lang3.StringUtils;
28 import org.eclipse.rdf4j.model.BNode;
29 import org.eclipse.rdf4j.model.Resource;
30 import org.eclipse.rdf4j.model.IRI;
31 import org.eclipse.rdf4j.model.vocabulary.RDF;
32 import org.w3c.dom.NamedNodeMap;
33 import org.w3c.dom.Node;
34
35 import java.util.ArrayList;
36 import java.util.Collection;
37 import java.util.List;
38
39 import static org.apache.any23.extractor.html.HTMLDocument.TextField;
40
41
42
43
44
45
46 @Includes(extractors = AdrExtractor.class)
47 public class HCardExtractor extends EntityBasedMicroformatExtractor {
48
49 private static final VCard vCARD = VCard.getInstance();
50
51 private HCardName name = new HCardName();
52
53 private HTMLDocument fragment;
54
55 @Override
56 public ExtractorDescription getDescription() {
57 return HCardExtractorFactory.getDescriptionInstance();
58 }
59
60 @Override
61 protected String getBaseClassName() {
62 return "vcard";
63 }
64
65 @Override
66 protected void resetExtractor() {
67 name.reset();
68 }
69
70 private void fixIncludes(HTMLDocument document, Node node, IssueReport report) {
71 NamedNodeMap attributes = node.getAttributes();
72
73 if ("TD".equals(node.getNodeName()) && (null != attributes.getNamedItem("headers"))) {
74 String id = attributes.getNamedItem("headers").getNodeValue();
75 Node header = document.findNodeById(id);
76 if (null != header) {
77 node.appendChild(header.cloneNode(true));
78 attributes.removeNamedItem("headers");
79 }
80 }
81
82
83 for (Node current : DomUtils.findAllByAttributeName(document.getDocument(), "class")) {
84 if (!DomUtils.hasClassName(current, "include"))
85 continue;
86
87
88 current.getAttributes().removeNamedItem("class");
89 ArrayList<TextField> res = new ArrayList<TextField>();
90 HTMLDocument.readUrlField(res, current);
91 if (res.isEmpty())
92 continue;
93 TextField id = res.get(0);
94 if (null == id)
95 continue;
96 TextField refId = new TextField(StringUtils.substringAfter(id.value(), "#"), id.source());
97 Node included = document.findNodeById(refId.value());
98 if (null == included)
99 continue;
100 if (DomUtils.isAncestorOf(included, current)) {
101 final int[] nodeLocation = DomUtils.getNodeLocation(current);
102 report.notifyIssue(IssueReport.IssueLevel.WARNING, "Current node tries to include an ancestor node.",
103 nodeLocation == null ? -1 : nodeLocation[0], nodeLocation == null ? -1 : nodeLocation[1]);
104 continue;
105 }
106 current.appendChild(included.cloneNode(true));
107 }
108 }
109
110 @Override
111 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
112 this.fragment = new HTMLDocument(node);
113 fixIncludes(getHTMLDocument(), node, out);
114 final BNode card = getBlankNodeFor(node);
115 boolean foundSomething = false;
116
117 readFn();
118 readNames();
119 readOrganization();
120 foundSomething |= addFn(card);
121 foundSomething |= addNames(card);
122 foundSomething |= addOrganizationName(card);
123 foundSomething |= addStringProperty("sort-string", card, vCARD.sort_string);
124 foundSomething |= addUrl(card);
125 foundSomething |= addEmail(card);
126 foundSomething |= addPhoto(card);
127 foundSomething |= addLogo(card);
128 foundSomething |= addUid(card);
129 foundSomething |= addClass(card);
130 foundSomething |= addStringProperty("bday", card, vCARD.bday);
131 foundSomething |= addStringProperty("rev", card, vCARD.rev);
132 foundSomething |= addStringProperty("tz", card, vCARD.tz);
133 foundSomething |= addCategory(card);
134 foundSomething |= addStringProperty("card", card, vCARD.class_);
135 foundSomething |= addSubMicroformat("adr", card, vCARD.adr);
136 foundSomething |= addTelephones(card);
137 foundSomething |= addStringProperty("title", card, vCARD.title);
138 foundSomething |= addStringProperty("role", card, vCARD.role);
139 foundSomething |= addStringMultiProperty("note", card, vCARD.note);
140 foundSomething |= addSubMicroformat("geo", card, vCARD.geo);
141
142 if (!foundSomething)
143 return false;
144 out.writeTriple(card, RDF.TYPE, vCARD.VCard);
145
146 final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
147 tser.addResourceRoot(DomUtils.getXPathListForNode(node), card, this.getClass());
148
149 return true;
150 }
151
152 private boolean addTelephones(Resource card) {
153 boolean found = false;
154 for (Node node : DomUtils.findAllByAttributeContains(fragment.getDocument(), "class", "tel")) {
155 HTMLDocument telFragment = new HTMLDocument(node);
156 TextField[] values = telFragment.getPluralUrlField("value");
157 if (values.length == 0) {
158
159 String[] typeAndValue = telFragment.getSingularUrlField("tel").value().split(":");
160
161 if (typeAndValue.length > 1) {
162 found |= addTel(card, "tel", typeAndValue[1]);
163 } else {
164 found |= addTel(card, "tel", typeAndValue[0]);
165 }
166 } else {
167 final String[] valuesStr = new String[values.length];
168 for (int i = 0; i < values.length; i++) {
169 valuesStr[i] = values[i].value();
170 }
171 HTMLDocument.TextField[] types = telFragment.getPluralTextField("type");
172 if (types.length == 0) {
173 found |= addTel(card, "tel", StringUtils.join(valuesStr));
174 }
175 for (HTMLDocument.TextField type : types) {
176 found |= addTel(card, type.value(), StringUtils.join(valuesStr));
177 }
178 }
179 }
180 return found;
181 }
182
183 private boolean addTel(Resource card, String type, String value) {
184 IRI tel = super.fixLink(value, "tel");
185 IRI composed = vCARD.getProperty(type + "Tel", null);
186 if (composed == null) {
187 IRI simple = vCARD.getProperty(type, null);
188 if (simple == null) {
189 return conditionallyAddResourceProperty(card, vCARD.tel, tel);
190 }
191 return conditionallyAddResourceProperty(card, simple, tel);
192 }
193 return conditionallyAddResourceProperty(card, composed, tel);
194 }
195
196 private boolean addSubMicroformat(String className, Resource resource, IRI property) {
197 List<Node> nodes = fragment.findAllByClassName(className);
198 if (nodes.isEmpty())
199 return false;
200 for (Node node : nodes) {
201 addBNodeProperty(node, resource, property, getBlankNodeFor(node));
202 }
203 return true;
204 }
205
206 private boolean addStringProperty(String className, Resource resource, IRI property) {
207 final HTMLDocument.TextField textField = fragment.getSingularTextField(className);
208 return conditionallyAddStringProperty(textField.source(), resource, property, textField.value());
209 }
210
211
212
213
214
215
216
217
218
219
220 private boolean addStringMultiProperty(String className, Resource resource, IRI property) {
221 HTMLDocument.TextField[] fields = fragment.getPluralTextField(className);
222 boolean found = false;
223 for (HTMLDocument.TextField field : fields) {
224 found |= conditionallyAddStringProperty(field.source(), resource, property, field.value());
225 }
226 return found;
227 }
228
229 private boolean addCategory(Resource card) {
230 HTMLDocument.TextField[] categories = fragment.getPluralTextField("category");
231 boolean found = false;
232 for (HTMLDocument.TextField category : categories) {
233 found |= conditionallyAddStringProperty(category.source(), card, vCARD.category, category.value());
234 }
235 return found;
236 }
237
238 private boolean addUid(Resource card) {
239 TextField uid = fragment.getSingularUrlField("uid");
240 return conditionallyAddStringProperty(fragment.getDocument(), card, vCARD.uid, uid.value());
241 }
242
243 private boolean addClass(Resource card) {
244 TextField class_ = fragment.getSingularUrlField("class");
245 return conditionallyAddStringProperty(fragment.getDocument(), card, vCARD.class_, class_.value());
246 }
247
248 private boolean addLogo(Resource card) throws ExtractionException {
249 TextField[] links = fragment.getPluralUrlField("logo");
250 boolean found = false;
251 for (TextField link : links) {
252 found |= conditionallyAddResourceProperty(card, vCARD.logo, getHTMLDocument().resolveIRI(link.value()));
253 }
254 return found;
255 }
256
257 private boolean addPhoto(Resource card) throws ExtractionException {
258 TextField[] links = fragment.getPluralUrlField("photo");
259 boolean found = false;
260 for (TextField link : links) {
261 found |= conditionallyAddResourceProperty(card, vCARD.photo, getHTMLDocument().resolveIRI(link.value()));
262 }
263 return found;
264 }
265
266 private boolean addEmail(Resource card) {
267 String email = dropSubject(fragment.getSingularUrlField("email").value());
268 return conditionallyAddResourceProperty(card, vCARD.email, fixLink(email, "mailto"));
269 }
270
271 private String dropSubject(String mail) {
272 if (mail == null)
273 return null;
274 return mail.split("\\?")[0];
275 }
276
277 private void readNames() {
278 for (String field : HCardName.FIELDS) {
279 HTMLDocument.TextField[] values = fragment.getPluralTextField(field);
280 for (HTMLDocument.TextField text : values) {
281 if ("".equals(text.value()))
282 continue;
283 name.setField(field, text);
284 }
285 }
286 }
287
288 private void addFieldTriple(Node n, BNode bn, String fieldName, String fieldValue) {
289 conditionallyAddLiteralProperty(n, bn, vCARD.getProperty(fieldName), valueFactory.createLiteral(fieldValue));
290 }
291
292 private boolean addNames(Resource card) {
293 BNode n = valueFactory.createBNode();
294 addBNodeProperty(this.fragment.getDocument(), card, vCARD.n, n);
295 addIRIProperty(n, RDF.TYPE, vCARD.Name);
296
297 for (String fieldName : HCardName.FIELDS) {
298 if (!name.containsField(fieldName)) {
299 continue;
300 }
301 if (name.isMultiField(fieldName)) {
302 Collection<HTMLDocument.TextField> values = name.getFields(fieldName);
303 for (TextField value : values) {
304 addFieldTriple(value.source(), n, fieldName, value.value());
305 }
306 } else {
307 TextField value = name.getField(fieldName);
308 if (value == null) {
309 continue;
310 }
311 addFieldTriple(value.source(), n, fieldName, value.value());
312 }
313 }
314 return true;
315 }
316
317 private void readFn() {
318 name.setFullName(fragment.getSingularTextField("fn"));
319 }
320
321 private boolean addFn(Resource card) {
322 final TextField fullNameTextField = name.getFullName();
323 if (fullNameTextField == null) {
324 return false;
325 }
326 return conditionallyAddStringProperty(fullNameTextField.source(), card, vCARD.fn, fullNameTextField.value());
327 }
328
329 private void readOrganization() {
330 Node node = fragment.findMicroformattedObjectNode("*", "org");
331 if (node == null)
332 return;
333 HTMLDocument doc = new HTMLDocument(node);
334 String nodeText = doc.getText();
335 if (nodeText != null) {
336 name.setOrganization(new HTMLDocument.TextField(nodeText, node));
337 }
338 nodeText = doc.getSingularTextField("organization-name").value();
339 if (nodeText == null || "".equals(nodeText)) {
340 nodeText = HTMLDocument.readTextField(node).value();
341 }
342 name.setOrganization(new TextField(nodeText, node));
343
344 name.setOrganizationUnit(doc.getSingularTextField("organization-unit"));
345 }
346
347 private boolean addOrganizationName(Resource card) {
348 if (name.getOrganization() == null)
349 return false;
350 BNode org = valueFactory.createBNode();
351 addBNodeProperty(this.fragment.getDocument(), card, vCARD.org, org);
352 addIRIProperty(org, RDF.TYPE, vCARD.Organization);
353 final TextField organizationTextField = name.getOrganization();
354 conditionallyAddLiteralProperty(organizationTextField.source(), org, vCARD.organization_name,
355 valueFactory.createLiteral(organizationTextField.value()));
356 final TextField organizationUnitTextField = name.getOrganizationUnit();
357 if (organizationUnitTextField != null) {
358 conditionallyAddStringProperty(organizationUnitTextField.source(), org, vCARD.organization_unit,
359 organizationUnitTextField.value());
360 }
361 return true;
362 }
363
364 private boolean addUrl(Resource card) throws ExtractionException {
365 TextField[] links = fragment.getPluralUrlField("url");
366 boolean found = false;
367 for (TextField link : links) {
368 found |= conditionallyAddResourceProperty(card, vCARD.url, getHTMLDocument().resolveIRI(link.value()));
369 }
370 return found;
371 }
372
373 }