1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html.microformats2;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractionResult;
22 import org.apache.any23.extractor.ExtractorDescription;
23 import org.apache.any23.extractor.TagSoupExtractionResult;
24 import org.apache.any23.vocab.HItem;
25 import org.eclipse.rdf4j.model.BNode;
26 import org.eclipse.rdf4j.model.IRI;
27 import org.eclipse.rdf4j.model.vocabulary.RDF;
28 import org.w3c.dom.Node;
29 import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
30 import org.apache.any23.extractor.html.HTMLDocument;
31
32
33
34
35
36
37 public class HItemExtractor extends EntityBasedMicroformatExtractor {
38
39 private static final HItem vHITEM = HItem.getInstance();
40
41 private static final String[] itemFields = { "name", "url", "photo" };
42
43 @Override
44 public ExtractorDescription getDescription() {
45 return HItemExtractorFactory.getDescriptionInstance();
46 }
47
48 protected String getBaseClassName() {
49 return Microformats2Prefixes.CLASS_PREFIX + "item";
50 }
51
52 @Override
53 protected void resetExtractor() {
54
55 }
56
57 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
58 if (null == node)
59 return false;
60 final HTMLDocument document = new HTMLDocument(node);
61 BNode item = getBlankNodeFor(node);
62 out.writeTriple(item, RDF.TYPE, vHITEM.Item);
63 final String extractorName = getDescription().getExtractorName();
64 addName(document, item);
65 addPhotos(document, item);
66 addUrls(document, item);
67 final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
68 tser.addResourceRoot(document.getPathToLocalRoot(), item, this.getClass());
69 return true;
70 }
71
72 private void mapFieldWithProperty(HTMLDocument fragment, BNode item, String fieldClass, IRI property) {
73 HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
74 conditionallyAddStringProperty(title.source(), item, property, title.value());
75 }
76
77 private void addName(HTMLDocument fragment, BNode item) {
78 mapFieldWithProperty(fragment, item, Microformats2Prefixes.PROPERTY_PREFIX + itemFields[0], vHITEM.name);
79 }
80
81 private void addPhotos(HTMLDocument fragment, BNode item) throws ExtractionException {
82 final HTMLDocument.TextField[] photos = fragment
83 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + itemFields[2]);
84 for (HTMLDocument.TextField photo : photos) {
85 addIRIProperty(item, vHITEM.photo, fragment.resolveIRI(photo.value()));
86 }
87 }
88
89 private void addUrls(HTMLDocument fragment, BNode item) throws ExtractionException {
90 HTMLDocument.TextField[] links = fragment
91 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + itemFields[1]);
92 for (HTMLDocument.TextField link : links) {
93 conditionallyAddResourceProperty(item, vHITEM.url, getHTMLDocument().resolveIRI(link.value()));
94 }
95 }
96 }