1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html.microformats2;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractionResult;
22 import org.apache.any23.extractor.ExtractorDescription;
23 import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
24 import org.apache.any23.extractor.html.HTMLDocument;
25 import org.apache.any23.vocab.HProduct;
26 import org.eclipse.rdf4j.model.BNode;
27 import org.eclipse.rdf4j.model.Resource;
28 import org.eclipse.rdf4j.model.IRI;
29 import org.eclipse.rdf4j.model.vocabulary.RDF;
30 import org.w3c.dom.Node;
31
32 import java.util.List;
33
34
35
36
37
38
39 public class HProductExtractor extends EntityBasedMicroformatExtractor {
40
41 private static final HProduct vProduct = HProduct.getInstance();
42
43 private static final String[] productFields = { "name", "photo", "brand", "category", "description", "url",
44 "identifier", "review",
45 "price" };
46
47 @Override
48 public ExtractorDescription getDescription() {
49 return HProductExtractorFactory.getDescriptionInstance();
50 }
51
52 @Override
53 protected String getBaseClassName() {
54 return Microformats2Prefixes.CLASS_PREFIX + "product";
55 }
56
57 @Override
58 protected void resetExtractor() {
59
60 }
61
62 @Override
63 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
64 final BNode product = getBlankNodeFor(node);
65 conditionallyAddResourceProperty(product, RDF.TYPE, vProduct.product);
66 final HTMLDocument fragment = new HTMLDocument(node);
67 addName(fragment, product);
68 addPhoto(fragment, product);
69 addCategories(fragment, product);
70 addDescription(fragment, product);
71 addURLs(fragment, product);
72 addIdentifiers(fragment, product);
73 addPrice(fragment, product);
74 addBrand(fragment, product);
75 return true;
76 }
77
78 private void mapFieldWithProperty(HTMLDocument fragment, BNode product, String fieldClass, IRI property) {
79 HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
80 conditionallyAddStringProperty(title.source(), product, property, title.value());
81 }
82
83 private void addName(HTMLDocument fragment, BNode product) {
84 mapFieldWithProperty(fragment, product, Microformats2Prefixes.PROPERTY_PREFIX + productFields[0],
85 vProduct.name);
86 }
87
88 private void addPhoto(HTMLDocument fragment, BNode product) throws ExtractionException {
89 final HTMLDocument.TextField[] photos = fragment
90 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + productFields[1]);
91 for (HTMLDocument.TextField photo : photos) {
92 addIRIProperty(product, vProduct.photo, fragment.resolveIRI(photo.value()));
93 }
94 }
95
96 private void addCategories(HTMLDocument fragment, BNode product) {
97 final HTMLDocument.TextField[] categories = fragment
98 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + productFields[3]);
99 for (HTMLDocument.TextField category : categories) {
100 conditionallyAddStringProperty(category.source(), product, vProduct.category, category.value());
101 }
102 }
103
104 private void addDescription(HTMLDocument fragment, BNode product) {
105 mapFieldWithProperty(fragment, product, Microformats2Prefixes.EMBEDDED_PROPERTY_PREFIX + productFields[4],
106 vProduct.description);
107 }
108
109 private void addURLs(HTMLDocument fragment, BNode product) throws ExtractionException {
110 final HTMLDocument.TextField[] urls = fragment
111 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + productFields[5]);
112 for (HTMLDocument.TextField url : urls) {
113 addIRIProperty(product, vProduct.url, fragment.resolveIRI(url.value()));
114 }
115 }
116
117 private void addIdentifiers(HTMLDocument fragment, BNode product) throws ExtractionException {
118 final HTMLDocument.TextField[] identifiers = fragment
119 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + productFields[6]);
120 for (HTMLDocument.TextField identifier : identifiers) {
121 addIRIProperty(product, vProduct.identifier, fragment.resolveIRI(identifier.value()));
122 }
123 }
124
125 private void addPrice(HTMLDocument fragment, BNode product) {
126 final HTMLDocument.TextField price = fragment
127 .getSingularTextField(Microformats2Prefixes.PROPERTY_PREFIX + productFields[8]);
128 if (price.source() == null)
129 return;
130 Node attribute = price.source().getAttributes().getNamedItem("value");
131 if (attribute == null) {
132 conditionallyAddStringProperty(price.source(), product, vProduct.price, price.value());
133 } else {
134 conditionallyAddStringProperty(price.source(), product, vProduct.price, attribute.getNodeValue());
135 }
136 }
137
138 private void addBrand(HTMLDocument doc, Resource product) throws ExtractionException {
139 List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + productFields[2]
140 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card");
141 if (nodes.isEmpty())
142 return;
143 HCardExtractorFactory factory = new HCardExtractorFactory();
144 HCardExtractor extractor = factory.createExtractor();
145 for (Node node : nodes) {
146 BNode brand = valueFactory.createBNode();
147 addIRIProperty(brand, RDF.TYPE, vProduct.brand);
148 extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), brand, getCurrentExtractionResult());
149 }
150 }
151 }