1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractionResult;
22 import org.apache.any23.extractor.ExtractorDescription;
23 import org.apache.any23.extractor.TagSoupExtractionResult;
24 import org.apache.any23.vocab.DCTerms;
25 import org.apache.any23.vocab.Review;
26 import org.apache.any23.vocab.VCard;
27 import org.eclipse.rdf4j.model.BNode;
28 import org.eclipse.rdf4j.model.Resource;
29 import org.eclipse.rdf4j.model.vocabulary.RDF;
30 import org.w3c.dom.Node;
31
32 import java.util.List;
33
34 import static org.apache.any23.extractor.html.HTMLDocument.TextField;
35
36
37
38
39
40
41 public class HReviewExtractor extends EntityBasedMicroformatExtractor {
42
43 private static final Review vREVIEW = Review.getInstance();
44 private static final VCard vVCARD = VCard.getInstance();
45 private static final DCTerms vDCTERMS = DCTerms.getInstance();
46
47 @Override
48 public ExtractorDescription getDescription() {
49 return HReviewExtractorFactory.getDescriptionInstance();
50 }
51
52 @Override
53 protected String getBaseClassName() {
54 return "hreview";
55 }
56
57 @Override
58 protected void resetExtractor() {
59
60 }
61
62 @Override
63 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
64 BNode rev = getBlankNodeFor(node);
65 out.writeTriple(rev, RDF.TYPE, vREVIEW.Review);
66 final HTMLDocumentTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
67 addRating(fragment, rev);
68 addSummary(fragment, rev);
69 addTime(fragment, rev);
70 addType(fragment, rev);
71 addDescription(fragment, rev);
72 addItem(fragment, rev);
73 addReviewer(fragment, rev);
74
75 final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
76 tser.addResourceRoot(DomUtils.getXPathListForNode(node), rev, this.getClass());
77
78 return true;
79 }
80
81 private void addType(HTMLDocument doc, Resource rev) {
82 TextField value = doc.getSingularTextField("type");
83 conditionallyAddStringProperty(value.source(), rev, vREVIEW.type, value.value());
84 }
85
86 private void addReviewer(HTMLDocument doc, Resource rev) {
87 List<Node> nodes = doc.findAllByClassName("reviewer");
88 if (nodes.size() > 0) {
89 Node node0 = nodes.get(0);
90 addBNodeProperty(node0, rev, vREVIEW.reviewer, getBlankNodeFor(node0));
91 }
92 }
93
94 private void addItem(HTMLDocument root, BNode rev) throws ExtractionException {
95 List<Node> nodes = root.findAllByClassName("item");
96 for (Node node : nodes) {
97 Resource item = findDummy(new HTMLDocument(node));
98 addBNodeProperty(node, item, vREVIEW.hasReview, rev);
99 }
100 }
101
102 private Resource findDummy(HTMLDocument item) throws ExtractionException {
103 Resource blank = getBlankNodeFor(item.getDocument());
104 TextField val = item.getSingularTextField("fn");
105 conditionallyAddStringProperty(val.source(), blank, vVCARD.fn, val.value());
106 final TextField url = item.getSingularUrlField("url");
107 conditionallyAddResourceProperty(blank, vVCARD.url, getHTMLDocument().resolveIRI(url.value()));
108 TextField pics[] = item.getPluralUrlField("photo");
109 for (TextField pic : pics) {
110 addIRIProperty(blank, vVCARD.photo, getHTMLDocument().resolveIRI(pic.value()));
111 }
112 return blank;
113 }
114
115 private void addRating(HTMLDocument doc, Resource rev) {
116 HTMLDocument.TextField value = doc.getSingularTextField("rating");
117 conditionallyAddStringProperty(value.source(), rev, vREVIEW.rating, value.value());
118 }
119
120 private void addSummary(HTMLDocument doc, Resource rev) {
121 TextField value = doc.getSingularTextField("summary");
122 conditionallyAddStringProperty(value.source(), rev, vREVIEW.title, value.value());
123 }
124
125 private void addTime(HTMLDocument doc, Resource rev) {
126 TextField value = doc.getSingularTextField("dtreviewed");
127 conditionallyAddStringProperty(value.source(), rev, vDCTERMS.date, value.value());
128 }
129
130 private void addDescription(HTMLDocument doc, Resource rev) {
131 TextField value = doc.getSingularTextField("description");
132 conditionallyAddStringProperty(value.source(), rev, vREVIEW.text, value.value());
133 }
134
135 }