1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractionResult;
22 import org.apache.any23.extractor.ExtractorDescription;
23 import org.apache.any23.extractor.TagSoupExtractionResult;
24 import org.apache.any23.vocab.FOAF;
25 import org.apache.any23.vocab.HListing;
26 import org.eclipse.rdf4j.model.BNode;
27 import org.eclipse.rdf4j.model.Resource;
28 import org.eclipse.rdf4j.model.IRI;
29 import org.eclipse.rdf4j.model.vocabulary.RDF;
30 import org.w3c.dom.Node;
31
32 import java.util.ArrayList;
33 import java.util.Arrays;
34 import java.util.HashSet;
35 import java.util.List;
36 import java.util.Set;
37
38 import static org.apache.any23.extractor.html.HTMLDocument.TextField;
39
40
41
42
43
44
45 public class HListingExtractor extends EntityBasedMicroformatExtractor {
46
47 private static final HListing hLISTING = HListing.getInstance();
48 private static final FOAF foaf = FOAF.getInstance();
49
50 private static final Set<String> ActionClasses = new HashSet<String>() {
51 {
52 add("sell");
53 add("rent");
54 add("trade");
55 add("meet");
56 add("announce");
57 add("offer");
58 add("wanted");
59 add("event");
60 add("service");
61 }
62 };
63
64 private static final List<String> validClassesForAddress = Arrays.asList("post-office-box", "extended-address",
65 "street-address", "locality", "region", "postal-code", "country-name");
66
67 private HTMLDocument fragment;
68
69 @Override
70 public ExtractorDescription getDescription() {
71 return HListingExtractorFactory.getDescriptionInstance();
72 }
73
74 protected String getBaseClassName() {
75 return "hlisting";
76 }
77
78 @Override
79 protected void resetExtractor() {
80
81 }
82
83 @Override
84 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
85 this.fragment = new HTMLDocument(node);
86 BNode listing = getBlankNodeFor(node);
87 out.writeTriple(listing, RDF.TYPE, hLISTING.Listing);
88
89 for (String action : findActions(fragment)) {
90 out.writeTriple(listing, hLISTING.action, hLISTING.getClass(action));
91 }
92 out.writeTriple(listing, hLISTING.lister, addLister());
93 addItem(listing);
94 addDateTimes(listing);
95 addPrice(listing);
96 addDescription(listing);
97 addSummary(listing);
98 addPermalink(listing);
99
100 final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
101 tser.addResourceRoot(DomUtils.getXPathListForNode(node), listing, this.getClass());
102
103 return true;
104 }
105
106 private void addItem(Resource listing) throws ExtractionException {
107 Node node = fragment.findMicroformattedObjectNode("*", "item");
108 if (null == node)
109 return;
110 BNode blankItem = valueFactory.createBNode();
111 addBNodeProperty(node, listing, hLISTING.item, blankItem);
112 addIRIProperty(blankItem, RDF.TYPE, hLISTING.Item);
113
114 HTMLDocumentml/HTMLDocument.html#HTMLDocument">HTMLDocument item = new HTMLDocument(node);
115
116 addItemName(item, blankItem);
117 addItemUrl(item, blankItem);
118
119 addItemPhoto(fragment, blankItem);
120 addItemAddresses(fragment, blankItem);
121 }
122
123 private void addItemAddresses(HTMLDocument doc, Resource blankItem) {
124 final String extractorName = getDescription().getExtractorName();
125 for (Node node : doc.findAll(".//*[contains(@class,'adr')]//*[@class]")) {
126 String[] klasses = node.getAttributes().getNamedItem("class").getNodeValue().split("\\s+");
127 for (String klass : klasses)
128 if (validClassesForAddress.contains(klass)) {
129 String value = node.getNodeValue();
130
131 if (!(null == value || "".equals(value))) {
132 IRI property = hLISTING.getPropertyCamelCase(klass);
133 conditionallyAddLiteralProperty(node, blankItem, property, valueFactory.createLiteral(value));
134 }
135 }
136 }
137 }
138
139 private void addPermalink(Resource listing) {
140 String link = fragment.find(".//A[contains(@rel,'self') and contains(@rel,'bookmark')]/@href");
141 conditionallyAddStringProperty(fragment.getDocument(), listing, hLISTING.permalink, link);
142 }
143
144 private void addPrice(Resource listing) {
145 TextField price = fragment.getSingularTextField("price");
146 conditionallyAddStringProperty(price.source(), listing, hLISTING.price, price.value());
147 }
148
149 private void addDescription(Resource listing) {
150 TextField description = fragment.getSingularTextField("description");
151 conditionallyAddStringProperty(description.source(), listing, hLISTING.description, description.value());
152 }
153
154 private void addSummary(Resource listing) {
155 TextField summary = fragment.getSingularTextField("summary");
156 conditionallyAddStringProperty(summary.source(), listing, hLISTING.summary, summary.value());
157 }
158
159 private void addDateTimes(Resource listing) {
160 TextField listed = fragment.getSingularTextField("dtlisted");
161 conditionallyAddStringProperty(listed.source(), listing, hLISTING.dtlisted, listed.value());
162 HTMLDocument.TextField expired = fragment.getSingularTextField("dtexpired");
163 conditionallyAddStringProperty(expired.source(), listing, hLISTING.dtexpired, expired.value());
164 }
165
166 private Resource addLister() throws ExtractionException {
167 Resource blankLister = valueFactory.createBNode();
168 addIRIProperty(blankLister, RDF.TYPE, hLISTING.Lister);
169 Node node = fragment.findMicroformattedObjectNode("*", "lister");
170 if (null == node)
171 return blankLister;
172 HTMLDocumentLDocument.html#HTMLDocument">HTMLDocument listerNode = new HTMLDocument(node);
173 addListerFn(listerNode, blankLister);
174 addListerOrg(listerNode, blankLister);
175 addListerEmail(listerNode, blankLister);
176 addListerUrl(listerNode, blankLister);
177 addListerTel(listerNode, blankLister);
178 addListerLogo(listerNode, blankLister);
179 return blankLister;
180 }
181
182 private void addListerTel(HTMLDocument doc, Resource blankLister) {
183 HTMLDocument.TextField tel = doc.getSingularTextField("tel");
184 conditionallyAddStringProperty(tel.source(), blankLister, hLISTING.tel, tel.value());
185 }
186
187 private void addListerUrl(HTMLDocument doc, Resource blankLister) throws ExtractionException {
188 TextField url = doc.getSingularUrlField("url");
189 conditionallyAddResourceProperty(blankLister, hLISTING.listerUrl, getHTMLDocument().resolveIRI(url.value()));
190 }
191
192 private void addListerEmail(HTMLDocument doc, Resource blankLister) {
193 TextField email = doc.getSingularUrlField("email");
194 conditionallyAddResourceProperty(blankLister, foaf.mbox, fixLink(email.value(), "mailto"));
195 }
196
197 private void addListerFn(HTMLDocument doc, Resource blankLister) {
198 TextField fn = doc.getSingularTextField("fn");
199 conditionallyAddStringProperty(fn.source(), blankLister, hLISTING.listerName, fn.value());
200 }
201
202 private void addListerLogo(HTMLDocument doc, Resource blankLister) throws ExtractionException {
203 TextField logo = doc.getSingularUrlField("logo");
204 conditionallyAddResourceProperty(blankLister, hLISTING.listerLogo, getHTMLDocument().resolveIRI(logo.value()));
205 }
206
207 private void addListerOrg(HTMLDocument doc, Resource blankLister) {
208 TextField org = doc.getSingularTextField("org");
209 conditionallyAddStringProperty(org.source(), blankLister, hLISTING.listerOrg, org.value());
210 }
211
212 private void addItemName(HTMLDocument item, Resource blankItem) {
213 HTMLDocument.TextField fn = item.getSingularTextField("fn");
214 conditionallyAddStringProperty(fn.source(), blankItem, hLISTING.itemName, fn.value());
215 }
216
217 private void addItemUrl(HTMLDocument item, Resource blankItem) throws ExtractionException {
218 TextField url = item.getSingularUrlField("url");
219 conditionallyAddResourceProperty(blankItem, hLISTING.itemUrl, getHTMLDocument().resolveIRI(url.value()));
220 }
221
222 private void addItemPhoto(HTMLDocument doc, Resource blankLister) throws ExtractionException {
223
224 String url = doc.findMicroformattedValue("*", "item", "A", "photo", "@href");
225 conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveIRI(url));
226 url = doc.findMicroformattedValue("*", "item", "IMG", "photo", "@src");
227 conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveIRI(url));
228
229 url = doc.findMicroformattedValue("*", "photo", "IMG", "", "@src");
230 conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveIRI(url));
231 }
232
233 private List<String> findActions(HTMLDocument doc) {
234 List<String> actions = new ArrayList<String>(0);
235
236 String[] classes = doc.readAttribute("class").split("\\s+");
237 for (String klass : classes) {
238 if (ActionClasses.contains(klass))
239 actions.add(klass);
240 }
241
242 for (Node action : doc.findAll("./*[@class]/@class")) {
243 for (String substring : action.getNodeValue().split("\\s+")) {
244 if (ActionClasses.contains(substring))
245 actions.add(substring);
246 }
247 }
248 return actions;
249 }
250
251 }