1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html.microformats2;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractionResult;
22 import org.apache.any23.extractor.ExtractorDescription;
23 import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
24 import org.apache.any23.extractor.html.HTMLDocument;
25 import org.apache.any23.vocab.HEntry;
26 import org.apache.any23.vocab.VCard;
27 import org.eclipse.rdf4j.model.BNode;
28 import org.eclipse.rdf4j.model.IRI;
29 import org.eclipse.rdf4j.model.vocabulary.RDF;
30 import org.w3c.dom.Node;
31 import org.eclipse.rdf4j.model.Resource;
32
33 import java.util.List;
34
35
36
37
38
39
40 public class HEntryExtractor extends EntityBasedMicroformatExtractor {
41
42 private static final HEntry vEntry = HEntry.getInstance();
43 private static final VCard vVCARD = VCard.getInstance();
44
45 private static final String[] entryFields = { "name", "summary", "content", "published", "updated", "category",
46 "url", "uid", "syndication", "in-reply-to", "author", "location",
47
48 };
49
50 private static final String[] geoFields = { "latitude", "longitude", "altitude" };
51
52 @Override
53 public ExtractorDescription getDescription() {
54 return HEntryExtractorFactory.getDescriptionInstance();
55 }
56
57 @Override
58 protected String getBaseClassName() {
59 return Microformats2Prefixes.CLASS_PREFIX + "entry";
60 }
61
62 @Override
63 protected void resetExtractor() {
64
65 }
66
67 @Override
68 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
69 final BNode entry = getBlankNodeFor(node);
70 conditionallyAddResourceProperty(entry, RDF.TYPE, vEntry.Entry);
71 final HTMLDocument fragment = new HTMLDocument(node);
72 addName(fragment, entry);
73 addSummary(fragment, entry);
74 addContent(fragment, entry);
75 addPublished(fragment, entry);
76 addUpdated(fragment, entry);
77 addCategories(fragment, entry);
78 addURLs(fragment, entry);
79 addUID(fragment, entry);
80 addSyndications(fragment, entry);
81 addInReplyTo(fragment, entry);
82 addLocations(fragment, entry);
83 addAuthors(fragment, entry);
84 return true;
85 }
86
87 private void addAuthors(HTMLDocument doc, Resource entry) throws ExtractionException {
88 List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + entryFields[10]
89 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card");
90 if (nodes.isEmpty())
91 return;
92 HCardExtractorFactory factory = new HCardExtractorFactory();
93 HCardExtractor extractor = factory.createExtractor();
94 for (Node node : nodes) {
95 BNode author = valueFactory.createBNode();
96 addIRIProperty(author, RDF.TYPE, vEntry.author);
97 extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), author, getCurrentExtractionResult());
98 }
99 }
100
101 private void mapFieldWithProperty(HTMLDocument fragment, BNode entry, String fieldClass, IRI property) {
102 HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
103 conditionallyAddStringProperty(title.source(), entry, property, title.value());
104 }
105
106 private void addName(HTMLDocument fragment, BNode entry) {
107 mapFieldWithProperty(fragment, entry, Microformats2Prefixes.PROPERTY_PREFIX + entryFields[0], vEntry.name);
108 }
109
110 private void addSummary(HTMLDocument fragment, BNode entry) {
111 mapFieldWithProperty(fragment, entry, Microformats2Prefixes.PROPERTY_PREFIX + entryFields[1], vEntry.summary);
112 }
113
114 private void addContent(HTMLDocument fragment, BNode entry) {
115 mapFieldWithProperty(fragment, entry, Microformats2Prefixes.EMBEDDED_PROPERTY_PREFIX + entryFields[2],
116 vEntry.content);
117 }
118
119 private void addPublished(HTMLDocument fragment, BNode entry) {
120 final HTMLDocument.TextField[] durations = fragment
121 .getPluralTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + entryFields[3]);
122 for (HTMLDocument.TextField duration : durations) {
123 Node attribute = duration.source().getAttributes().getNamedItem("datetime");
124 if (attribute == null) {
125 conditionallyAddStringProperty(duration.source(), entry, vEntry.published, duration.value());
126 } else {
127 conditionallyAddStringProperty(duration.source(), entry, vEntry.published, attribute.getNodeValue());
128 }
129 }
130 }
131
132 private void addUpdated(HTMLDocument fragment, BNode entry) {
133 final HTMLDocument.TextField[] durations = fragment
134 .getPluralTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + entryFields[4]);
135 for (HTMLDocument.TextField duration : durations) {
136 Node attribute = duration.source().getAttributes().getNamedItem("datetime");
137 if (attribute == null) {
138 conditionallyAddStringProperty(duration.source(), entry, vEntry.updated, duration.value());
139 } else {
140 conditionallyAddStringProperty(duration.source(), entry, vEntry.updated, attribute.getNodeValue());
141 }
142 }
143 }
144
145 private void addCategories(HTMLDocument fragment, BNode entry) {
146 final HTMLDocument.TextField[] categories = fragment
147 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + entryFields[5]);
148 for (HTMLDocument.TextField category : categories) {
149 conditionallyAddStringProperty(category.source(), entry, vEntry.category, category.value());
150 }
151 }
152
153 private void addURLs(HTMLDocument fragment, BNode entry) throws ExtractionException {
154 final HTMLDocument.TextField[] urls = fragment
155 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[6]);
156 for (HTMLDocument.TextField url : urls) {
157 addIRIProperty(entry, vEntry.url, fragment.resolveIRI(url.value()));
158 }
159 }
160
161 private void addUID(HTMLDocument fragment, BNode entry) throws ExtractionException {
162 final HTMLDocument.TextField uid = fragment
163 .getSingularTextField(Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[7]);
164 if (uid.source() == null)
165 return;
166 addIRIProperty(entry, vEntry.uid, fragment.resolveIRI(uid.value()));
167 }
168
169 private void addSyndications(HTMLDocument fragment, BNode entry) throws ExtractionException {
170 final HTMLDocument.TextField[] syndications = fragment
171 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[8]);
172 for (HTMLDocument.TextField syndication : syndications) {
173 addIRIProperty(entry, vEntry.syndication, fragment.resolveIRI(syndication.value()));
174 }
175 }
176
177 private void addInReplyTo(HTMLDocument fragment, BNode entry) throws ExtractionException {
178 final HTMLDocument.TextField inReplyTo = fragment
179 .getSingularTextField(Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[9]);
180 if (inReplyTo.source() == null)
181 return;
182 addIRIProperty(entry, vEntry.in_reply_to, fragment.resolveIRI(inReplyTo.value()));
183 }
184
185 private void addLocations(HTMLDocument doc, Resource entry) throws ExtractionException {
186 List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + entryFields[11]
187 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "geo");
188 if (nodes.isEmpty())
189 return;
190 for (Node node : nodes) {
191 BNode location = valueFactory.createBNode();
192 addIRIProperty(location, RDF.TYPE, vEntry.location);
193 HTMLDocument fragment = new HTMLDocument(node);
194 for (String field : geoFields) {
195 HTMLDocument.TextField[] values = fragment
196 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + field);
197 for (HTMLDocument.TextField val : values) {
198 Node attribute = val.source().getAttributes().getNamedItem("title");
199 if (attribute == null) {
200 conditionallyAddStringProperty(val.source(), location, vVCARD.getProperty(field), val.value());
201 } else {
202 conditionallyAddStringProperty(val.source(), location, vVCARD.getProperty(field),
203 attribute.getNodeValue());
204 }
205 }
206 }
207 }
208 }
209 }