1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html.microformats2;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractionResult;
22 import org.apache.any23.extractor.ExtractorDescription;
23 import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
24 import org.apache.any23.vocab.HEvent;
25 import org.apache.any23.vocab.VCard;
26 import org.eclipse.rdf4j.model.BNode;
27 import org.eclipse.rdf4j.model.Resource;
28 import org.eclipse.rdf4j.model.IRI;
29 import org.eclipse.rdf4j.model.vocabulary.RDF;
30 import org.w3c.dom.Node;
31 import org.apache.any23.extractor.html.HTMLDocument;
32
33 import java.util.List;
34
35 import static org.apache.any23.extractor.html.HTMLDocument.TextField;
36
37
38
39
40
41
42 public class HEventExtractor extends EntityBasedMicroformatExtractor {
43
44 private static final HEvent vEvent = HEvent.getInstance();
45 private static final VCard vVCARD = VCard.getInstance();
46
47 private String[] eventFields = { "name", "summary", "start", "end", "duration", "description", "url", "category",
48 "location", "attendee" };
49
50 private static final String[] geoFields = { "latitude", "longitude", "altitude" };
51
52 @Override
53 public ExtractorDescription getDescription() {
54 return HEventExtractorFactory.getDescriptionInstance();
55 }
56
57 @Override
58 protected String getBaseClassName() {
59 return Microformats2Prefixes.CLASS_PREFIX + "event";
60 }
61
62 @Override
63 protected void resetExtractor() {
64
65 }
66
67 @Override
68 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
69 final BNode event = getBlankNodeFor(node);
70 conditionallyAddResourceProperty(event, RDF.TYPE, vEvent.event);
71 final HTMLDocument fragment = new HTMLDocument(node);
72 addName(fragment, event);
73 addSummary(fragment, event);
74 addStart(fragment, event);
75 addEnd(fragment, event);
76 addDuration(fragment, event);
77 addDescription(fragment, event);
78 addURLs(fragment, event);
79 addCategories(fragment, event);
80 addLocations(fragment, event);
81 addAttendees(fragment, event);
82 return true;
83 }
84
85 public Resource extractEntityAsEmbeddedProperty(HTMLDocument fragment, BNode event, ExtractionResult out)
86 throws ExtractionException {
87 this.setCurrentExtractionResult(out);
88 addName(fragment, event);
89 addSummary(fragment, event);
90 addStart(fragment, event);
91 addEnd(fragment, event);
92 addDuration(fragment, event);
93 addDescription(fragment, event);
94 addURLs(fragment, event);
95 addCategories(fragment, event);
96 addLocations(fragment, event);
97 addAttendees(fragment, event);
98 return event;
99 }
100
101 private void addAttendees(HTMLDocument doc, Resource entry) throws ExtractionException {
102 List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + eventFields[9]
103 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card");
104 if (nodes.isEmpty())
105 return;
106 HCardExtractorFactory factory = new HCardExtractorFactory();
107 HCardExtractor extractor = factory.createExtractor();
108 for (Node node : nodes) {
109 BNode attendee = valueFactory.createBNode();
110 addIRIProperty(attendee, RDF.TYPE, vEvent.attendee);
111 extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), attendee, getCurrentExtractionResult());
112 }
113 }
114
115 private void mapFieldWithProperty(HTMLDocument fragment, BNode recipe, String fieldClass, IRI property) {
116 HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
117 conditionallyAddStringProperty(title.source(), recipe, property, title.value());
118 }
119
120 private void addName(HTMLDocument fragment, BNode event) {
121 mapFieldWithProperty(fragment, event, Microformats2Prefixes.PROPERTY_PREFIX + eventFields[0], vEvent.name);
122 }
123
124 private void addSummary(HTMLDocument fragment, BNode event) {
125 mapFieldWithProperty(fragment, event, Microformats2Prefixes.PROPERTY_PREFIX + eventFields[1], vEvent.summary);
126 }
127
128 private void addStart(HTMLDocument fragment, BNode event) {
129 final TextField start = fragment
130 .getSingularTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + eventFields[2]);
131 if (start.source() == null)
132 return;
133 Node attribute = start.source().getAttributes().getNamedItem("datetime");
134 if (attribute == null) {
135 conditionallyAddStringProperty(start.source(), event, vEvent.start, start.value());
136 } else {
137 conditionallyAddStringProperty(start.source(), event, vEvent.start, attribute.getNodeValue());
138 }
139 }
140
141 private void addEnd(HTMLDocument fragment, BNode event) {
142 final TextField end = fragment
143 .getSingularTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + eventFields[3]);
144 if (end.source() == null)
145 return;
146 Node attribute = end.source().getAttributes().getNamedItem("datetime");
147 if (attribute == null) {
148 conditionallyAddStringProperty(end.source(), event, vEvent.end, end.value());
149 } else {
150 conditionallyAddStringProperty(end.source(), event, vEvent.end, attribute.getNodeValue());
151 }
152 }
153
154 private void addDuration(HTMLDocument fragment, BNode event) {
155 final TextField duration = fragment
156 .getSingularTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + eventFields[4]);
157 if (duration.source() == null)
158 return;
159 Node attribute = duration.source().getAttributes().getNamedItem("datetime");
160 if (attribute == null) {
161 conditionallyAddStringProperty(duration.source(), event, vEvent.duration, duration.value());
162 } else {
163 conditionallyAddStringProperty(duration.source(), event, vEvent.duration, attribute.getNodeValue());
164 }
165 }
166
167 private void addDescription(HTMLDocument fragment, BNode event) {
168 mapFieldWithProperty(fragment, event, Microformats2Prefixes.PROPERTY_PREFIX + eventFields[5],
169 vEvent.description);
170 }
171
172 private void addURLs(HTMLDocument fragment, BNode event) throws ExtractionException {
173 final HTMLDocument.TextField[] urls = fragment
174 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + eventFields[6]);
175 for (HTMLDocument.TextField url : urls) {
176 addIRIProperty(event, vEvent.url, fragment.resolveIRI(url.value()));
177 }
178 }
179
180 private void addCategories(HTMLDocument fragment, BNode event) {
181 final HTMLDocument.TextField[] categories = fragment
182 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + eventFields[7]);
183 for (HTMLDocument.TextField category : categories) {
184 conditionallyAddStringProperty(category.source(), event, vEvent.category, category.value());
185 }
186 }
187
188 private void addLocations(HTMLDocument doc, Resource entry) throws ExtractionException {
189 List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + eventFields[8]
190 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "geo");
191 if (nodes.isEmpty())
192 return;
193 for (Node node : nodes) {
194 BNode location = valueFactory.createBNode();
195 addIRIProperty(location, RDF.TYPE, vEvent.location);
196 HTMLDocument fragment = new HTMLDocument(node);
197 for (String field : geoFields) {
198 HTMLDocument.TextField[] values = fragment
199 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + field);
200 for (HTMLDocument.TextField val : values) {
201 Node attribute = val.source().getAttributes().getNamedItem("title");
202 if (attribute == null) {
203 conditionallyAddStringProperty(val.source(), location, vVCARD.getProperty(field), val.value());
204 } else {
205 conditionallyAddStringProperty(val.source(), location, vVCARD.getProperty(field),
206 attribute.getNodeValue());
207 }
208 }
209 }
210 }
211 }
212
213 }