1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractorDescription;
22 import org.apache.any23.extractor.TagSoupExtractionResult;
23 import org.apache.any23.rdf.RDFUtils;
24 import org.apache.any23.vocab.ICAL;
25 import org.eclipse.rdf4j.model.BNode;
26 import org.eclipse.rdf4j.model.Resource;
27 import org.eclipse.rdf4j.model.IRI;
28 import org.eclipse.rdf4j.model.vocabulary.RDF;
29 import org.w3c.dom.Node;
30
31 import javax.xml.datatype.DatatypeConfigurationException;
32 import java.text.ParseException;
33 import java.util.List;
34
35 import static org.apache.any23.extractor.html.HTMLDocument.TextField;
36
37
38
39
40
41
42 public class HCalendarExtractor extends MicroformatExtractor {
43
44 private static final ICAL vICAL = ICAL.getInstance();
45
46 private static final String[] Components = { "Vevent", "Vtodo", "Vjournal", "Vfreebusy" };
47
48 private static final String DATE_FORMAT = "yyyyMMdd'T'HHmm'Z'";
49
50 private String[] textSingularProps = { "summary", "class", "transp", "description", "status", "location" };
51
52 private String[] textDateProps = { "dtstart", "dtstamp", "dtend", };
53
54 @Override
55 public ExtractorDescription getDescription() {
56 return HCalendarExtractorFactory.getDescriptionInstance();
57 }
58
59 @Override
60 protected boolean extract() throws ExtractionException {
61 final HTMLDocument document = getHTMLDocument();
62 List<Node> calendars = document.findAllByClassName("vcalendar");
63 if (calendars.size() == 0)
64
65
66 if (document.findAllByClassName("vevent").size() > 0)
67 calendars.add(document.getDocument());
68
69 boolean foundAny = false;
70 for (Node node : calendars)
71 foundAny |= extractCalendar(node);
72
73 return foundAny;
74 }
75
76 private boolean extractCalendar(Node node) throws ExtractionException {
77 IRI cal = getDocumentIRI();
78 addIRIProperty(cal, RDF.TYPE, vICAL.Vcalendar);
79 return addComponents(node, cal);
80 }
81
82 private boolean addComponents(Node node, Resource cal) throws ExtractionException {
83 boolean foundAny = false;
84 for (String component : Components) {
85 List<Node> events = DomUtils.findAllByClassName(node, component);
86 if (events.size() == 0)
87 continue;
88 for (Node evtNode : events)
89 foundAny |= extractComponent(evtNode, cal, component);
90 }
91 return foundAny;
92 }
93
94 private boolean extractComponent(Node node, Resource cal, String component) throws ExtractionException {
95 HTMLDocument compoNode = new HTMLDocument(node);
96 BNode evt = valueFactory.createBNode();
97 addIRIProperty(evt, RDF.TYPE, vICAL.getClass(component));
98 addTextProps(compoNode, evt);
99 addUrl(compoNode, evt);
100 addRRule(compoNode, evt);
101 addOrganizer(compoNode, evt);
102 addUid(compoNode, evt);
103 addBNodeProperty(cal, vICAL.component, evt);
104
105 final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
106 tser.addResourceRoot(compoNode.getPathToLocalRoot(), evt, this.getClass());
107
108 return true;
109 }
110
111 private void addUid(HTMLDocument compoNode, Resource evt) {
112 TextField url = compoNode.getSingularUrlField("uid");
113 conditionallyAddStringProperty(compoNode.getDocument(), evt, vICAL.uid, url.value());
114 }
115
116 private void addUrl(HTMLDocument compoNode, Resource evt) throws ExtractionException {
117 TextField url = compoNode.getSingularUrlField("url");
118 if ("".equals(url.value()))
119 return;
120 addIRIProperty(evt, vICAL.url, getHTMLDocument().resolveIRI(url.value()));
121 }
122
123 private void addRRule(HTMLDocument compoNode, Resource evt) {
124 for (Node rule : compoNode.findAllByClassName("rrule")) {
125 BNode rrule = valueFactory.createBNode();
126 addIRIProperty(rrule, RDF.TYPE, vICAL.DomainOf_rrule);
127 TextField freq = new HTMLDocument(rule).getSingularTextField("freq");
128 conditionallyAddStringProperty(freq.source(), rrule, vICAL.freq, freq.value());
129 addBNodeProperty(rule, evt, vICAL.rrule, rrule);
130 }
131 }
132
133 private void addOrganizer(HTMLDocument compoNode, Resource evt) {
134 for (Node organizer : compoNode.findAllByClassName("organizer")) {
135
136 BNode blank = valueFactory.createBNode();
137 TextField mail = new HTMLDocument(organizer).getSingularUrlField("organizer");
138 conditionallyAddStringProperty(compoNode.getDocument(), blank, vICAL.calAddress, mail.value());
139 addBNodeProperty(organizer, evt, vICAL.organizer, blank);
140 }
141 }
142
143 private void addTextProps(HTMLDocument node, Resource evt) {
144 for (String date : textSingularProps) {
145 HTMLDocument.TextField val = node.getSingularTextField(date);
146 conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
147 }
148
149 for (String date : textDateProps) {
150 HTMLDocument.TextField val = node.getSingularTextField(date);
151 try {
152 conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date),
153 RDFUtils.getXSDDate(val.value(), DATE_FORMAT));
154 } catch (ParseException e) {
155
156 conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
157 } catch (DatatypeConfigurationException e) {
158
159 conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
160 }
161 }
162
163 HTMLDocument.TextField[] values = node.getPluralTextField("category");
164 for (TextField val : values) {
165 conditionallyAddStringProperty(val.source(), evt, vICAL.categories, val.value());
166 }
167 }
168
169 }