1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.ExtractorDescription;
25 import org.apache.any23.extractor.IssueReport;
26 import org.apache.any23.extractor.TagSoupExtractionResult;
27 import org.apache.any23.extractor.html.annotations.Includes;
28 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
29 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
30 import org.eclipse.rdf4j.model.BNode;
31 import org.eclipse.rdf4j.model.Literal;
32 import org.eclipse.rdf4j.model.Resource;
33 import org.eclipse.rdf4j.model.IRI;
34 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
35 import org.w3c.dom.Document;
36 import org.w3c.dom.Node;
37
38 import java.io.IOException;
39 import java.util.Locale;
40
41
42
43
44 public abstract class MicroformatExtractor implements TagSoupDOMExtractor {
45
46 public static final String BEGIN_SCRIPT = "<script>";
47 public static final String END_SCRIPT = "</script>";
48
49 private HTMLDocument htmlDocument;
50
51 private ExtractionContext context;
52
53 private IRI documentIRI;
54
55 private ExtractionResult out;
56
57 protected final Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(
58 SimpleValueFactory.getInstance());
59
60
61
62
63
64
65 public abstract ExtractorDescription getDescription();
66
67
68
69
70
71
72
73
74
75
76
77 protected abstract boolean extract() throws ExtractionException;
78
79 public HTMLDocument getHTMLDocument() {
80 return htmlDocument;
81 }
82
83 public ExtractionContext getExtractionContext() {
84 return context;
85 }
86
87 public IRI getDocumentIRI() {
88 return documentIRI;
89 }
90
91 public final void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
92 ExtractionResult out) throws IOException, ExtractionException {
93 this.htmlDocument = new HTMLDocument(in);
94 this.context = extractionContext;
95 this.documentIRI = extractionContext.getDocumentIRI();
96 this.out = out;
97 valueFactory.setIssueReport(out);
98 try {
99 extract();
100 } finally {
101 valueFactory.setIssueReport(null);
102 }
103 }
104
105
106
107
108
109
110 protected ExtractionResult getCurrentExtractionResult() {
111 return out;
112 }
113
114 protected void setCurrentExtractionResult(ExtractionResult out) {
115 this.out = out;
116 }
117
118 protected ExtractionResult openSubResult(ExtractionContext context) {
119 return out.openSubResult(context);
120 }
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136 protected boolean conditionallyAddStringProperty(Node n, Resource subject, IRI p, String value) {
137 if (value == null)
138 return false;
139 value = value.trim();
140 return value.length() > 0 && conditionallyAddLiteralProperty(n, subject, p, valueFactory.createLiteral(value));
141 }
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157 protected boolean conditionallyAddLiteralProperty(Node n, Resource subject, IRI property, Literal literal) {
158 final String literalStr = literal.stringValue();
159 if (containsScriptBlock(literalStr)) {
160 out.notifyIssue(IssueReport.IssueLevel.WARNING,
161 String.format(Locale.ROOT, "Detected script in literal: [%s]", literalStr), -1, -1);
162 return false;
163 }
164 out.writeTriple(subject, property, literal);
165 TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
166 tser.addPropertyPath(this.getClass(), subject, property, null, DomUtils.getXPathListForNode(n));
167 return true;
168 }
169
170
171
172
173
174
175
176
177
178
179
180
181
182 protected boolean conditionallyAddResourceProperty(Resource subject, IRI property, IRI uri) {
183 if (uri == null)
184 return false;
185 out.writeTriple(subject, property, uri);
186 return true;
187 }
188
189
190
191
192
193
194
195
196
197
198
199
200
201 protected void addBNodeProperty(Node n, Resource subject, IRI property, BNode bnode) {
202 out.writeTriple(subject, property, bnode);
203 TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
204 tser.addPropertyPath(this.getClass(), subject, property, bnode, DomUtils.getXPathListForNode(n));
205 }
206
207
208
209
210
211
212
213
214
215
216
217 protected void addBNodeProperty(Resource subject, IRI property, BNode bnode) {
218 out.writeTriple(subject, property, bnode);
219 }
220
221
222
223
224
225
226
227
228
229
230
231 protected void addIRIProperty(Resource subject, IRI property, IRI object) {
232 out.writeTriple(subject, property, object);
233 }
234
235 protected IRI fixLink(String link) {
236 return valueFactory.fixLink(link, null);
237 }
238
239 protected IRI fixLink(String link, String defaultSchema) {
240 return valueFactory.fixLink(link, defaultSchema);
241 }
242
243 private boolean containsScriptBlock(String in) {
244 final String inLowerCase = in.toLowerCase(Locale.ROOT);
245 final int beginBlock = inLowerCase.indexOf(BEGIN_SCRIPT);
246 if (beginBlock == -1) {
247 return false;
248 }
249 return inLowerCase.indexOf(END_SCRIPT, beginBlock + BEGIN_SCRIPT.length()) != -1;
250 }
251
252
253
254
255
256
257
258
259
260
261
262
263
264 public static boolean includes(Class<? extends MicroformatExtractor> including,
265 Class<? extends MicroformatExtractor> included) {
266 Includes includes = including.getAnnotation(Includes.class);
267 if (includes != null) {
268 Class<? extends MicroformatExtractor>[] extractors = includes.extractors();
269 if (extractors != null && extractors.length > 0) {
270 for (Class<? extends MicroformatExtractor> extractor : extractors) {
271 if (extractor.equals(included)) {
272 return true;
273 }
274 }
275 }
276 }
277 return false;
278 }
279
280 }