1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor;
19
20 import org.apache.any23.AbstractAny23TestBase;
21 import org.apache.any23.configuration.DefaultConfiguration;
22 import org.apache.any23.configuration.ModifiableConfiguration;
23 import org.apache.any23.extractor.html.HTMLFixture;
24 import org.apache.any23.extractor.rdf.TriXExtractor;
25 import org.apache.any23.mime.TikaMIMETypeDetector;
26 import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
27 import org.apache.any23.vocab.ICAL;
28 import org.apache.any23.vocab.Review;
29 import org.apache.any23.vocab.SINDICE;
30 import org.apache.any23.vocab.VCard;
31 import org.apache.any23.writer.CompositeTripleHandler;
32 import org.apache.any23.writer.RDFXMLWriter;
33 import org.apache.any23.writer.RepositoryWriter;
34 import org.apache.any23.writer.TripleHandlerException;
35 import org.junit.After;
36 import org.junit.Assert;
37 import org.junit.Before;
38 import org.junit.Test;
39 import org.eclipse.rdf4j.model.Resource;
40 import org.eclipse.rdf4j.model.Statement;
41 import org.eclipse.rdf4j.model.IRI;
42 import org.eclipse.rdf4j.model.Value;
43 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
44 import org.eclipse.rdf4j.repository.RepositoryConnection;
45 import org.eclipse.rdf4j.repository.RepositoryException;
46 import org.eclipse.rdf4j.repository.RepositoryResult;
47 import org.eclipse.rdf4j.repository.sail.SailRepository;
48 import org.eclipse.rdf4j.sail.Sail;
49 import org.eclipse.rdf4j.sail.SailException;
50 import org.eclipse.rdf4j.sail.memory.MemoryStore;
51 import org.slf4j.Logger;
52 import org.slf4j.LoggerFactory;
53
54 import static org.junit.Assert.assertFalse;
55 import static org.junit.Assert.assertTrue;
56
57 import java.io.ByteArrayOutputStream;
58 import java.io.FileNotFoundException;
59 import java.io.IOException;
60 import java.nio.charset.StandardCharsets;
61 import java.util.Locale;
62
63
64
65
66
67
68
69
70 public class SingleDocumentExtractionTest extends AbstractAny23TestBase {
71
72 private static final SINDICE vSINDICE = SINDICE.getInstance();
73 private static final ICAL vICAL = ICAL.getInstance();
74 private static final Review vREVIEW = Review.getInstance();
75 private static final VCard vVCARD = VCard.getInstance();
76
77 private static final Logger logger = LoggerFactory.getLogger(SingleDocumentExtractionTest.class);
78
79 private SingleDocumentExtraction singleDocumentExtraction;
80
81 private ExtractorGroup extractorGroup;
82
83 private Sail store;
84
85 private RepositoryConnection conn;
86
87 RepositoryWriter repositoryWriter;
88
89 ByteArrayOutputStream baos;
90
91 RDFXMLWriter rdfxmlWriter;
92
93 @Before
94 public void setUp() throws Exception {
95 super.setUp();
96 extractorGroup = ExtractorRegistryImpl.getInstance().getExtractorGroup();
97 store = new MemoryStore();
98 store.init();
99 conn = new SailRepository(store).getConnection();
100 }
101
102 @After
103 public void tearDown() throws SailException, RepositoryException, TripleHandlerException {
104 rdfxmlWriter.close();
105 repositoryWriter.close();
106 logger.debug(baos.toString(StandardCharsets.UTF_8));
107
108 singleDocumentExtraction = null;
109 extractorGroup = null;
110 conn.close();
111 conn = null;
112 store.shutDown();
113 store = null;
114 }
115
116
117
118
119
120
121
122
123
124
125
126 @Test
127 public void testMicroformatDomains() throws IOException, ExtractionException, RepositoryException {
128 singleDocumentExtraction = getInstance("/microformats/microformat-domains.html");
129 singleDocumentExtraction.run();
130 logStorageContent();
131 assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 1);
132 }
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148 @Test
149 public void testNestedMicroformats() throws IOException, ExtractionException, RepositoryException {
150 singleDocumentExtraction = getInstance("/microformats/nested-microformats-a1.html");
151 singleDocumentExtraction.run();
152
153 logStorageContent();
154
155 assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 2);
156 assertTriple(vSINDICE.getProperty(SINDICE.NESTING), (Value) null);
157 assertTriple(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vICAL.summary);
158 assertTriple(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null);
159 }
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175 @Test
176 public void testNestedVCardAdr() throws IOException, ExtractionException, RepositoryException {
177 singleDocumentExtraction = getInstance("/microformats/nested-microformats-a3.html");
178 singleDocumentExtraction.run();
179
180 logStorageContent();
181
182 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), (Value) null, 0);
183 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null, 0);
184 }
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204 @Test
205 public void testNestedMicroformatsInduced() throws IOException, ExtractionException, RepositoryException {
206 singleDocumentExtraction = getInstance("/microformats/nested-microformats-a2.html");
207 singleDocumentExtraction.run();
208
209 logStorageContent();
210
211 assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 2);
212 assertTriple(vSINDICE.getProperty(SINDICE.NESTING), (Value) null);
213 assertTriple(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vICAL.summary);
214 assertTriple(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null);
215 }
216
217
218
219
220
221
222
223
224
225
226
227
228
229 @Test
230
231
232
233
234
235
236 public void testNestedMicroformatsManaged() throws IOException, ExtractionException, RepositoryException {
237 singleDocumentExtraction = getInstance("/microformats/nested-microformats-managed.html");
238 singleDocumentExtraction.run();
239
240 logStorageContent();
241
242 assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 3);
243 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING), (Value) null, 1);
244 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vREVIEW.hasReview, 1);
245
246 assertTripleCount(vVCARD.url, (Value) null, 1);
247 Value object = getTripleObject(null, vREVIEW.hasReview);
248 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), object, 1);
249 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vREVIEW.hasReview, 1);
250 }
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267 @Test
268 public void testTrixParserNotActivatedAfterFilterExtractorsByMIMEType()
269 throws IOException, ExtractionException, RepositoryException {
270 singleDocumentExtraction = getInstance("/html/BBC_News_Scotland.html");
271 assertTrue(singleDocumentExtraction.hasMatchingExtractors());
272 assertFalse(singleDocumentExtraction.getMatchingExtractors().stream()
273 .anyMatch(e -> TriXExtractor.class.isInstance(e)));
274 singleDocumentExtraction.run();
275 assertFalse(singleDocumentExtraction.getMatchingExtractors().stream()
276 .anyMatch(e -> TriXExtractor.class.isInstance(e)));
277
278 logStorageContent();
279 }
280
281 private SingleDocumentExtraction getInstance(String file) throws FileNotFoundException, IOException {
282 baos = new ByteArrayOutputStream();
283 rdfxmlWriter = new RDFXMLWriter(baos);
284 repositoryWriter = new RepositoryWriter(conn);
285
286 final CompositeTripleHandler cth = new CompositeTripleHandler();
287 cth.addChild(rdfxmlWriter);
288 cth.addChild(repositoryWriter);
289
290 final ModifiableConfiguration configuration = DefaultConfiguration.copy();
291 configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
292 SingleDocumentExtraction instance = new SingleDocumentExtraction(configuration,
293 new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"), extractorGroup, cth);
294 instance.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier()));
295 return instance;
296 }
297
298
299
300
301
302
303
304 private void logStorageContent() throws RepositoryException {
305 RepositoryResult<Statement> result = conn.getStatements(null, null, null, false);
306 while (result.hasNext()) {
307 Statement statement = result.next();
308 logger.debug(statement.toString());
309 }
310 }
311
312
313
314
315
316
317
318
319
320
321 private void assertTripleCount(IRI predicate, Value value, int occurrences) throws RepositoryException {
322 RepositoryResult<Statement> statements = conn.getStatements(null, predicate, value, false);
323 int count = 0;
324 while (statements.hasNext()) {
325 statements.next();
326 count++;
327 }
328 Assert.assertEquals(
329 String.format(Locale.ROOT, "Cannot find triple (* %s %s) %d times", predicate, value, occurrences),
330 occurrences, count);
331 }
332
333
334
335
336
337
338
339
340
341
342 private void assertTripleCount(IRI predicate, String value, int occurrences) throws RepositoryException {
343 assertTripleCount(predicate, SimpleValueFactory.getInstance().createLiteral(value), occurrences);
344 }
345
346
347
348
349
350
351
352
353
354 private void assertTriple(IRI predicate, Value value) throws RepositoryException {
355 assertTripleCount(predicate, value, 1);
356 }
357
358
359
360
361
362
363
364
365
366 @SuppressWarnings("unused")
367 private void assertTriple(IRI predicate, String value) throws RepositoryException {
368 assertTriple(predicate, SimpleValueFactory.getInstance().createLiteral(value));
369 }
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384 private Value getTripleObject(Resource sub, IRI prop) throws RepositoryException {
385 RepositoryResult<Statement> statements = conn.getStatements(sub, prop, null, false);
386 Assert.assertTrue(statements.hasNext());
387 Statement statement = statements.next();
388 Value value = statement.getObject();
389 Assert.assertFalse("Expected just one result.", statements.hasNext());
390 statements.close();
391 return value;
392 }
393
394 }