1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.microdata;
19
20 import org.apache.any23.Any23;
21 import org.apache.any23.Any23OnlineTestBase;
22 import org.apache.any23.configuration.DefaultConfiguration;
23 import org.apache.any23.configuration.ModifiableConfiguration;
24 import org.apache.any23.extractor.ExtractionException;
25 import org.apache.any23.extractor.ExtractorFactory;
26 import org.apache.any23.extractor.IssueReport;
27 import org.apache.any23.extractor.html.AbstractExtractorTestCase;
28 import org.apache.any23.extractor.rdf.TurtleExtractorFactory;
29 import org.apache.any23.rdf.RDFUtils;
30 import org.apache.any23.source.DocumentSource;
31 import org.apache.any23.source.HTTPDocumentSource;
32 import org.apache.any23.writer.TripleWriterHandler;
33 import org.eclipse.rdf4j.model.IRI;
34 import org.eclipse.rdf4j.model.Model;
35 import org.eclipse.rdf4j.model.Value;
36 import org.eclipse.rdf4j.model.Literal;
37 import org.eclipse.rdf4j.model.Resource;
38 import org.eclipse.rdf4j.model.impl.TreeModel;
39 import org.eclipse.rdf4j.model.util.Models;
40 import org.eclipse.rdf4j.model.vocabulary.RDF;
41 import org.eclipse.rdf4j.model.vocabulary.RDFS;
42 import org.slf4j.Logger;
43 import org.slf4j.LoggerFactory;
44 import org.junit.Assert;
45 import org.junit.Test;
46 import org.eclipse.rdf4j.model.BNode;
47 import org.eclipse.rdf4j.model.Statement;
48 import org.eclipse.rdf4j.repository.RepositoryException;
49 import org.eclipse.rdf4j.rio.RDFFormat;
50 import org.eclipse.rdf4j.rio.RDFHandler;
51 import org.eclipse.rdf4j.rio.RDFHandlerException;
52 import org.eclipse.rdf4j.rio.RDFParseException;
53 import org.eclipse.rdf4j.rio.RDFParser;
54 import org.eclipse.rdf4j.rio.Rio;
55
56 import java.io.File;
57 import java.io.FileReader;
58 import java.io.IOException;
59 import java.nio.charset.StandardCharsets;
60 import java.util.ArrayDeque;
61 import java.util.ArrayList;
62 import java.util.Arrays;
63 import java.util.Collections;
64 import java.util.HashMap;
65 import java.util.List;
66 import java.util.Map;
67 import java.util.TreeMap;
68 import java.util.concurrent.atomic.AtomicInteger;
69
70
71
72
73
74
75 public class MicrodataExtractorTest extends AbstractExtractorTestCase {
76
77 private static final Logger logger = LoggerFactory.getLogger(MicrodataExtractorTest.class);
78
79 @Override
80 protected ExtractorFactory<?> getExtractorFactory() {
81 return new MicrodataExtractorFactory();
82 }
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98 @Test
99 public void testSchemaOrgNestedProps()
100 throws RepositoryException, RDFHandlerException, IOException, RDFParseException, ExtractionException {
101 extractAndVerifyAgainstNQuads("microdata-nested.html", "microdata-nested-expected.nquads");
102 logger.debug(dumpModelToNQuads());
103 }
104
105 @Test
106 public void testUnusedItemprop() {
107
108 assertExtract("/microdata/unused-itemprop.html");
109 assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Offer"));
110 }
111
112 @Test
113 public void testExample2() {
114
115 assertExtract("/microdata/example2.html");
116 assertContains(null, RDF.TYPE, RDFUtils.iri("http://microformats.org/profile/hcard"));
117 assertContains(null, RDFUtils.iri("http://microformats.org/profile/hcard#given-name"), (Value) null);
118 assertContains(null, RDFUtils.iri("http://microformats.org/profile/hcard#n"), (Value) null);
119 }
120
121 @Test
122 public void testExample5() {
123
124 assertExtract("/microdata/example5.html");
125 assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Person"));
126 assertContains(null, RDF.TYPE, RDFUtils.iri("http://xmlns.com/foaf/0.1/Person"));
127 assertContains(null, RDFUtils.iri("http://schema.org/additionalType"),
128 RDFUtils.iri("http://xmlns.com/foaf/0.1/Person"));
129 assertContains(null, RDFUtils.iri("http://schema.org/email"), RDFUtils.iri("mailto:mail@gmail.com"));
130 assertContains(null, RDFUtils.iri("http://xmlns.com/foaf/0.1/mbox"), RDFUtils.iri("mailto:mail@gmail.com"));
131 }
132
133 private static final List<String> ignoredOnlineTestNames = Arrays.asList("Test 0073",
134
135 "Test 0074"
136 );
137
138 private static Any23 createRunner(String extractorName) {
139 ModifiableConfiguration config = DefaultConfiguration.copy();
140 config.setProperty("any23.microdata.strict", DefaultConfiguration.FLAG_PROPERTY_ON);
141 Any23 runner = new Any23(config, extractorName);
142 runner.setHTTPUserAgent("apache-any23-test-user-agent");
143 return runner;
144 }
145
146 @Test
147 public void runOnlineTests() throws Exception {
148
149 Any23OnlineTestBase.assumeOnlineAllowed();
150
151 Any23 ttlRunner = createRunner(TurtleExtractorFactory.NAME);
152 DocumentSource source = new HTTPDocumentSource(ttlRunner.getHTTPClient(),
153 "https://w3c.github.io/microdata-rdf/tests/manifest.ttl");
154 HashMap<Resource, HashMap<IRI, ArrayDeque<Value>>> map = new HashMap<>(256);
155 ttlRunner.extract(source, new TripleWriterHandler() {
156 public void writeTriple(Resource s, IRI p, Value o, Resource g) {
157 map.computeIfAbsent(s, k -> new HashMap<>()).computeIfAbsent(p, k -> new ArrayDeque<>()).add(o);
158 }
159
160 public void writeNamespace(String prefix, String uri) {
161 }
162
163 public void close() {
164 }
165 });
166
167 Assert.assertFalse(map.isEmpty());
168
169 final IRI actionPred = RDFUtils.iri("http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#action");
170 final IRI resultPred = RDFUtils.iri("http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#result");
171 final IRI namePred = RDFUtils.iri("http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#name");
172
173 AtomicInteger passedTests = new AtomicInteger();
174 AtomicInteger ignoredTests = new AtomicInteger();
175 Map<String, String> failedTests = Collections.synchronizedMap(new TreeMap<>());
176
177 map.values().parallelStream().forEach(item -> {
178 ArrayDeque<Value> types = item.get(RDF.TYPE);
179 if (types == null) {
180 return;
181 }
182 boolean positive;
183 label: {
184 for (Value type : types) {
185 if (type.stringValue().startsWith("http://www.w3.org/ns/rdftest#TestMicrodataNegative")) {
186 positive = false;
187 break label;
188 } else if (type.stringValue().startsWith("http://www.w3.org/ns/rdftest#TestMicrodata")) {
189 positive = true;
190 break label;
191 }
192 }
193 return;
194 }
195 IRI action = (IRI) item.get(actionPred).pop();
196 IRI result = (IRI) (item.containsKey(resultPred) ? item.get(resultPred).pop() : null);
197 String name = ((Literal) item.get(namePred).pop()).getLabel();
198 if (ignoredOnlineTestNames.contains(name)) {
199 ignoredTests.incrementAndGet();
200 return;
201 }
202 try {
203 name += ": " + ((Literal) item.get(RDFS.COMMENT).pop()).getLabel();
204 TreeModel actual = new TreeModel();
205 createRunner(MicrodataExtractorFactory.NAME).extract(action.stringValue(), new TripleWriterHandler() {
206 public void writeTriple(Resource s, IRI p, Value o, Resource g) {
207 if (MicrodataExtractor.MICRODATA_ITEM.equals(p))
208 return;
209 actual.add(s, p, o);
210 }
211
212 public void writeNamespace(String prefix, String uri) {
213 }
214
215 public void close() {
216 }
217 });
218
219 TreeModel expected = new TreeModel();
220 if (result != null) {
221 createRunner(TurtleExtractorFactory.NAME).extract(result.stringValue(), new TripleWriterHandler() {
222 public void writeTriple(Resource s, IRI p, Value o, Resource g) {
223 if (o instanceof IRI
224 && o.stringValue().equals("http://w3c.github.io/author/jd_salinger.html")) {
225 o = RDFUtils.iri("https://w3c.github.io/author/jd_salinger.html");
226 }
227
228 expected.add(s, p, o);
229 }
230
231 public void writeNamespace(String prefix, String uri) {
232 }
233
234 public void close() {
235 }
236 });
237 }
238
239 boolean testPassed = positive == Models.isomorphic(expected, actual);
240 if (testPassed) {
241 passedTests.incrementAndGet();
242 } else {
243 StringBuilder error = new StringBuilder("\n" + name + "\n");
244 error.append(action).append(positive ? " ==> " : " =/=> ").append(result).append("\n");
245
246 HashMap<Value, String> m = new HashMap<>();
247 AtomicInteger i = new AtomicInteger();
248 int match = 0;
249 for (Statement st : expected) {
250 Resource s = st.getSubject();
251 Value o = st.getObject();
252
253 if (actual.stream().noneMatch(t -> st.getPredicate().equals(t.getPredicate())
254 && (s instanceof BNode ? t.getSubject() instanceof BNode : s.equals(t.getSubject()))
255 && (o instanceof BNode ? t.getObject() instanceof BNode : o.equals(t.getObject())))) {
256 if (positive) {
257 Object sstr = s instanceof BNode ? m.computeIfAbsent(s, k -> "_:" + i.getAndIncrement())
258 : s;
259 Object ostr = o instanceof BNode ? m.computeIfAbsent(o, k -> "_:" + i.getAndIncrement())
260 : o;
261 error.append("EXPECT: ").append(sstr).append(" ").append(st.getPredicate()).append(" ")
262 .append(ostr).append("\n");
263 }
264 } else {
265 match++;
266 }
267 }
268 error.append("...").append(match).append(" statements in common...\n");
269
270 for (Statement st : actual) {
271 Resource s = st.getSubject();
272 Value o = st.getObject();
273
274 if (expected.stream().noneMatch(t -> st.getPredicate().equals(t.getPredicate())
275 && (s instanceof BNode ? t.getSubject() instanceof BNode : s.equals(t.getSubject()))
276 && (o instanceof BNode ? t.getObject() instanceof BNode : o.equals(t.getObject())))) {
277 if (positive) {
278 Object sstr = s instanceof BNode ? m.computeIfAbsent(s, k -> "_:" + i.getAndIncrement())
279 : s;
280 Object ostr = o instanceof BNode ? m.computeIfAbsent(o, k -> "_:" + i.getAndIncrement())
281 : o;
282 error.append("ACTUAL: ").append(sstr).append(" ").append(st.getPredicate()).append(" ")
283 .append(ostr).append("\n");
284 }
285 }
286 }
287
288 failedTests.put(name, error.toString());
289 }
290 } catch (Exception e) {
291 failedTests.put(name, "\n" + e.toString() + "\n");
292 }
293 });
294
295 if (logger.isDebugEnabled()) {
296 logger.debug("passed=" + passedTests.get() + "; ignored=" + ignoredTests.get());
297 }
298
299 if (!failedTests.isEmpty()) {
300 Assert.fail(failedTests.size() + " failures out of " + (failedTests.size() + passedTests.get())
301 + " total tests\n" + String.join("\n", failedTests.keySet()) + "\n\n"
302 + String.join("\n", failedTests.values()));
303 }
304 }
305
306 @Test
307 public void testMicrodataBasic() {
308 assertExtract("/microdata/microdata-basic.html");
309 assertModelNotEmpty();
310 assertStatementsSize(null, null, null, 40);
311 assertStatementsSize(RDFUtils.iri("urn:isbn:0-330-34032-8"), null, null, 4);
312 }
313
314 @Test
315 public void testMicrodataMissingScheme() {
316 assertExtract("/microdata/microdata-missing-scheme.html");
317 assertModelNotEmpty();
318 assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Answer"));
319 }
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335 @Test
336 public void testMicrodataGoogleRichSnippet()
337 throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
338 extractAndVerifyAgainstNQuads("microdata-richsnippet.html", "microdata-richsnippet-expected.nquads");
339 logger.debug(dumpHumanReadableTriples());
340 }
341
342
343
344
345
346
347
348
349
350
351
352
353
354 @Test
355 public void testExample5221() throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
356 extractAndVerifyAgainstNQuads("5.2.1-non-normative-example-1.html",
357 "5.2.1-non-normative-example-1-expected.nquads");
358 logger.debug(dumpHumanReadableTriples());
359 }
360
361
362
363
364
365
366
367
368
369
370
371
372
373 @Test
374 public void testExample5222() throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
375 extractAndVerifyAgainstNQuads("5.2.1-non-normative-example-2.html",
376 "5.2.1-non-normative-example-2-expected.nquads");
377 logger.debug(dumpHumanReadableTriples());
378 }
379
380
381
382
383
384
385
386
387
388
389
390
391
392 @Test
393 public void testExampleSchemaOrg1()
394 throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
395 extractAndVerifyAgainstNQuads("schemaorg-example-1.html", "schemaorg-example-1-expected.nquads");
396 logger.debug(dumpHumanReadableTriples());
397 }
398
399
400
401
402
403
404
405
406
407
408
409
410
411 @Test
412 public void testExampleSchemaOrg2()
413 throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
414 extractAndVerifyAgainstNQuads("schemaorg-example-2.html", "schemaorg-example-2-expected.nquads");
415 logger.debug(dumpHumanReadableTriples());
416 }
417
418 @Test
419 public void testMicrodataNestedUrlResolving() throws IOException {
420 IRI oldBaseIRI = baseIRI;
421 try {
422 baseIRI = RDFUtils.iri("https://ruben.verborgh.org/tmp/schemaorg-test.html");
423 extractAndVerifyAgainstNQuads("microdata-nested-url-resolving.html",
424 "microdata-nested-url-resolving-expected.nquads");
425 } finally {
426 baseIRI = oldBaseIRI;
427 }
428 }
429
430 @Test
431 public void testTel() {
432 assertExtract("/microdata/tel-test.html");
433 assertModelNotEmpty();
434 assertContains(RDFUtils.iri("http://schema.org/telephone"), RDFUtils.iri("tel:(909)%20484-2020"));
435 }
436
437 @Test
438 public void testBadTypes() throws IOException {
439 extractAndVerifyAgainstNQuads("microdata-bad-types.html", "microdata-bad-types-expected.nquads");
440 }
441
442 @Test
443 public void testBadPropertyNames() throws IOException {
444 extractAndVerifyAgainstNQuads("microdata-bad-properties.html", "microdata-bad-properties-expected.nquads",
445 false);
446 assertIssue(IssueReport.IssueLevel.ERROR,
447 ".*invalid property name ''.*\"path\" : \"/HTML\\[1\\]/BODY\\[1\\]/DIV\\[1\\]/DIV\\[2\\]/DIV\\[1\\]\".*");
448 }
449
450 private void extractAndVerifyAgainstNQuads(String actual, String expected)
451 throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
452 extractAndVerifyAgainstNQuads(actual, expected, true);
453 }
454
455 private void extractAndVerifyAgainstNQuads(String actual, String expected, boolean assertNoIssues)
456 throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
457 assertExtract("/microdata/" + actual, assertNoIssues);
458 assertModelNotEmpty();
459 logger.debug(dumpModelToNQuads());
460 List<Statement> expectedStatements = loadResultStatement("/microdata/" + expected);
461 int actualStmtSize = getStatementsSize(null, null, null);
462 Assert.assertEquals(expectedStatements.size(), actualStmtSize);
463 for (Statement statement : expectedStatements) {
464 assertContains(statement.getSubject() instanceof BNode ? null : statement.getSubject(),
465 statement.getPredicate(), statement.getObject() instanceof BNode ? null : statement.getObject());
466 }
467 Model expectedModel = new TreeModel();
468 for (Statement s : expectedStatements) {
469 expectedModel.add(s.getSubject(), s.getPredicate(), s.getObject());
470 }
471
472 Model actualModel = new TreeModel();
473 conn.export(new RDFHandler() {
474 @Override
475 public void startRDF() throws RDFHandlerException {
476 }
477
478 @Override
479 public void endRDF() throws RDFHandlerException {
480 }
481
482 @Override
483 public void handleNamespace(String s, String s1) throws RDFHandlerException {
484 }
485
486 @Override
487 public void handleStatement(Statement statement) throws RDFHandlerException {
488 actualModel.add(statement.getSubject(), statement.getPredicate(), statement.getObject());
489 }
490
491 @Override
492 public void handleComment(String s) throws RDFHandlerException {
493 }
494 });
495
496 Assert.assertTrue("Models are not isomorphic", Models.isomorphic(expectedModel, actualModel));
497 }
498
499 private List<Statement> loadResultStatement(String resultFilePath)
500 throws RDFHandlerException, IOException, RDFParseException {
501 RDFParser nQuadsParser = Rio.createParser(RDFFormat.NQUADS);
502 TestRDFHandler rdfHandler = new TestRDFHandler();
503 nQuadsParser.setRDFHandler(rdfHandler);
504 File file = copyResourceToTempFile(resultFilePath);
505 nQuadsParser.parse(new FileReader(file, StandardCharsets.UTF_8), baseIRI.stringValue());
506 return rdfHandler.getStatements();
507 }
508
509 public static class TestRDFHandler implements RDFHandler {
510
511 private final List<Statement> statements = new ArrayList<Statement>();
512
513 protected List<Statement> getStatements() {
514 return statements;
515 }
516
517 public void startRDF() throws RDFHandlerException {
518 }
519
520 public void endRDF() throws RDFHandlerException {
521 }
522
523 public void handleNamespace(String s, String s1) throws RDFHandlerException {
524 throw new UnsupportedOperationException();
525 }
526
527 public void handleStatement(Statement statement) throws RDFHandlerException {
528 statements.add(statement);
529 }
530
531 public void handleComment(String s) throws RDFHandlerException {
532 }
533 }
534
535 }