View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.microdata;
19  
20  import org.apache.any23.Any23;
21  import org.apache.any23.Any23OnlineTestBase;
22  import org.apache.any23.configuration.DefaultConfiguration;
23  import org.apache.any23.configuration.ModifiableConfiguration;
24  import org.apache.any23.extractor.ExtractionException;
25  import org.apache.any23.extractor.ExtractorFactory;
26  import org.apache.any23.extractor.IssueReport;
27  import org.apache.any23.extractor.html.AbstractExtractorTestCase;
28  import org.apache.any23.extractor.rdf.TurtleExtractorFactory;
29  import org.apache.any23.rdf.RDFUtils;
30  import org.apache.any23.source.DocumentSource;
31  import org.apache.any23.source.HTTPDocumentSource;
32  import org.apache.any23.writer.TripleWriterHandler;
33  import org.eclipse.rdf4j.model.IRI;
34  import org.eclipse.rdf4j.model.Model;
35  import org.eclipse.rdf4j.model.Value;
36  import org.eclipse.rdf4j.model.Literal;
37  import org.eclipse.rdf4j.model.Resource;
38  import org.eclipse.rdf4j.model.impl.TreeModel;
39  import org.eclipse.rdf4j.model.util.Models;
40  import org.eclipse.rdf4j.model.vocabulary.RDF;
41  import org.eclipse.rdf4j.model.vocabulary.RDFS;
42  import org.slf4j.Logger;
43  import org.slf4j.LoggerFactory;
44  import org.junit.Assert;
45  import org.junit.Test;
46  import org.eclipse.rdf4j.model.BNode;
47  import org.eclipse.rdf4j.model.Statement;
48  import org.eclipse.rdf4j.repository.RepositoryException;
49  import org.eclipse.rdf4j.rio.RDFFormat;
50  import org.eclipse.rdf4j.rio.RDFHandler;
51  import org.eclipse.rdf4j.rio.RDFHandlerException;
52  import org.eclipse.rdf4j.rio.RDFParseException;
53  import org.eclipse.rdf4j.rio.RDFParser;
54  import org.eclipse.rdf4j.rio.Rio;
55  
56  import java.io.File;
57  import java.io.FileReader;
58  import java.io.IOException;
59  import java.nio.charset.StandardCharsets;
60  import java.util.ArrayDeque;
61  import java.util.ArrayList;
62  import java.util.Arrays;
63  import java.util.Collections;
64  import java.util.HashMap;
65  import java.util.List;
66  import java.util.Map;
67  import java.util.TreeMap;
68  import java.util.concurrent.atomic.AtomicInteger;
69  
70  /**
71   * Reference test class for {@link MicrodataExtractor}.
72   *
73   * @author Davide Palmisano ( dpalmisano@gmail.com )
74   */
75  public class MicrodataExtractorTest extends AbstractExtractorTestCase {
76  
77      private static final Logger logger = LoggerFactory.getLogger(MicrodataExtractorTest.class);
78  
79      @Override
80      protected ExtractorFactory<?> getExtractorFactory() {
81          return new MicrodataExtractorFactory();
82      }
83  
84      /**
85       * Reference test for <a href="http://schema.org">Schema.org</a>.
86       *
87       * @throws ExtractionException
88       *             if an exception is raised during extraction
89       * @throws RepositoryException
90       *             if an error is encountered whilst loading content from a storage connection
91       * @throws RDFHandlerException
92       *             if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler} implementation
93       * @throws IOException
94       *             if there is an error loading input data
95       * @throws RDFParseException
96       *             if there is an error parsing an actual RDF stream
97       */
98      @Test
99      public void testSchemaOrgNestedProps()
100             throws RepositoryException, RDFHandlerException, IOException, RDFParseException, ExtractionException {
101         extractAndVerifyAgainstNQuads("microdata-nested.html", "microdata-nested-expected.nquads");
102         logger.debug(dumpModelToNQuads());
103     }
104 
105     @Test
106     public void testUnusedItemprop() {
107         // Test for ANY23-154
108         assertExtract("/microdata/unused-itemprop.html");
109         assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Offer"));
110     }
111 
112     @Test
113     public void testExample2() {
114         // Property URI generation for hcard
115         assertExtract("/microdata/example2.html");
116         assertContains(null, RDF.TYPE, RDFUtils.iri("http://microformats.org/profile/hcard"));
117         assertContains(null, RDFUtils.iri("http://microformats.org/profile/hcard#given-name"), (Value) null);
118         assertContains(null, RDFUtils.iri("http://microformats.org/profile/hcard#n"), (Value) null);
119     }
120 
121     @Test
122     public void testExample5() {
123         // Vocabulary expansion for schema.org
124         assertExtract("/microdata/example5.html");
125         assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Person"));
126         assertContains(null, RDF.TYPE, RDFUtils.iri("http://xmlns.com/foaf/0.1/Person"));
127         assertContains(null, RDFUtils.iri("http://schema.org/additionalType"),
128                 RDFUtils.iri("http://xmlns.com/foaf/0.1/Person"));
129         assertContains(null, RDFUtils.iri("http://schema.org/email"), RDFUtils.iri("mailto:mail@gmail.com"));
130         assertContains(null, RDFUtils.iri("http://xmlns.com/foaf/0.1/mbox"), RDFUtils.iri("mailto:mail@gmail.com"));
131     }
132 
133     private static final List<String> ignoredOnlineTestNames = Arrays.asList("Test 0073", // Vocabulary Expansion test
134                                                                                           // with rdfs:subPropertyOf
135             "Test 0074" // Vocabulary Expansion test with owl:equivalentProperty
136     );
137 
138     private static Any23 createRunner(String extractorName) {
139         ModifiableConfiguration config = DefaultConfiguration.copy();
140         config.setProperty("any23.microdata.strict", DefaultConfiguration.FLAG_PROPERTY_ON);
141         Any23 runner = new Any23(config, extractorName);
142         runner.setHTTPUserAgent("apache-any23-test-user-agent");
143         return runner;
144     }
145 
146     @Test
147     public void runOnlineTests() throws Exception {
148 
149         Any23OnlineTestBase.assumeOnlineAllowed();
150 
151         Any23 ttlRunner = createRunner(TurtleExtractorFactory.NAME);
152         DocumentSource source = new HTTPDocumentSource(ttlRunner.getHTTPClient(),
153                 "https://w3c.github.io/microdata-rdf/tests/manifest.ttl");
154         HashMap<Resource, HashMap<IRI, ArrayDeque<Value>>> map = new HashMap<>(256);
155         ttlRunner.extract(source, new TripleWriterHandler() {
156             public void writeTriple(Resource s, IRI p, Value o, Resource g) {
157                 map.computeIfAbsent(s, k -> new HashMap<>()).computeIfAbsent(p, k -> new ArrayDeque<>()).add(o);
158             }
159 
160             public void writeNamespace(String prefix, String uri) {
161             }
162 
163             public void close() {
164             }
165         });
166 
167         Assert.assertFalse(map.isEmpty());
168 
169         final IRI actionPred = RDFUtils.iri("http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#action");
170         final IRI resultPred = RDFUtils.iri("http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#result");
171         final IRI namePred = RDFUtils.iri("http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#name");
172 
173         AtomicInteger passedTests = new AtomicInteger();
174         AtomicInteger ignoredTests = new AtomicInteger();
175         Map<String, String> failedTests = Collections.synchronizedMap(new TreeMap<>());
176 
177         map.values().parallelStream().forEach(item -> {
178             ArrayDeque<Value> types = item.get(RDF.TYPE);
179             if (types == null) {
180                 return;
181             }
182             boolean positive;
183             label: {
184                 for (Value type : types) {
185                     if (type.stringValue().startsWith("http://www.w3.org/ns/rdftest#TestMicrodataNegative")) {
186                         positive = false;
187                         break label;
188                     } else if (type.stringValue().startsWith("http://www.w3.org/ns/rdftest#TestMicrodata")) {
189                         positive = true;
190                         break label;
191                     }
192                 }
193                 return;
194             }
195             IRI action = (IRI) item.get(actionPred).pop();
196             IRI result = (IRI) (item.containsKey(resultPred) ? item.get(resultPred).pop() : null);
197             String name = ((Literal) item.get(namePred).pop()).getLabel();
198             if (ignoredOnlineTestNames.contains(name)) {
199                 ignoredTests.incrementAndGet();
200                 return;
201             }
202             try {
203                 name += ": " + ((Literal) item.get(RDFS.COMMENT).pop()).getLabel();
204                 TreeModel actual = new TreeModel();
205                 createRunner(MicrodataExtractorFactory.NAME).extract(action.stringValue(), new TripleWriterHandler() {
206                     public void writeTriple(Resource s, IRI p, Value o, Resource g) {
207                         if (MicrodataExtractor.MICRODATA_ITEM.equals(p))
208                             return;
209                         actual.add(s, p, o);
210                     }
211 
212                     public void writeNamespace(String prefix, String uri) {
213                     }
214 
215                     public void close() {
216                     }
217                 });
218 
219                 TreeModel expected = new TreeModel();
220                 if (result != null) {
221                     createRunner(TurtleExtractorFactory.NAME).extract(result.stringValue(), new TripleWriterHandler() {
222                         public void writeTriple(Resource s, IRI p, Value o, Resource g) {
223                             if (o instanceof IRI
224                                     && o.stringValue().equals("http://w3c.github.io/author/jd_salinger.html")) {
225                                 o = RDFUtils.iri("https://w3c.github.io/author/jd_salinger.html");
226                             }
227 
228                             expected.add(s, p, o);
229                         }
230 
231                         public void writeNamespace(String prefix, String uri) {
232                         }
233 
234                         public void close() {
235                         }
236                     });
237                 }
238 
239                 boolean testPassed = positive == Models.isomorphic(expected, actual);
240                 if (testPassed) {
241                     passedTests.incrementAndGet();
242                 } else {
243                     StringBuilder error = new StringBuilder("\n" + name + "\n");
244                     error.append(action).append(positive ? " ==> " : " =/=> ").append(result).append("\n");
245 
246                     HashMap<Value, String> m = new HashMap<>();
247                     AtomicInteger i = new AtomicInteger();
248                     int match = 0;
249                     for (Statement st : expected) {
250                         Resource s = st.getSubject();
251                         Value o = st.getObject();
252 
253                         if (actual.stream().noneMatch(t -> st.getPredicate().equals(t.getPredicate())
254                                 && (s instanceof BNode ? t.getSubject() instanceof BNode : s.equals(t.getSubject()))
255                                 && (o instanceof BNode ? t.getObject() instanceof BNode : o.equals(t.getObject())))) {
256                             if (positive) {
257                                 Object sstr = s instanceof BNode ? m.computeIfAbsent(s, k -> "_:" + i.getAndIncrement())
258                                         : s;
259                                 Object ostr = o instanceof BNode ? m.computeIfAbsent(o, k -> "_:" + i.getAndIncrement())
260                                         : o;
261                                 error.append("EXPECT: ").append(sstr).append(" ").append(st.getPredicate()).append(" ")
262                                         .append(ostr).append("\n");
263                             }
264                         } else {
265                             match++;
266                         }
267                     }
268                     error.append("...").append(match).append(" statements in common...\n");
269 
270                     for (Statement st : actual) {
271                         Resource s = st.getSubject();
272                         Value o = st.getObject();
273 
274                         if (expected.stream().noneMatch(t -> st.getPredicate().equals(t.getPredicate())
275                                 && (s instanceof BNode ? t.getSubject() instanceof BNode : s.equals(t.getSubject()))
276                                 && (o instanceof BNode ? t.getObject() instanceof BNode : o.equals(t.getObject())))) {
277                             if (positive) {
278                                 Object sstr = s instanceof BNode ? m.computeIfAbsent(s, k -> "_:" + i.getAndIncrement())
279                                         : s;
280                                 Object ostr = o instanceof BNode ? m.computeIfAbsent(o, k -> "_:" + i.getAndIncrement())
281                                         : o;
282                                 error.append("ACTUAL: ").append(sstr).append(" ").append(st.getPredicate()).append(" ")
283                                         .append(ostr).append("\n");
284                             }
285                         }
286                     }
287 
288                     failedTests.put(name, error.toString());
289                 }
290             } catch (Exception e) {
291                 failedTests.put(name, "\n" + e.toString() + "\n");
292             }
293         });
294 
295         if (logger.isDebugEnabled()) {
296             logger.debug("passed=" + passedTests.get() + "; ignored=" + ignoredTests.get());
297         }
298 
299         if (!failedTests.isEmpty()) {
300             Assert.fail(failedTests.size() + " failures out of " + (failedTests.size() + passedTests.get())
301                     + " total tests\n" + String.join("\n", failedTests.keySet()) + "\n\n"
302                     + String.join("\n", failedTests.values()));
303         }
304     }
305 
306     @Test
307     public void testMicrodataBasic() {
308         assertExtract("/microdata/microdata-basic.html");
309         assertModelNotEmpty();
310         assertStatementsSize(null, null, null, 40);
311         assertStatementsSize(RDFUtils.iri("urn:isbn:0-330-34032-8"), null, null, 4);
312     }
313 
314     @Test
315     public void testMicrodataMissingScheme() {
316         assertExtract("/microdata/microdata-missing-scheme.html");
317         assertModelNotEmpty();
318         assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Answer"));
319     }
320 
321     /**
322      * Reference test as provided by
323      * <a href="http://googlewebmastercentral.blogspot.com/2010/03/microdata-support-for-rich-snippets.html">Google Rich
324      * Snippet for Microdata.</a>
325      *
326      * @throws RepositoryException
327      *             if an error is encountered whilst loading content from a storage connection
328      * @throws RDFHandlerException
329      *             if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler} implementation
330      * @throws IOException
331      *             if there is an error loading input data
332      * @throws RDFParseException
333      *             if there is an error parsing an actual RDF stream
334      */
335     @Test
336     public void testMicrodataGoogleRichSnippet()
337             throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
338         extractAndVerifyAgainstNQuads("microdata-richsnippet.html", "microdata-richsnippet-expected.nquads");
339         logger.debug(dumpHumanReadableTriples());
340     }
341 
342     /**
343      * First reference test for <a href="http://www.w3.org/TR/microdata/">Microdata Extraction algorithm</a>.
344      *
345      * @throws RepositoryException
346      *             if an error is encountered whilst loading content from a storage connection
347      * @throws RDFHandlerException
348      *             if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler} implementation
349      * @throws IOException
350      *             if there is an error loading input data
351      * @throws RDFParseException
352      *             if there is an error parsing an actual RDF stream
353      */
354     @Test
355     public void testExample5221() throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
356         extractAndVerifyAgainstNQuads("5.2.1-non-normative-example-1.html",
357                 "5.2.1-non-normative-example-1-expected.nquads");
358         logger.debug(dumpHumanReadableTriples());
359     }
360 
361     /**
362      * Second reference test for <a href="http://www.w3.org/TR/microdata/">Microdata Extraction algorithm</a>.
363      *
364      * @throws RepositoryException
365      *             if an error is encountered whilst loading content from a storage connection
366      * @throws RDFHandlerException
367      *             if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler} implementation
368      * @throws IOException
369      *             if there is an error loading input data
370      * @throws RDFParseException
371      *             if there is an error parsing an actual RDF stream
372      */
373     @Test
374     public void testExample5222() throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
375         extractAndVerifyAgainstNQuads("5.2.1-non-normative-example-2.html",
376                 "5.2.1-non-normative-example-2-expected.nquads");
377         logger.debug(dumpHumanReadableTriples());
378     }
379 
380     /**
381      * First reference test for <a href="http://schema.org/">http://schema.org/</a>.
382      *
383      * @throws RepositoryException
384      *             if an error is encountered whilst loading content from a storage connection
385      * @throws RDFHandlerException
386      *             if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler} implementation
387      * @throws IOException
388      *             if there is an error loading input data
389      * @throws RDFParseException
390      *             if there is an error parsing an actual RDF stream
391      */
392     @Test
393     public void testExampleSchemaOrg1()
394             throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
395         extractAndVerifyAgainstNQuads("schemaorg-example-1.html", "schemaorg-example-1-expected.nquads");
396         logger.debug(dumpHumanReadableTriples());
397     }
398 
399     /**
400      * Second reference test for <a href="http://schema.org/">http://schema.org/</a>.
401      *
402      * @throws RepositoryException
403      *             if an error is encountered whilst loading content from a storage connection
404      * @throws RDFHandlerException
405      *             if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler} implementation
406      * @throws IOException
407      *             if there is an error loading input data
408      * @throws RDFParseException
409      *             if there is an error parsing an actual RDF stream
410      */
411     @Test
412     public void testExampleSchemaOrg2()
413             throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
414         extractAndVerifyAgainstNQuads("schemaorg-example-2.html", "schemaorg-example-2-expected.nquads");
415         logger.debug(dumpHumanReadableTriples());
416     }
417 
418     @Test
419     public void testMicrodataNestedUrlResolving() throws IOException {
420         IRI oldBaseIRI = baseIRI;
421         try {
422             baseIRI = RDFUtils.iri("https://ruben.verborgh.org/tmp/schemaorg-test.html");
423             extractAndVerifyAgainstNQuads("microdata-nested-url-resolving.html",
424                     "microdata-nested-url-resolving-expected.nquads");
425         } finally {
426             baseIRI = oldBaseIRI;
427         }
428     }
429 
430     @Test
431     public void testTel() {
432         assertExtract("/microdata/tel-test.html");
433         assertModelNotEmpty();
434         assertContains(RDFUtils.iri("http://schema.org/telephone"), RDFUtils.iri("tel:(909)%20484-2020"));
435     }
436 
437     @Test
438     public void testBadTypes() throws IOException {
439         extractAndVerifyAgainstNQuads("microdata-bad-types.html", "microdata-bad-types-expected.nquads");
440     }
441 
442     @Test
443     public void testBadPropertyNames() throws IOException {
444         extractAndVerifyAgainstNQuads("microdata-bad-properties.html", "microdata-bad-properties-expected.nquads",
445                 false);
446         assertIssue(IssueReport.IssueLevel.ERROR,
447                 ".*invalid property name ''.*\"path\" : \"/HTML\\[1\\]/BODY\\[1\\]/DIV\\[1\\]/DIV\\[2\\]/DIV\\[1\\]\".*");
448     }
449 
450     private void extractAndVerifyAgainstNQuads(String actual, String expected)
451             throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
452         extractAndVerifyAgainstNQuads(actual, expected, true);
453     }
454 
455     private void extractAndVerifyAgainstNQuads(String actual, String expected, boolean assertNoIssues)
456             throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
457         assertExtract("/microdata/" + actual, assertNoIssues);
458         assertModelNotEmpty();
459         logger.debug(dumpModelToNQuads());
460         List<Statement> expectedStatements = loadResultStatement("/microdata/" + expected);
461         int actualStmtSize = getStatementsSize(null, null, null);
462         Assert.assertEquals(expectedStatements.size(), actualStmtSize);
463         for (Statement statement : expectedStatements) {
464             assertContains(statement.getSubject() instanceof BNode ? null : statement.getSubject(),
465                     statement.getPredicate(), statement.getObject() instanceof BNode ? null : statement.getObject());
466         }
467         Model expectedModel = new TreeModel();
468         for (Statement s : expectedStatements) {
469             expectedModel.add(s.getSubject(), s.getPredicate(), s.getObject());
470         }
471 
472         Model actualModel = new TreeModel();
473         conn.export(new RDFHandler() {
474             @Override
475             public void startRDF() throws RDFHandlerException {
476             }
477 
478             @Override
479             public void endRDF() throws RDFHandlerException {
480             }
481 
482             @Override
483             public void handleNamespace(String s, String s1) throws RDFHandlerException {
484             }
485 
486             @Override
487             public void handleStatement(Statement statement) throws RDFHandlerException {
488                 actualModel.add(statement.getSubject(), statement.getPredicate(), statement.getObject());
489             }
490 
491             @Override
492             public void handleComment(String s) throws RDFHandlerException {
493             }
494         });
495 
496         Assert.assertTrue("Models are not isomorphic", Models.isomorphic(expectedModel, actualModel));
497     }
498 
499     private List<Statement> loadResultStatement(String resultFilePath)
500             throws RDFHandlerException, IOException, RDFParseException {
501         RDFParser nQuadsParser = Rio.createParser(RDFFormat.NQUADS);
502         TestRDFHandler rdfHandler = new TestRDFHandler();
503         nQuadsParser.setRDFHandler(rdfHandler);
504         File file = copyResourceToTempFile(resultFilePath);
505         nQuadsParser.parse(new FileReader(file, StandardCharsets.UTF_8), baseIRI.stringValue());
506         return rdfHandler.getStatements();
507     }
508 
509     public static class TestRDFHandler implements RDFHandler {
510 
511         private final List<Statement> statements = new ArrayList<Statement>();
512 
513         protected List<Statement> getStatements() {
514             return statements;
515         }
516 
517         public void startRDF() throws RDFHandlerException {
518         }
519 
520         public void endRDF() throws RDFHandlerException {
521         }
522 
523         public void handleNamespace(String s, String s1) throws RDFHandlerException {
524             throw new UnsupportedOperationException();
525         }
526 
527         public void handleStatement(Statement statement) throws RDFHandlerException {
528             statements.add(statement);
529         }
530 
531         public void handleComment(String s) throws RDFHandlerException {
532         }
533     }
534 
535 }