View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor;
19  
20  import org.apache.any23.AbstractAny23TestBase;
21  import org.apache.any23.configuration.DefaultConfiguration;
22  import org.apache.any23.configuration.ModifiableConfiguration;
23  import org.apache.any23.extractor.html.HTMLFixture;
24  import org.apache.any23.extractor.rdf.TriXExtractor;
25  import org.apache.any23.mime.TikaMIMETypeDetector;
26  import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
27  import org.apache.any23.vocab.ICAL;
28  import org.apache.any23.vocab.Review;
29  import org.apache.any23.vocab.SINDICE;
30  import org.apache.any23.vocab.VCard;
31  import org.apache.any23.writer.CompositeTripleHandler;
32  import org.apache.any23.writer.RDFXMLWriter;
33  import org.apache.any23.writer.RepositoryWriter;
34  import org.apache.any23.writer.TripleHandlerException;
35  import org.junit.After;
36  import org.junit.Assert;
37  import org.junit.Before;
38  import org.junit.Test;
39  import org.eclipse.rdf4j.model.Resource;
40  import org.eclipse.rdf4j.model.Statement;
41  import org.eclipse.rdf4j.model.IRI;
42  import org.eclipse.rdf4j.model.Value;
43  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
44  import org.eclipse.rdf4j.repository.RepositoryConnection;
45  import org.eclipse.rdf4j.repository.RepositoryException;
46  import org.eclipse.rdf4j.repository.RepositoryResult;
47  import org.eclipse.rdf4j.repository.sail.SailRepository;
48  import org.eclipse.rdf4j.sail.Sail;
49  import org.eclipse.rdf4j.sail.SailException;
50  import org.eclipse.rdf4j.sail.memory.MemoryStore;
51  import org.slf4j.Logger;
52  import org.slf4j.LoggerFactory;
53  
54  import static org.junit.Assert.assertFalse;
55  import static org.junit.Assert.assertTrue;
56  
57  import java.io.ByteArrayOutputStream;
58  import java.io.FileNotFoundException;
59  import java.io.IOException;
60  import java.nio.charset.StandardCharsets;
61  import java.util.Locale;
62  
63  /**
64   * Test case for {@link SingleDocumentExtraction}.
65   *
66   * @author Michele Mostarda (mostarda@fbk.eu)
67   * @author Davide Palmisano (palmisano@fbk.eu)
68   */
69  // TODO #20 - Solve the issue that the hreview item and the vcard item get the same BNode because they have the same XPath DOM.
70  public class SingleDocumentExtractionTest extends AbstractAny23TestBase {
71  
72      private static final SINDICE vSINDICE = SINDICE.getInstance();
73      private static final ICAL vICAL = ICAL.getInstance();
74      private static final Review vREVIEW = Review.getInstance();
75      private static final VCard vVCARD = VCard.getInstance();
76  
77      private static final Logger logger = LoggerFactory.getLogger(SingleDocumentExtractionTest.class);
78  
79      private SingleDocumentExtraction singleDocumentExtraction;
80  
81      private ExtractorGroup extractorGroup;
82  
83      private Sail store;
84  
85      private RepositoryConnection conn;
86  
87      RepositoryWriter repositoryWriter;
88  
89      ByteArrayOutputStream baos;
90  
91      RDFXMLWriter rdfxmlWriter;
92  
93      @Before
94      public void setUp() throws Exception {
95          super.setUp();
96          extractorGroup = ExtractorRegistryImpl.getInstance().getExtractorGroup();
97          store = new MemoryStore();
98          store.init();
99          conn = new SailRepository(store).getConnection();
100     }
101 
102     @After
103     public void tearDown() throws SailException, RepositoryException, TripleHandlerException {
104         rdfxmlWriter.close();
105         repositoryWriter.close();
106         logger.debug(baos.toString(StandardCharsets.UTF_8));
107 
108         singleDocumentExtraction = null;
109         extractorGroup = null;
110         conn.close();
111         conn = null;
112         store.shutDown();
113         store = null;
114     }
115 
116     /**
117      * Tests the existence of the domain triples.
118      *
119      * @throws IOException
120      *             if there is an error loading input data
121      * @throws ExtractionException
122      *             if an exception is raised during extraction
123      * @throws RepositoryException
124      *             if an error is encountered whilst loading content from a storage connection
125      */
126     @Test
127     public void testMicroformatDomains() throws IOException, ExtractionException, RepositoryException {
128         singleDocumentExtraction = getInstance("/microformats/microformat-domains.html");
129         singleDocumentExtraction.run();
130         logStorageContent();
131         assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 1);
132     }
133 
134     /**
135      * Tests the nested microformat relationships. This test verifies the first supported approach for microformat
136      * nesting. Such approach foreseen to add a microformat HTML node within the property of a container microformat.
137      *
138      * For further details see {@link SingleDocumentExtraction} consolidateResources(java.util.List, java.util.List,
139      * org.apache.any23.writer.TripleHandler)}
140      *
141      * @throws IOException
142      *             if there is an error loading input data
143      * @throws ExtractionException
144      *             if an exception is raised during extraction
145      * @throws RepositoryException
146      *             if an error is encountered whilst loading content from a storage connection
147      */
148     @Test
149     public void testNestedMicroformats() throws IOException, ExtractionException, RepositoryException {
150         singleDocumentExtraction = getInstance("/microformats/nested-microformats-a1.html");
151         singleDocumentExtraction.run();
152 
153         logStorageContent();
154 
155         assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 2);
156         assertTriple(vSINDICE.getProperty(SINDICE.NESTING), (Value) null);
157         assertTriple(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vICAL.summary);
158         assertTriple(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null);
159     }
160 
161     /**
162      * This test assess the absence of {@link SINDICE} <i>nesting</i> relationship, since
163      * {@link org.apache.any23.extractor.html.HCardExtractor} declared a native nesting with the
164      * {@link org.apache.any23.extractor.html.AdrExtractor}.
165      *
166      * @see org.apache.any23.extractor.html.annotations.Includes
167      * 
168      * @throws IOException
169      *             if there is an error loading input data
170      * @throws ExtractionException
171      *             if an exception is raised during extraction
172      * @throws RepositoryException
173      *             if an error is encountered whilst loading content from a storage connection
174      */
175     @Test
176     public void testNestedVCardAdr() throws IOException, ExtractionException, RepositoryException {
177         singleDocumentExtraction = getInstance("/microformats/nested-microformats-a3.html");
178         singleDocumentExtraction.run();
179 
180         logStorageContent();
181 
182         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), (Value) null, 0);
183         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null, 0);
184     }
185 
186     /**
187      * Tests the nested microformat relationships. This test verifies the second supported approach for microformat
188      * nesting. Such approach foreseen to use the same node attributes to declare both a microformat container property
189      * and a nested microformat root class.
190      *
191      * For further details see {@link SingleDocumentExtraction} consolidateResources(java.util.List, java.util.List,
192      * org.apache.any23.writer.TripleHandler)}
193      *
194      * See also the <a href="http://www.google.com/support/webmasters/bin/answer.py?answer=146862">Nested Entities</a>
195      * article that is linked by the official microformats.org doc page.
196      *
197      * @throws IOException
198      *             if there is an error loading input data
199      * @throws ExtractionException
200      *             if an exception is raised during extraction
201      * @throws RepositoryException
202      *             if an error is encountered whilst loading content from a storage connection
203      */
204     @Test
205     public void testNestedMicroformatsInduced() throws IOException, ExtractionException, RepositoryException {
206         singleDocumentExtraction = getInstance("/microformats/nested-microformats-a2.html");
207         singleDocumentExtraction.run();
208 
209         logStorageContent();
210 
211         assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 2);
212         assertTriple(vSINDICE.getProperty(SINDICE.NESTING), (Value) null);
213         assertTriple(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vICAL.summary);
214         assertTriple(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null);
215     }
216 
217     /**
218      * Tests the nested microformat relationships. This test verifies the behavior of the nested microformats when the
219      * nesting relationship is handled by the microformat extractor itself (like the HReview that is able to detect an
220      * inner VCard).
221      *
222      * @throws IOException
223      *             if there is an error loading input data
224      * @throws ExtractionException
225      *             if an exception is raised during extraction
226      * @throws RepositoryException
227      *             if an error is encountered whilst loading content from a storage connection
228      */
229     @Test
230     /*
231      * NOTE: The triple (bnode http://www.w3.org/2006/vcard/ns#url http://pizza.example.com) and (bnode
232      * http://vocab.sindice.net/nesting_original (structured) *) are printed out twice, once for every extractor. The
233      * RDFWriter doesn't remove the duplicates and some graph renderers show the triple property as double. Despite this
234      * the model contains it just once.
235      */
236     public void testNestedMicroformatsManaged() throws IOException, ExtractionException, RepositoryException {
237         singleDocumentExtraction = getInstance("/microformats/nested-microformats-managed.html");
238         singleDocumentExtraction.run();
239 
240         logStorageContent();
241 
242         assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 3);
243         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING), (Value) null, 1);
244         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vREVIEW.hasReview, 1);
245 
246         assertTripleCount(vVCARD.url, (Value) null, 1);
247         Value object = getTripleObject(null, vREVIEW.hasReview);
248         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), object, 1);
249         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vREVIEW.hasReview, 1);
250     }
251 
252     /**
253      * Tests that the {@link org.apache.any23.extractor.rdf.TriXExtractor} is NOT activated for a given HTML document.
254      * This tests that a private method within {@link org.apache.any23.extractor.SingleDocumentExtraction} works as
255      * expected.
256      * 
257      * @see <a href=
258      *      "https://issues.apache.org/jira/browse/ANY23-504">https://issues.apache.org/jira/browse/ANY23-504</a>
259      *
260      * @throws IOException
261      *             if there is an error loading input data
262      * @throws ExtractionException
263      *             if an exception is raised during extraction
264      * @throws RepositoryException
265      *             if an error is encountered whilst loading content from a storage connection
266      */
267     @Test
268     public void testTrixParserNotActivatedAfterFilterExtractorsByMIMEType()
269             throws IOException, ExtractionException, RepositoryException {
270         singleDocumentExtraction = getInstance("/html/BBC_News_Scotland.html");
271         assertTrue(singleDocumentExtraction.hasMatchingExtractors());
272         assertFalse(singleDocumentExtraction.getMatchingExtractors().stream()
273                 .anyMatch(e -> TriXExtractor.class.isInstance(e)));
274         singleDocumentExtraction.run();
275         assertFalse(singleDocumentExtraction.getMatchingExtractors().stream()
276                 .anyMatch(e -> TriXExtractor.class.isInstance(e)));
277 
278         logStorageContent();
279     }
280 
281     private SingleDocumentExtraction getInstance(String file) throws FileNotFoundException, IOException {
282         baos = new ByteArrayOutputStream();
283         rdfxmlWriter = new RDFXMLWriter(baos);
284         repositoryWriter = new RepositoryWriter(conn);
285 
286         final CompositeTripleHandler cth = new CompositeTripleHandler();
287         cth.addChild(rdfxmlWriter);
288         cth.addChild(repositoryWriter);
289 
290         final ModifiableConfiguration configuration = DefaultConfiguration.copy();
291         configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
292         SingleDocumentExtraction instance = new SingleDocumentExtraction(configuration,
293                 new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"), extractorGroup, cth);
294         instance.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier()));
295         return instance;
296     }
297 
298     /**
299      * Logs the storage content.
300      * 
301      * @throws RepositoryException
302      *             if an error is encountered whilst loading content from a storage connection
303      */
304     private void logStorageContent() throws RepositoryException {
305         RepositoryResult<Statement> result = conn.getStatements(null, null, null, false);
306         while (result.hasNext()) {
307             Statement statement = result.next();
308             logger.debug(statement.toString());
309         }
310     }
311 
312     /**
313      * Asserts that the triple pattern is present within the storage exactly n times.
314      * 
315      * @param predicate
316      * @param value
317      * @param occurrences
318      * 
319      * @throws RepositoryException
320      */
321     private void assertTripleCount(IRI predicate, Value value, int occurrences) throws RepositoryException {
322         RepositoryResult<Statement> statements = conn.getStatements(null, predicate, value, false);
323         int count = 0;
324         while (statements.hasNext()) {
325             statements.next();
326             count++;
327         }
328         Assert.assertEquals(
329                 String.format(Locale.ROOT, "Cannot find triple (* %s %s) %d times", predicate, value, occurrences),
330                 occurrences, count);
331     }
332 
333     /**
334      * Asserts that the triple pattern is present within the storage exactly n times.
335      *
336      * @param predicate
337      * @param value
338      * @param occurrences
339      * 
340      * @throws RepositoryException
341      */
342     private void assertTripleCount(IRI predicate, String value, int occurrences) throws RepositoryException {
343         assertTripleCount(predicate, SimpleValueFactory.getInstance().createLiteral(value), occurrences);
344     }
345 
346     /**
347      * Asserts that a triple exists exactly once.
348      *
349      * @param predicate
350      * @param value
351      * 
352      * @throws RepositoryException
353      */
354     private void assertTriple(IRI predicate, Value value) throws RepositoryException {
355         assertTripleCount(predicate, value, 1);
356     }
357 
358     /**
359      * Asserts that a triple exists exactly once.
360      *
361      * @param predicate
362      * @param value
363      * 
364      * @throws RepositoryException
365      */
366     @SuppressWarnings("unused")
367     private void assertTriple(IRI predicate, String value) throws RepositoryException {
368         assertTriple(predicate, SimpleValueFactory.getInstance().createLiteral(value));
369     }
370 
371     /**
372      * Retrieves the triple object matching with the given pattern that is expected to be just one.
373      * 
374      * @param sub
375      *            the triple subject, <code>null</code> for any.
376      * @param prop
377      *            the triple property, <code>null</code> for any.
378      * 
379      * @return the object of the unique triple matching the given pattern.
380      * 
381      * @throws RepositoryException
382      *             if an error occurred during the search.
383      */
384     private Value getTripleObject(Resource sub, IRI prop) throws RepositoryException {
385         RepositoryResult<Statement> statements = conn.getStatements(sub, prop, null, false);
386         Assert.assertTrue(statements.hasNext());
387         Statement statement = statements.next();
388         Value value = statement.getObject();
389         Assert.assertFalse("Expected just one result.", statements.hasNext());
390         statements.close();
391         return value;
392     }
393 
394 }