View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23;
19  
20  import org.apache.any23.extractor.ExtractorGroup;
21  import org.apache.any23.extractor.rdf.NTriplesExtractorFactory;
22  import org.apache.http.conn.ConnectTimeoutException;
23  import org.junit.Assert;
24  import org.apache.any23.configuration.Configuration;
25  import org.apache.any23.configuration.DefaultConfiguration;
26  import org.apache.any23.configuration.ModifiableConfiguration;
27  import org.apache.any23.extractor.ExtractionException;
28  import org.apache.any23.extractor.ExtractionParameters;
29  import org.apache.any23.extractor.Extractor;
30  import org.apache.any23.extractor.microdata.MicrodataExtractor;
31  import org.apache.any23.filter.IgnoreAccidentalRDFa;
32  import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
33  import org.apache.any23.http.DefaultHTTPClient;
34  import org.apache.any23.http.DefaultHTTPClientConfiguration;
35  import org.apache.any23.http.HTTPClient;
36  import org.apache.any23.http.HTTPClientConfiguration;
37  import org.apache.any23.source.DocumentSource;
38  import org.apache.any23.source.HTTPDocumentSource;
39  import org.apache.any23.source.StringDocumentSource;
40  import org.apache.any23.util.FileUtils;
41  import org.apache.any23.util.StreamUtils;
42  import org.apache.any23.util.StringUtils;
43  import org.apache.any23.vocab.DCTerms;
44  import org.apache.any23.writer.CompositeTripleHandler;
45  import org.apache.any23.writer.CountingTripleHandler;
46  import org.apache.any23.writer.NTriplesWriter;
47  import org.apache.any23.writer.RDFXMLWriter;
48  import org.apache.any23.writer.ReportingTripleHandler;
49  import org.apache.any23.writer.RepositoryWriter;
50  import org.apache.any23.writer.TripleHandler;
51  import org.apache.any23.writer.TripleHandlerException;
52  import org.apache.commons.io.IOUtils;
53  import org.junit.AssumptionViolatedException;
54  import org.junit.Test;
55  import org.eclipse.rdf4j.model.Statement;
56  import org.eclipse.rdf4j.repository.Repository;
57  import org.eclipse.rdf4j.repository.RepositoryConnection;
58  import org.eclipse.rdf4j.repository.RepositoryResult;
59  import org.eclipse.rdf4j.repository.sail.SailRepository;
60  import org.eclipse.rdf4j.rio.RDFParseException;
61  import org.eclipse.rdf4j.sail.memory.MemoryStore;
62  import org.slf4j.Logger;
63  import org.slf4j.LoggerFactory;
64  
65  import java.io.ByteArrayOutputStream;
66  import java.io.IOException;
67  import java.net.URISyntaxException;
68  import java.nio.charset.StandardCharsets;
69  import java.util.Collections;
70  import java.util.List;
71  import java.util.Locale;
72  
73  import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;
74  
75  /**
76   * Test case for {@link Any23} facade.
77   * 
78   * @author Davide Palmisano ( dpalmisano@gmail.com )
79   * @author Michele Mostarda ( michele.mostarda@gmail.com )
80   */
81  @SuppressWarnings("unchecked")
82  public class Any23Test extends Any23OnlineTestBase {
83  
84      private static final DCTerms vDCTERMS = DCTerms.getInstance();
85  
86      private static final String PAGE_URL = "http://bob.com";
87  
88      private static final Logger logger = LoggerFactory.getLogger(Any23Test.class);
89  
90      @Test
91      public void testTTLDetection() throws Exception {
92          assertDetection("<a> <b> <c> .", "rdf-turtle");
93      }
94  
95      @Test
96      public void testN3Detection1() throws Exception {
97          assertDetection("<Bob><brothers>(<Jim><Mark>).", "rdf-turtle");
98      }
99  
100     @Test
101     public void testN3Detection2() throws Exception {
102         assertDetection("<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> .", "rdf-nt");
103     }
104 
105     @Test
106     public void testHTMLBruteForceDetection() throws Exception {
107         assertDetection("<html><body><div class=\"vcard fn\">Joe</div></body></html>");
108     }
109 
110     /**
111      * This tests the behavior of <i>Any23</i> to execute the extraction explicitly specifying the charset encoding of
112      * the input.
113      * 
114      * @throws Exception
115      *             if there is an error reading the input
116      */
117     @Test
118     public void testExplicitEncoding() throws Exception {
119         assertEncodingDetection("UTF-8", "/html/encoding-test.html", "Knud M\u00F6ller");
120     }
121 
122     /**
123      * This tests the behavior of <i>Any23</i> to perform the extraction without passing it any charset encoding. The
124      * encoding is therefore guessed using {@link org.apache.any23.encoding.TikaEncodingDetector} class.
125      * 
126      * @throws Exception
127      *             if there is an error reading the input
128      */
129     @Test
130     public void testImplicitEncoding() throws Exception {
131         assertEncodingDetection(null, // The encoding will be auto detected.
132                 "/html/encoding-test.html", "Knud M\u00F6ller");
133     }
134 
135     @Test
136     public void testRDFXMLDetectionAndExtraction() throws Exception {
137         String rdfXML = "<?xml version='1.0'?> " + "<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' "
138                 + "xmlns:dc='http://purl.org/dc/elements/1.1/'>"
139                 + "<rdf:Description rdf:about='http://www.example.com'>" + "<dc:title>x</dc:title>"
140                 + "</rdf:Description>" + "</rdf:RDF>";
141         assertDetectionAndExtraction(rdfXML);
142     }
143 
144     @Test
145     public void testNTriplesDetectionAndExtraction() throws Exception {
146         String n3 = "<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"n3 . appo\" .";
147         assertDetectionAndExtraction(n3);
148     }
149 
150     @Test
151     public void testNturtleDetectionAndExtraction() throws Exception {
152         String nTurtle = "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n"
153                 + "@prefix dc: <http://purl.org/dc/elements/1.1/> .\n"
154                 + "@prefix ex: <http://example.org/stuff/1.0/> .\n" + "\n"
155                 + "<http://www.w3.org/TR/rdf-syntax-grammar>\n"
156                 + "  dc:title \"RDF/XML Syntax Specification (Revised)\" ;\n" + "  ex:editor [\n"
157                 + "    ex:fullname \"Dave Beckett\";\n" + "    ex:homePage <http://purl.org/net/dajobe/>\n" + "  ] .";
158         assertDetectionAndExtraction(nTurtle);
159     }
160 
161     /**
162      * Tests out the first code snipped used in <i>Developer Manual</i>.
163      * 
164      * @throws Exception
165      *             if there is an error reading the input
166      */
167     @Test
168     public void testDemoCodeSnippet1() throws Exception {
169         /* 1 */Any23 runner = new Any23();
170         /* 2 */final String content = "@prefix foo: <http://example.org/ns#> .   "
171                 + "@prefix : <http://other.example.org/ns#> ." + "foo:bar foo: : .                          "
172                 + ":bar : foo:bar .                           ";
173         // The second argument of StringDocumentSource() must be a valid IRI.
174         /* 3 */DocumentSource source = new StringDocumentSource(content, "http://host.com/service");
175         /* 4 */ByteArrayOutputStream out = new ByteArrayOutputStream();
176         /* 5 */TripleHandler handler = new NTriplesWriter(out);
177         try {
178             /* 6 */runner.extract(source, handler);
179         } finally {
180             /* 7 */handler.close();
181         }
182         /* 8 */String nt = out.toString("UTF-8");
183 
184         /*
185          * <http://example.org/ns#bar> <http://example.org/ns#> <http://other.example.org/ns#> .
186          * <http://other.example.org/ns#bar> <http://other.example.org/ns#> <http://example.org/ns#bar> .
187          */
188         logger.debug("nt: " + nt);
189         Assert.assertTrue(nt.length() > 0);
190     }
191 
192     /**
193      * Tests out the second code snipped used in <i>Developer Manual</i>.
194      * 
195      * @throws Exception
196      *             if there is an error reading the input
197      */
198     @Test
199     public void testDemoCodeSnippet2() throws Exception {
200         assumeOnlineAllowed();
201 
202         Any23 runner = new Any23();
203         runner.setHTTPUserAgent("apache-any23-test-user-agent");
204         HTTPClient httpClient = runner.getHTTPClient();
205         DocumentSource source = new HTTPDocumentSource(httpClient, "http://dbpedia.org/resource/Trento");
206         ByteArrayOutputStream out = new ByteArrayOutputStream();
207         TripleHandler handler = new NTriplesWriter(out);
208         try {
209             runner.extract(source, handler);
210         } finally {
211             handler.close();
212         }
213         String n3 = out.toString("UTF-8");
214 
215         /*
216          * <http://dbpedia.org/resource/Trent> <http://dbpedia.org/ontology/wikiPageDisambiguates>
217          * <http://dbpedia.org/resource/Trento> . <http://dbpedia.org/resource/Andrea_Pozzo>
218          * <http://dbpedia.org/ontology/birthPlace> <http://dbpedia.org/resource/Trento> .
219          * <http://dbpedia.org/resource/Union_for_Trentino> <http://dbpedia.org/ontology/headquarter>
220          * <http://dbpedia.org/resource/Trento> . [...]
221          */
222         logger.debug("n3: " + n3);
223         Assert.assertTrue(n3.length() > 0);
224 
225         Assert.assertTrue(n3.contains(
226                 "<http://dbpedia.org/resource/Trento> <http://dbpedia.org/property/mayor> \"Franco Ianeselli, elected 2020\"@en ."));
227     }
228 
229     /**
230      * This test checks the extraction behavior when the library is used programatically. This test is related to the
231      * issue #45, to verify the different behaviors between Maven and Ant. The behavior was related to a 2nd-level
232      * dependency introduced by Maven.
233      * 
234      * @throws org.apache.any23.extractor.ExtractionException
235      *             if there is an error running extraction logic
236      * @throws IOException
237      *             if there is an error reading the input
238      * @throws URISyntaxException
239      *             if there is an error defining input URI's
240      */
241     @Test
242     public void testProgrammaticExtraction() throws ExtractionException, IOException, URISyntaxException {
243         Any23 any23 = new Any23();
244         any23.setHTTPUserAgent("Any23-Servlet");
245         any23.setHTTPClient(new DefaultHTTPClient() {
246             @Override
247             protected int getConnectionTimeout() {
248                 return 5000;
249             }
250 
251             @Override
252             protected int getSoTimeout() {
253                 return 2000;
254             }
255         });
256         ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
257         TripleHandler handler = new NTriplesWriter(byteArrayOutputStream);
258         TripleHandler rdfWriter = new IgnoreAccidentalRDFa(handler);
259         ReportingTripleHandler reporting = new ReportingTripleHandler(rdfWriter);
260 
261         DocumentSource source = getDocumentSourceFromResource("/html/rdfa/ansa_2010-02-26_12645863.html",
262                 "http://host.com/service");
263 
264         Assert.assertTrue(any23.extract(source, reporting).hasMatchingExtractors());
265         try {
266             handler.close();
267         } catch (TripleHandlerException e) {
268             Assert.fail(e.getMessage());
269         }
270 
271         final String bufferContent = byteArrayOutputStream.toString(StandardCharsets.UTF_8);
272         logger.debug(bufferContent);
273         Assert.assertSame("Unexpected number of triples.", 18, StringUtils.countNL(bufferContent));
274 
275     }
276 
277     /**
278      * This test checks if a URL that is supposed to be GZIPPED is correctly opened and parsed with the {@link Any23}
279      * facade.
280      * 
281      * @throws org.apache.any23.extractor.ExtractionException
282      *             if there is an error running extraction logic
283      * @throws IOException
284      *             if there is an error reading the input
285      * @throws URISyntaxException
286      *             if there is an error defining input URI's
287      */
288     @Test
289     public void testGZippedContent() throws IOException, URISyntaxException, ExtractionException {
290         assumeOnlineAllowed();
291         final Any23 runner = new Any23();
292         runner.setHTTPUserAgent("apache-any23-test-user-agent");
293         DocumentSource source = new HTTPDocumentSource(runner.getHTTPClient(), "https://dev.w3.org/html5/rdfa/");
294         ByteArrayOutputStream out = new ByteArrayOutputStream();
295         TripleHandler handler = new NTriplesWriter(out);
296         try {
297             runner.extract(source, handler);
298         } catch (ConnectTimeoutException e) {
299             // This page is down as of 2019.09.14
300             logger.error("Connection to " + source.getDocumentIRI() + " timed out; skipping test", e);
301             throw new AssumptionViolatedException(e.getMessage());
302         }
303         String n3 = out.toString(StandardCharsets.UTF_8);
304         logger.debug("N3 " + n3);
305         Assert.assertTrue(n3.length() > 0);
306     }
307 
308     @Test
309     public void testExtractionParameters() throws IOException, ExtractionException, TripleHandlerException {
310         // not quite sure if following triples should be extracted
311         // ?doc <http://www.w3.org/1999/xhtml/vocab#icon> <https://any23.googlecode.com/favicon.ico> .
312         // ?doc <http://www.w3.org/1999/xhtml/vocab#stylesheet> <https://any23.googlecode.com/design/style.css> .
313 
314         final int EXPECTED_TRIPLES = 12;
315         Any23 runner = new Any23();
316         DocumentSource source = getDocumentSourceFromResource("/org/apache/any23/validator/missing-og-namespace.html",
317                 "http://www.test.com");
318 
319         ByteArrayOutputStream baos = new ByteArrayOutputStream();
320 
321         CountingTripleHandler cth1 = new CountingTripleHandler();
322         NTriplesWriter ctw1 = new NTriplesWriter(baos);
323         CompositeTripleHandler compositeTH1 = new CompositeTripleHandler();
324         compositeTH1.addChild(cth1);
325         compositeTH1.addChild(ctw1);
326         try {
327             runner.extract(new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.NONE), source,
328                     compositeTH1);
329         } finally {
330             compositeTH1.close();
331         }
332         logger.debug(baos.toString(StandardCharsets.UTF_8));
333         Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES, cth1.getCount());
334     }
335 
336     @Test
337     public void testExtractionParametersWithNestingDisabled()
338             throws IOException, ExtractionException, TripleHandlerException {
339         final int EXPECTED_TRIPLES = 20;
340         Any23 runner = new Any23();
341         DocumentSource source = getDocumentSourceFromResource("/microformats/nested-microformats-a1.html",
342                 "http://www.test.com");
343 
344         ByteArrayOutputStream baos = new ByteArrayOutputStream();
345 
346         CountingTripleHandler cth1 = new CountingTripleHandler();
347         RDFXMLWriter ctw1 = new RDFXMLWriter(baos);
348         CompositeTripleHandler compositeTH1 = new CompositeTripleHandler();
349         compositeTH1.addChild(cth1);
350         compositeTH1.addChild(ctw1);
351         runner.extract(new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.NONE, true), source,
352                 compositeTH1);
353         compositeTH1.close();
354         logger.debug("Out1: " + baos.toString(StandardCharsets.UTF_8));
355         Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES + 3, cth1.getCount());
356 
357         baos.reset();
358         CountingTripleHandler cth2 = new CountingTripleHandler();
359         NTriplesWriter ctw2 = new NTriplesWriter(baos);
360         CompositeTripleHandler compositeTH2 = new CompositeTripleHandler();
361         compositeTH2.addChild(cth2);
362         compositeTH2.addChild(ctw2);
363         runner.extract(
364                 new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.VALIDATE_AND_FIX, false),
365                 source, compositeTH2);
366         compositeTH2.close();
367         logger.debug("Out2: " + baos.toString(StandardCharsets.UTF_8));
368         Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES, cth2.getCount());
369     }
370 
371     @Test
372     public void testExceptionPropagation() throws IOException {
373         Any23 any23 = new Any23();
374         DocumentSource source = getDocumentSourceFromResource("/application/turtle/geolinkeddata.ttl",
375                 "http://www.test.com");
376         CountingTripleHandler cth1 = new CountingTripleHandler();
377         try {
378             any23.extract(source, cth1);
379         } catch (ExtractionException e) {
380             Assert.assertTrue(e.getCause() instanceof RDFParseException);
381         }
382 
383     }
384 
385     /**
386      * Test correct management of general <i>XML</i> content.
387      * 
388      * @throws org.apache.any23.extractor.ExtractionException
389      *             if there is an error running extraction logic
390      * @throws IOException
391      *             if there is an error reading the input
392      */
393     @Test
394     public void testXMLMimeTypeManagement() throws IOException, ExtractionException {
395         final String documentIRI = "http://www.test.com/resource.xml";
396         final String contentType = "application/xml";
397         final String in = StreamUtils.asString(this.getClass().getResourceAsStream("any23-xml-mimetype.xml"));
398         final DocumentSource doc = new StringDocumentSource(in, documentIRI, contentType);
399         final Any23 any23 = new Any23();
400         final CountingTripleHandler cth = new CountingTripleHandler(false);
401         final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
402         final ExtractionReport report = any23.extract(doc, rth);
403         Assert.assertFalse(report.hasMatchingExtractors());
404         Assert.assertEquals(0, cth.getCount());
405     }
406 
407     /**
408      * Test correct management of general <i>XML</i> content from <i>URL</i> source.
409      * 
410      * @throws org.apache.any23.extractor.ExtractionException
411      *             if there is an error running extraction logic
412      * @throws IOException
413      *             if there is an error reading the input
414      */
415     @Test
416     public void testXMLMimeTypeManagementViaURL() throws IOException, ExtractionException {
417         assumeOnlineAllowed();
418         final Any23 any23 = new Any23();
419         any23.setHTTPUserAgent("apache-any23-test-user-agent");
420         HTTPClient client = any23.getHTTPClient();
421         HTTPClientConfiguration configuration = new DefaultHTTPClientConfiguration("application/xml");
422         client.init(configuration);
423         final CountingTripleHandler cth = new CountingTripleHandler(false);
424         final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
425         final ExtractionReport report = any23.extract("http://www.legislation.gov.uk/ukpga/2015/17/section/4/data.xml",
426                 rth);
427         Assert.assertFalse(report.hasMatchingExtractors());
428         Assert.assertEquals(0, cth.getCount());
429     }
430 
431     @Test
432     public void testBlankNodesViaURL() throws IOException, ExtractionException {
433         assumeOnlineAllowed();
434         final Any23 any23 = new Any23();
435         any23.setHTTPUserAgent("apache-any23-test-user-agent");
436         final CountingTripleHandler cth = new CountingTripleHandler(false);
437         final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
438         final ExtractionReport report = any23.extract("https://www.w3.org/", rth);
439         Assert.assertTrue(report.hasMatchingExtractors());
440     }
441 
442     @Test
443     public void testMicrodataSupport() throws Exception {
444         final String htmlWithMicrodata = IOUtils
445                 .toString(getClass().getResourceAsStream("/microdata/microdata-basic.html"), StandardCharsets.UTF_8);
446         assertExtractorActivation(htmlWithMicrodata, MicrodataExtractor.class);
447     }
448 
449     @Test
450     public void testAbstractMethodErrorIssue186_1() throws IOException, ExtractionException {
451         final Any23 runner = new Any23();
452         final String content = FileUtils.readResourceContent("/html/rdfa/rdfa-issue186-1.xhtml");
453         final DocumentSource source = new StringDocumentSource(content, "http://base.com");
454         final ByteArrayOutputStream out = new ByteArrayOutputStream();
455         final TripleHandler handler = new NTriplesWriter(out);
456         runner.extract(source, handler);
457         String n3 = out.toString("UTF-8");
458         logger.debug(n3);
459     }
460 
461     @Test
462     public void testAbstractMethodErrorIssue186_2() throws IOException, ExtractionException {
463         final Any23 runner = new Any23();
464         final String content = FileUtils.readResourceContent("/html/rdfa/rdfa-issue186-2.xhtml");
465         final DocumentSource source = new StringDocumentSource(content, "http://richard.cyganiak.de/");
466         final ByteArrayOutputStream out = new ByteArrayOutputStream();
467         final TripleHandler handler = new NTriplesWriter(out);
468         runner.extract(source, handler);
469         final String n3 = out.toString("UTF-8");
470         logger.debug(n3);
471     }
472 
473     @Test
474     public void testModifiableConfiguration_issue183() throws Exception {
475         final ModifiableConfiguration modifiableConf = DefaultConfiguration.copy();
476         modifiableConf.setProperty("any23.extraction.metadata.timesize", "off");
477         final Any23 any23 = new Any23(modifiableConf);
478 
479         final String content = FileUtils.readResourceContent("/rdf/rdf-issue183.ttl");
480         final DocumentSource source = new StringDocumentSource(content, "http://base.com");
481         final ByteArrayOutputStream out = new ByteArrayOutputStream();
482         final TripleHandler handler = new NTriplesWriter(out);
483         any23.extract(source, handler);
484         handler.close();
485         final String n3 = out.toString("UTF-8");
486 
487         logger.debug(n3);
488         Assert.assertFalse("Should not contain triple with http://vocab.sindice.net/date",
489                 n3.contains("http://vocab.sindice.net/date"));
490         Assert.assertFalse("Should not contain triple with http://vocab.sindice.net/size",
491                 n3.contains("http://vocab.sindice.net/size"));
492     }
493 
494     @Test
495     public void testIssue415InvalidNTriples() throws Exception {
496         NTriplesExtractorFactory factory = new NTriplesExtractorFactory();
497         Any23 runner = new Any23(new ExtractorGroup(Collections.singleton(factory)));
498 
499         ExtractionReport report = runner.extract(IOUtils.resourceToString("/rdf/issue415.txt", StandardCharsets.UTF_8),
500                 "http://humanstxt.org/humans.txt", new CompositeTripleHandler());
501         Assert.assertEquals("text/plain", report.getDetectedMimeType());
502         Assert.assertEquals(0, report.getExtractorIssues(factory.getExtractorName()).size());
503         Assert.assertEquals(0, report.getMatchingExtractors().size());
504     }
505 
506     @Test
507     public void testIssue415ValidNTriples() throws Exception {
508         NTriplesExtractorFactory factory = new NTriplesExtractorFactory();
509         Any23 runner = new Any23(new ExtractorGroup(Collections.singleton(factory)));
510 
511         CountingTripleHandler handler = new CountingTripleHandler();
512         ExtractionReport report = runner.extract(
513                 IOUtils.resourceToString("/rdf/issue415-valid.txt", StandardCharsets.UTF_8),
514                 "http://humanstxt.org/humans.txt", handler);
515         Assert.assertEquals("application/n-triples", report.getDetectedMimeType());
516         Assert.assertEquals(0, report.getExtractorIssues(factory.getExtractorName()).size());
517         Assert.assertEquals(1, report.getMatchingExtractors().size());
518         Assert.assertEquals(1, handler.getCount());
519     }
520 
521     /**
522      * Test whether the {@link Any23} facade can be overloaded with a intentional <code>rdf-xml</code> extactor
523      * (mis)configuration which we then attempt to use to process a <code>application/xhtml+xml</code> document. The
524      * expected behaviour is that the call to
525      * {@link org.apache.any23.extractor.SingleDocumentExtraction#run(ExtractionParameters)} will ultimately filter the
526      * extractors based upon the detected mimetype. This results in no available extractors and a largely empty
527      * {@link ExtractionReport}.
528      * 
529      * @throws Exception
530      *             if a extraction anomaly arises
531      */
532     @Test
533     public void testMisconfiguredAny23FacadeForInputData() throws Exception {
534         Any23 runner = new Any23("rdf-xml");
535         CountingTripleHandler handler = new CountingTripleHandler();
536         ExtractionReport report = runner.extract(
537                 IOUtils.resourceToString("/html/BBC_News_Scotland.html", StandardCharsets.UTF_8),
538                 "http://www.bbc.co.uk/news/scotland/", handler);
539         Assert.assertEquals("application/xhtml+xml", report.getDetectedMimeType());
540         Assert.assertEquals(0, report.getExtractorIssues("rdf-xml").size());
541         Assert.assertEquals(0, report.getMatchingExtractors().size());
542         Assert.assertEquals(0, handler.getCount());
543         Assert.assertEquals(report.getValidationReport().getClass().getName(),
544                 "org.apache.any23.validator.EmptyValidationReport");
545     }
546 
547     /**
548      * Performs detection and extraction on the given input string and return the {@link ExtractionReport}.
549      * 
550      * @param in
551      *            input string.
552      * 
553      * @return a populated {@link org.apache.any23.ExtractionReport}
554      * 
555      * @throws Exception
556      *             if there is an error detecting mime type and running extraction
557      */
558     private ExtractionReport detectAndExtract(String in) throws Exception {
559         Any23 any23 = new Any23();
560         Configuration conf = DefaultConfiguration.copy();
561         ByteArrayOutputStream out = new ByteArrayOutputStream();
562         ReportingTripleHandler outputHandler = new ReportingTripleHandler(
563                 new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(new NTriplesWriter(out))));
564         return any23.extract(new ExtractionParameters(conf, ValidationMode.VALIDATE_AND_FIX, null, null),
565                 new StringDocumentSource(in, "http://host.com/path"), outputHandler, "UTF-8");
566     }
567 
568     /**
569      * Asserts that a list an {@link Extractor} has been activated for the given input data.
570      * 
571      * @param in
572      *            input data as string.
573      * 
574      * @throws IOException
575      * @throws ExtractionException
576      */
577     private void assertDetectionAndExtraction(String in) throws Exception {
578         final ExtractionReport extractionReport = detectAndExtract(in);
579         Assert.assertTrue("Detection and extraction failed, no matching extractors.",
580                 extractionReport.hasMatchingExtractors());
581     }
582 
583     /**
584      * Assert the correct activation of the given list of {@link Extractor}s for the given input string.
585      * 
586      * @param in
587      *            input data as string.
588      * @param expectedExtractors
589      * 
590      * @throws IOException
591      * @throws ExtractionException
592      */
593     private void assertExtractorActivation(String in,
594             @SuppressWarnings("rawtypes") Class<? extends Extractor>... expectedExtractors) throws Exception {
595         final ExtractionReport extractionReport = detectAndExtract(in);
596         for (@SuppressWarnings("rawtypes")
597         Class<? extends Extractor> expectedExtractorClass : expectedExtractors) {
598             Assert.assertTrue(
599                     String.format(Locale.ROOT, "Detection and extraction failed, expected extractor [%s] not found.",
600                             expectedExtractorClass),
601                     containsClass(extractionReport.getMatchingExtractors(), expectedExtractorClass));
602         }
603     }
604 
605     /**
606      * Asserts the correct encoding detection for a specified data.
607      * 
608      * @param encoding
609      *            the expected specified encoding, if <code>null</code> will be auto detected.
610      * @param input
611      * @param expectedContent
612      * 
613      * @throws Exception
614      */
615     private void assertEncodingDetection(String encoding, String input, String expectedContent) throws Exception {
616         DocumentSource fileDocumentSource = getDocumentSourceFromResource(input);
617         Any23 any23;
618         RepositoryConnection conn = null;
619         RepositoryWriter repositoryWriter = null;
620 
621         any23 = new Any23();
622         Repository store = new SailRepository(new MemoryStore());
623         store.init();
624         try {
625             conn = store.getConnection();
626             repositoryWriter = new RepositoryWriter(conn);
627             Assert.assertTrue(any23.extract(fileDocumentSource, repositoryWriter, encoding).hasMatchingExtractors());
628 
629             RepositoryResult<Statement> statements = conn.getStatements(null, vDCTERMS.title, null, false);
630             try {
631                 while (statements.hasNext()) {
632                     Statement statement = statements.next();
633                     printStatement(statement);
634                     Assert.assertTrue(statement.getObject().stringValue().contains(expectedContent));
635                 }
636             } finally {
637                 statements.close();
638             }
639         } finally {
640             if (conn != null) {
641                 conn.close();
642             }
643             if (repositoryWriter != null) {
644                 repositoryWriter.close();
645             }
646         }
647         fileDocumentSource = null;
648         any23 = null;
649     }
650 
651     /**
652      * Will try to detect the <i>content</i> trying sequentially with all specified parser.
653      * 
654      * @param content
655      * @param parsers
656      * 
657      * @throws Exception
658      */
659     private void assertDetection(String content, String... parsers) throws Exception {
660         ByteArrayOutputStream out = new ByteArrayOutputStream();
661         Any23 runner = new Any23(parsers.length == 0 ? null : parsers);
662         if (parsers.length != 0) {
663             runner.setMIMETypeDetector(null); // Use all the provided
664                                               // extractors.
665         }
666         final NTriplesWriter tripleHandler = new NTriplesWriter(out);
667         runner.extract(new StringDocumentSource(content, PAGE_URL), tripleHandler);
668         tripleHandler.close();
669         String result = out.toString("us-ascii");
670         Assert.assertNotNull(result);
671         Assert.assertTrue(result.length() > 10);
672     }
673 
674     private void printStatement(Statement statement) {
675         logger.debug(String.format(Locale.ROOT, "%s\t%s\t%s", statement.getSubject(), statement.getPredicate(),
676                 statement.getObject()));
677     }
678 
679     private boolean containsClass(List<?> list, Class<?> clazz) {
680         for (Object o : list) {
681             if (o.getClass().equals(clazz)) {
682                 return true;
683             }
684         }
685         return false;
686     }
687 
688 }