1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.plugin.htmlscraper;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.junit.After;
25 import org.junit.Assert;
26 import org.junit.Before;
27 import org.junit.Test;
28 import org.eclipse.rdf4j.model.IRI;
29 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
30
31 import java.io.IOException;
32 import java.io.InputStream;
33 import java.util.Arrays;
34 import java.util.HashSet;
35
36 import static org.mockito.Mockito.any;
37 import static org.mockito.Mockito.eq;
38 import static org.mockito.Mockito.mock;
39 import static org.mockito.Mockito.verify;
40
41
42
43
44
45
46 public class HTMLScraperExtractorTest {
47
48 private HTMLScraperExtractor extractor;
49
50 @Before
51 public void setUp() {
52 extractor = new HTMLScraperExtractorFactory().createExtractor();
53 }
54
55 @After
56 public void tearDown() {
57 extractor = null;
58 }
59
60 @Test
61 public void testGetExtractors() {
62 final String[] extractors = extractor.getTextExtractors();
63 Assert.assertEquals( new HashSet<>(Arrays.asList(extractors)).size(), 4 );
64 }
65
66 @Test
67 public void testRun() throws IOException, ExtractionException {
68 final InputStream is = this.getClass().getResourceAsStream("html-scraper-extractor-test.html");
69 final ExtractionResult extractionResult = mock(ExtractionResult.class);
70 final IRI pageIRI = SimpleValueFactory.getInstance().createIRI("http://fake/test/page/testrun");
71 final ExtractionContext extractionContext = new ExtractionContext(
72 extractor.getDescription().getExtractorName(),
73 pageIRI
74 );
75 extractor.run(ExtractionParameters.newDefault(), extractionContext, is, extractionResult);
76
77 verify(extractionResult).writeTriple(
78 eq(pageIRI), eq(HTMLScraperExtractor.PAGE_CONTENT_DE_PROPERTY), any());
79 verify(extractionResult).writeTriple(
80 eq(pageIRI), eq(HTMLScraperExtractor.PAGE_CONTENT_AE_PROPERTY), any());
81 verify(extractionResult).writeTriple(
82 eq(pageIRI), eq(HTMLScraperExtractor.PAGE_CONTENT_LCE_PROPERTY), any());
83 verify(extractionResult).writeTriple(
84 eq(pageIRI), eq(HTMLScraperExtractor.PAGE_CONTENT_CE_PROPERTY), any());
85 }
86
87 }