1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.plugin.htmlscraper;
19
20 import de.l3s.boilerpipe.BoilerpipeExtractor;
21 import de.l3s.boilerpipe.BoilerpipeProcessingException;
22 import de.l3s.boilerpipe.extractors.ArticleExtractor;
23 import de.l3s.boilerpipe.extractors.CanolaExtractor;
24 import de.l3s.boilerpipe.extractors.DefaultExtractor;
25 import de.l3s.boilerpipe.extractors.LargestContentExtractor;
26 import org.apache.any23.extractor.ExtractionContext;
27 import org.apache.any23.extractor.ExtractionException;
28 import org.apache.any23.extractor.ExtractionParameters;
29 import org.apache.any23.extractor.ExtractionResult;
30 import org.apache.any23.extractor.Extractor;
31 import org.apache.any23.extractor.ExtractorDescription;
32 import org.apache.any23.vocab.SINDICE;
33 import org.eclipse.rdf4j.model.IRI;
34 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
35
36 import java.io.IOException;
37 import java.io.InputStream;
38 import java.io.InputStreamReader;
39 import java.util.ArrayList;
40 import java.util.List;
41
42
43
44
45
46
47 public class HTMLScraperExtractor implements Extractor.ContentExtractor {
48
49 public static final IRI PAGE_CONTENT_DE_PROPERTY =
50 SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/de");
51 public static final IRI PAGE_CONTENT_AE_PROPERTY =
52 SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/ae");
53 public static final IRI PAGE_CONTENT_LCE_PROPERTY =
54 SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/lce");
55 public static final IRI PAGE_CONTENT_CE_PROPERTY =
56 SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/ce");
57
58 private final List<ExtractionRule> extractionRules = new ArrayList<>();
59
60 public HTMLScraperExtractor() {
61 loadDefaultRules();
62 }
63
64 public void addTextExtractor(String name, IRI property, BoilerpipeExtractor extractor) {
65 extractionRules.add( new ExtractionRule(name, property, extractor) );
66 }
67
68 public String[] getTextExtractors() {
69 final List<String> extractors = new ArrayList<>();
70 for(ExtractionRule er : extractionRules) {
71 extractors.add(er.name);
72 }
73 return extractors.toArray( new String[extractors.size()] );
74 }
75
76 @Override
77 public void run(
78 ExtractionParameters extractionParameters,
79 ExtractionContext extractionContext,
80 InputStream inputStream,
81 ExtractionResult extractionResult
82 ) throws IOException, ExtractionException {
83 try {
84 final IRI documentIRI = extractionContext.getDocumentIRI();
85 for (ExtractionRule extractionRule : extractionRules) {
86 final String content = extractionRule.boilerpipeExtractor.getText(new InputStreamReader(inputStream));
87 extractionResult.writeTriple(
88 documentIRI,
89 extractionRule.property,
90 SimpleValueFactory.getInstance().createLiteral(content)
91 );
92 }
93 } catch (BoilerpipeProcessingException bpe) {
94 throw new ExtractionException("Error while applying text processor " + ArticleExtractor.class, bpe);
95 }
96 }
97
98 @Override
99 public ExtractorDescription getDescription() {
100 return HTMLScraperExtractorFactory.getDescriptionInstance();
101 }
102
103 @Override
104 public void setStopAtFirstError(boolean b) {
105
106 }
107
108 private void loadDefaultRules() {
109 addTextExtractor("default-extractor" , PAGE_CONTENT_DE_PROPERTY , DefaultExtractor.getInstance());
110 addTextExtractor("article-extractor" , PAGE_CONTENT_AE_PROPERTY , ArticleExtractor.getInstance());
111 addTextExtractor("large-content-extractor", PAGE_CONTENT_LCE_PROPERTY, LargestContentExtractor.getInstance());
112 addTextExtractor("canola-extractor" , PAGE_CONTENT_CE_PROPERTY , CanolaExtractor.getInstance());
113 }
114
115
116
117
118 class ExtractionRule {
119
120 public final String name;
121 public final IRI property;
122 public final BoilerpipeExtractor boilerpipeExtractor;
123
124 ExtractionRule(String name, IRI property, BoilerpipeExtractor boilerpipeExtractor) {
125 if(name == null) {
126 throw new NullPointerException("name cannot be null.");
127 }
128 if(property == null) {
129 throw new NullPointerException("property cannot be null.");
130 }
131 if(boilerpipeExtractor == null) {
132 throw new NullPointerException("extractor cannot be null.");
133 }
134 this.name = name;
135 this.property = property;
136 this.boilerpipeExtractor = boilerpipeExtractor;
137 }
138
139 }
140 }