1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.xpath;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.Extractor;
25 import org.apache.any23.extractor.ExtractorDescription;
26 import org.eclipse.rdf4j.model.IRI;
27 import org.w3c.dom.Document;
28
29 import java.io.IOException;
30 import java.util.ArrayList;
31 import java.util.List;
32
33
34
35
36
37
38
39
40
41 public class XPathExtractor implements Extractor.TagSoupDOMExtractor {
42
43 private final List<XPathExtractionRule> xPathExtractionRules = new ArrayList<>();
44
45 public XPathExtractor() {
46
47 }
48
49 public XPathExtractor(List<XPathExtractionRule> rules) {
50 xPathExtractionRules.addAll(rules);
51 }
52
53 public void add(XPathExtractionRule rule) {
54 xPathExtractionRules.add(rule);
55 }
56
57 public void remove(XPathExtractionRule rule) {
58 xPathExtractionRules.remove(rule);
59 }
60
61 public boolean contains(XPathExtractionRule rule) {
62 return xPathExtractionRules.contains(rule);
63 }
64
65 @Override
66 public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
67 ExtractionResult out) throws IOException, ExtractionException {
68 final IRI documentIRI = extractionContext.getDocumentIRI();
69 for (XPathExtractionRule rule : xPathExtractionRules) {
70 if (rule.acceptIRI(documentIRI)) {
71 rule.process(in, out);
72 }
73 }
74 }
75
76 @Override
77 public ExtractorDescription getDescription() {
78 return XPathExtractorFactory.getDescriptionInstance();
79 }
80
81 }