1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.xpath;
19
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.html.DomUtils;
import org.eclipse.rdf4j.model.IRI;
import org.w3c.dom.Document;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
31
32
33
34
35
36
37 public class TemplateXPathExtractionRuleImpl implements TemplateXPathExtractionRule {
38
39 private final String name;
40
41 private final String uriRegex;
42
43 private final Pattern uriRegexPattern;
44
45 private final List<Variable> variables;
46
47 private final List<QuadTemplate> templates;
48
49 public TemplateXPathExtractionRuleImpl(String name, String uriRegex) {
50 if (name == null) {
51 throw new NullPointerException("The rule name cannot be null.");
52 }
53
54 this.name = name;
55 this.uriRegex = uriRegex;
56
57 try {
58 uriRegexPattern = uriRegex != null ? Pattern.compile(uriRegex) : null;
59 } catch (Exception e) {
60 throw new IllegalArgumentException("Invalid value for uriRegex.", e);
61 }
62 variables = new ArrayList<Variable>();
63 templates = new ArrayList<QuadTemplate>();
64 }
65
66
67
68
69 public String getUriRegex() {
70 return uriRegex;
71 }
72
73 public void add(Variable variable) {
74 checkVariableNameNotDeclared(variable.getName());
75 variables.add(variable);
76 }
77
78 public boolean remove(Variable variable) {
79 return variables.remove(variable);
80 }
81
82 public void add(QuadTemplate template) {
83 checkTemplateVariablesDeclared(template);
84 templates.add(template);
85 }
86
87 public boolean remove(QuadTemplate template) {
88 return templates.remove(template);
89 }
90
91 public String getName() {
92 return name;
93 }
94
95 public boolean acceptIRI(IRI uri) {
96 if (uriRegexPattern == null) {
97 return true;
98 }
99 return uriRegexPattern.matcher(uri.stringValue()).find();
100 }
101
102 public void process(Document in, ExtractionResult er) {
103 final Map<String, String> varValues = new HashMap<String, String>();
104 String value;
105 for (Variable variable : variables) {
106 value = DomUtils.find(in, variable.getxPath().toUpperCase(Locale.ROOT));
107 varValues.put(variable.getName(), value);
108 }
109
110 for (QuadTemplate template : templates) {
111 template.printOut(er, varValues);
112 }
113 }
114
115 private boolean variableNameDeclared(String varName) {
116 for (Variable variable : variables) {
117 if (variable.getName().equals(varName)) {
118 return true;
119 }
120 }
121 return false;
122 }
123
124 private void checkVariableNameDeclared(String varName) {
125 if (!variableNameDeclared(varName)) {
126 throw new IllegalArgumentException(
127 String.format(Locale.ROOT, "A variable with name '%s' was not declared.", varName));
128 }
129 }
130
131 private void checkVariableNameNotDeclared(String varName) {
132 if (variableNameDeclared(varName)) {
133 throw new IllegalArgumentException(
134 String.format(Locale.ROOT, "A variable with name '%s' is already declared.", varName));
135 }
136 }
137
138 private void checkTemplateVariablesDeclared(QuadTemplate template) {
139 if (template.getSubject().isVar())
140 checkVariableNameDeclared(template.getSubject().getInternalValue());
141 if (template.getPredicate().isVar())
142 checkVariableNameDeclared(template.getPredicate().getInternalValue());
143 if (template.getObject().isVar())
144 checkVariableNameDeclared(template.getObject().getInternalValue());
145 if (template.getGraph() != null && template.getGraph().isVar()) {
146 checkVariableNameDeclared(template.getGraph().getInternalValue());
147 }
148 }
149
150 @Override
151 public String toString() {
152 final StringBuilder sb = new StringBuilder();
153 sb.append('\n');
154 sb.append("name: ").append(name).append('\n');
155 sb.append("pattern: '").append(uriRegex).append("'").append('\n');
156
157 sb.append("variables {\n");
158 for (Variable variable : variables) {
159 sb.append(variable.getName()).append(":").append(variable.getxPath()).append('\n');
160 }
161 sb.append("}\n");
162
163 sb.append("templates {\n");
164 for (QuadTemplate template : templates) {
165 sb.append(template).append('\n');
166 }
167 sb.append("}\n");
168 return sb.toString();
169 }
170 }