1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.Extractor;
25 import org.apache.any23.extractor.ExtractorDescription;
26 import org.apache.any23.rdf.RDFUtils;
27 import org.apache.any23.vocab.SINDICE;
28 import org.eclipse.rdf4j.model.IRI;
29 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
30 import org.w3c.dom.Document;
31 import org.w3c.dom.NamedNodeMap;
32 import org.w3c.dom.Node;
33
34 import java.io.IOException;
35 import java.util.HashMap;
36 import java.util.HashSet;
37 import java.util.List;
38 import java.util.Map;
39 import java.util.Set;
40
41
42
43
44
45
46
47 public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {
48
49 private static final SINDICE vSINDICE = SINDICE.getInstance();
50
51 private IRI profile;
52
53 private Map<String, IRI> prefixes = new HashMap<>();
54
55 private String documentLang;
56
57
58
59
60 @Override
61 public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
62 ExtractionResult out) throws IOException, ExtractionException {
63 profile = extractProfile(in);
64 documentLang = getDocumentLanguage(in);
65 extractLinkDefinedPrefixes(in);
66
67 String baseProfile = vSINDICE.NS;
68 if (profile != null) {
69 baseProfile = profile.toString();
70 }
71
72 final IRI documentIRI = extractionContext.getDocumentIRI();
73 Set<Meta> metas = extractMetaElement(in, baseProfile);
74 for (Meta meta : metas) {
75 String lang = documentLang;
76 if (meta.getLang() != null) {
77 lang = meta.getLang();
78 }
79 if (meta.isPragmaDirective) {
80 if (lang != null) {
81 out.writeTriple(documentIRI, meta.getHttpEquiv(),
82 SimpleValueFactory.getInstance().createLiteral(meta.getContent(), lang));
83 } else {
84 out.writeTriple(documentIRI, meta.getHttpEquiv(),
85 SimpleValueFactory.getInstance().createLiteral(meta.getContent()));
86 }
87 } else {
88 if (lang != null) {
89 out.writeTriple(documentIRI, meta.getName(),
90 SimpleValueFactory.getInstance().createLiteral(meta.getContent(), lang));
91 } else {
92 out.writeTriple(documentIRI, meta.getName(),
93 SimpleValueFactory.getInstance().createLiteral(meta.getContent()));
94 }
95 }
96 }
97 }
98
99
100
101
102
103
104
105
106
107 private String getDocumentLanguage(Document in) {
108 String lang = DomUtils.find(in, "string(/HTML/@lang)");
109 if ("".equals(lang)) {
110 return null;
111 }
112 return lang;
113 }
114
115 private IRI extractProfile(Document in) {
116 String profile = DomUtils.find(in, "string(/HTML/@profile)");
117 if ("".equals(profile)) {
118 return null;
119 }
120 return SimpleValueFactory.getInstance().createIRI(profile);
121 }
122
123
124
125
126
127
128 private void extractLinkDefinedPrefixes(Document in) {
129 List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
130 for (Node linkNode : linkNodes) {
131 NamedNodeMap attributes = linkNode.getAttributes();
132 Node relNode = attributes.getNamedItem("rel");
133 String rel = relNode == null ? null : relNode.getTextContent();
134 Node hrefNode = attributes.getNamedItem("href");
135 String href = hrefNode == null ? null : hrefNode.getTextContent();
136 if (rel != null && href != null && RDFUtils.isAbsoluteIRI(href)) {
137 prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href));
138 }
139 }
140 }
141
142 private Set<Meta> extractMetaElement(Document in, String baseProfile) {
143 List<Node> metaNodes = DomUtils.findAll(in, "/HTML/HEAD/META");
144 Set<Meta> result = new HashSet<>();
145 for (Node metaNode : metaNodes) {
146 NamedNodeMap attributes = metaNode.getAttributes();
147 Node nameAttribute = attributes.getNamedItem("name");
148 Node httpEquivAttribute = attributes.getNamedItem("http-equiv");
149 Node contentAttribute = attributes.getNamedItem("content");
150 if (nameAttribute == null && httpEquivAttribute == null)
151 continue;
152 if (nameAttribute != null || httpEquivAttribute != null) {
153 if (contentAttribute == null) {
154 continue;
155 }
156 }
157 boolean isPragmaDirective = (httpEquivAttribute != null) ? true : false;
158 if (isPragmaDirective) {
159 String httpEquiv = httpEquivAttribute.getTextContent();
160 String content = contentAttribute.getTextContent();
161 String xpath = DomUtils.getXPathForNode(metaNode);
162 IRI httpEquivAsIRI = getPrefixIfExists(httpEquiv);
163 if (httpEquivAsIRI == null) {
164 httpEquivAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + httpEquiv);
165 }
166 Meta meta = new Meta(xpath, content, httpEquivAsIRI);
167 result.add(meta);
168 } else {
169 String name = nameAttribute.getTextContent();
170 String content = contentAttribute.getTextContent();
171 String xpath = DomUtils.getXPathForNode(metaNode);
172 IRI nameAsIRI = getPrefixIfExists(name);
173 if (nameAsIRI == null) {
174 nameAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + name);
175 }
176 Meta meta = new Meta(xpath, nameAsIRI, content);
177 result.add(meta);
178 }
179 }
180 return result;
181 }
182
183 private IRI getPrefixIfExists(String name) {
184 String[] split = name.split("\\.");
185 if (split.length == 2 && prefixes.containsKey(split[0])) {
186 return SimpleValueFactory.getInstance().createIRI(prefixes.get(split[0]) + split[1]);
187 }
188 return null;
189 }
190
191 @Override
192 public ExtractorDescription getDescription() {
193 return HTMLMetaExtractorFactory.getDescriptionInstance();
194 }
195
196 private static class Meta {
197
198 private String xpath;
199
200 private IRI name;
201
202 private IRI httpEquiv;
203
204 private String lang;
205
206 private String content;
207
208 private boolean isPragmaDirective;
209
210 public Meta(String xpath, String content, IRI httpEquiv) {
211 this.xpath = xpath;
212 this.content = content;
213 this.httpEquiv = httpEquiv;
214 this.setPragmaDirective(true);
215 }
216
217 @SuppressWarnings("unused")
218 public Meta(String xpath, String content, IRI httpEquiv, String lang) {
219 this(xpath, content, httpEquiv);
220 this.lang = lang;
221 }
222
223 public Meta(String xpath, IRI name, String content) {
224 this.xpath = xpath;
225 this.name = name;
226 this.content = content;
227 }
228
229 @SuppressWarnings("unused")
230 public Meta(String xpath, IRI name, String content, String lang) {
231 this(xpath, name, content);
232 this.lang = lang;
233 }
234
235 private void setPragmaDirective(boolean value) {
236 this.isPragmaDirective = value;
237 }
238
239 public IRI getHttpEquiv() {
240 return httpEquiv;
241 }
242
243 public IRI getName() {
244 return name;
245 }
246
247 public String getLang() {
248 return lang;
249 }
250
251 public String getContent() {
252 return content;
253 }
254
255 @Override
256 public boolean equals(Object o) {
257 if (this == o)
258 return true;
259 if (o == null || getClass() != o.getClass())
260 return false;
261
262 Meta meta = (Meta) o;
263
264 if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null)
265 return false;
266
267 return true;
268 }
269
270 @Override
271 public int hashCode() {
272 return xpath != null ? xpath.hashCode() : 0;
273 }
274 }
275
276 }