1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.Extractor;
25 import org.apache.any23.extractor.ExtractorDescription;
26 import org.apache.any23.extractor.rdf.JSONLDExtractor;
27 import org.apache.any23.extractor.rdf.JSONLDExtractorFactory;
28 import org.apache.any23.rdf.RDFUtils;
29 import org.apache.any23.vocab.SINDICE;
30 import org.apache.commons.io.IOUtils;
31 import org.eclipse.rdf4j.model.IRI;
32 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
33 import org.w3c.dom.Document;
34 import org.w3c.dom.NamedNodeMap;
35 import org.w3c.dom.Node;
36
37 import java.io.IOException;
38 import java.nio.charset.StandardCharsets;
39 import java.util.HashMap;
40 import java.util.HashSet;
41 import java.util.List;
42 import java.util.Map;
43 import java.util.Set;
44
45
46
47
48
49
50
51 public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor {
52
53 private static final SINDICE vSINDICE = SINDICE.getInstance();
54
55 private IRI profile;
56
57 private Map<String, IRI> prefixes = new HashMap<>();
58
59 private String documentLang;
60
61 private JSONLDExtractor extractor;
62
63
64
65
66 @Override
67 public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
68 ExtractionResult out) throws IOException, ExtractionException {
69 profile = extractProfile(in);
70 documentLang = getDocumentLanguage(in);
71 extractLinkDefinedPrefixes(in);
72
73 String baseProfile = vSINDICE.NS;
74 if (profile != null) {
75 baseProfile = profile.toString();
76 }
77
78 extractionContext.getDocumentIRI();
79 Set<JSONLDScript> jsonldScripts = extractJSONLDScript(in, baseProfile, extractionParameters, extractionContext,
80 out);
81 for (JSONLDScript jsonldScript : jsonldScripts) {
82
83
84
85
86
87
88 }
89 }
90
91
92
93
94
95
96
97
98
99 private String getDocumentLanguage(Document in) {
100 String lang = DomUtils.find(in, "string(/HTML/@lang)");
101 if ("".equals(lang)) {
102 return null;
103 }
104 return lang;
105 }
106
107 private IRI extractProfile(Document in) {
108 String profile = DomUtils.find(in, "string(/HTML/@profile)");
109 if ("".equals(profile)) {
110 return null;
111 }
112 return SimpleValueFactory.getInstance().createIRI(profile);
113 }
114
115
116
117
118
119
120 private void extractLinkDefinedPrefixes(Document in) {
121 List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
122 for (Node linkNode : linkNodes) {
123 NamedNodeMap attributes = linkNode.getAttributes();
124 Node relNode = attributes.getNamedItem("rel");
125 String rel = relNode == null ? null : relNode.getTextContent();
126 Node hrefNode = attributes.getNamedItem("href");
127 String href = hrefNode == null ? null : hrefNode.getTextContent();
128 if (rel != null && href != null && RDFUtils.isAbsoluteIRI(href)) {
129 prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href));
130 }
131 }
132 }
133
134 private Set<JSONLDScript> extractJSONLDScript(Document in, String baseProfile,
135 ExtractionParameters extractionParameters, ExtractionContext extractionContext, ExtractionResult out)
136 throws IOException, ExtractionException {
137 List<Node> scriptNodes = DomUtils.findAll(in, "//SCRIPT");
138 Set<JSONLDScript> result = new HashSet<>();
139 extractor = new JSONLDExtractorFactory().createExtractor();
140 for (Node jsonldNode : scriptNodes) {
141 NamedNodeMap attributes = jsonldNode.getAttributes();
142 for (int i = 0; i < attributes.getLength(); i++) {
143 if ("application/ld+json".equalsIgnoreCase(attributes.item(i).getTextContent())) {
144 extractor.run(extractionParameters, extractionContext,
145 IOUtils.toInputStream(jsonldNode.getTextContent(), StandardCharsets.UTF_8), out);
146 }
147 }
148 Node nameAttribute = attributes.getNamedItem("name");
149 Node contentAttribute = attributes.getNamedItem("content");
150 if (nameAttribute == null || contentAttribute == null) {
151 continue;
152 }
153 String name = nameAttribute.getTextContent();
154 String content = contentAttribute.getTextContent();
155 String xpath = DomUtils.getXPathForNode(jsonldNode);
156 IRI nameAsIRI = getPrefixIfExists(name);
157 if (nameAsIRI == null) {
158 nameAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + name);
159 }
160 JSONLDScript jsonldScript = new JSONLDScript(xpath, nameAsIRI, content);
161 result.add(jsonldScript);
162 }
163 return result;
164 }
165
166 private IRI getPrefixIfExists(String name) {
167 String[] split = name.split("\\.");
168 if (split.length == 2 && prefixes.containsKey(split[0])) {
169 return SimpleValueFactory.getInstance().createIRI(prefixes.get(split[0]) + split[1]);
170 }
171 return null;
172 }
173
174 @Override
175 public ExtractorDescription getDescription() {
176 return EmbeddedJSONLDExtractorFactory.getDescriptionInstance();
177 }
178
179 private static class JSONLDScript {
180
181 private String xpath;
182
183 public JSONLDScript(String xpath, IRI name, String content) {
184 this.xpath = xpath;
185 }
186
187 @Override
188 public boolean equals(Object o) {
189 if (this == o) {
190 return true;
191 }
192 if (o == null) {
193 return false;
194 }
195 if (!(o instanceof JSONLDScript)) {
196 return false;
197 }
198
199 JSONLDScript meta = (JSONLDScript) o;
200
201 if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null) {
202 return false;
203 }
204
205 return true;
206 }
207
208 @Override
209 public int hashCode() {
210 return xpath != null ? xpath.hashCode() : 0;
211 }
212 }
213
214 }