1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.microdata;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.Extractor;
25 import org.apache.any23.extractor.ExtractorDescription;
26 import org.apache.any23.extractor.IssueReport;
27 import org.apache.any23.rdf.RDFUtils;
28 import org.eclipse.rdf4j.common.net.ParsedIRI;
29 import org.eclipse.rdf4j.model.IRI;
30 import org.eclipse.rdf4j.model.Resource;
31 import org.eclipse.rdf4j.model.Value;
32 import org.eclipse.rdf4j.model.vocabulary.RDF;
33 import org.w3c.dom.Document;
34
35 import java.io.IOException;
36 import java.net.URISyntaxException;
37 import java.util.HashMap;
38 import java.util.List;
39 import java.util.Map;
40 import java.util.Optional;
41
42
43
44
45
46
47
48
49
50 public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
51
52 static final IRI MICRODATA_ITEM = RDFUtils.iri("http://www.w3.org/1999/xhtml/microdata#item");
53
54 private static final ParsedIRI EMPTY_FRAG = ParsedIRI.create("#");
55
56 @Override
57 public ExtractorDescription getDescription() {
58 return MicrodataExtractorFactory.getDescriptionInstance();
59 }
60
61
62
63
64
65 @Override
66 public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
67 ExtractionResult out) throws IOException, ExtractionException {
68
69 final MicrodataParserReport parserReport = MicrodataParser.getMicrodata(in);
70 if (parserReport.getErrors().length > 0) {
71 notifyError(parserReport.getErrors(), out);
72 }
73 final ItemScope[] itemScopes = parserReport.getDetectedItemScopes();
74 if (itemScopes.length == 0) {
75 return;
76 }
77
78 final IRI documentIRI = extractionContext.getDocumentIRI();
79 final ParsedIRI parsedDocumentIRI = ParsedIRI.create(documentIRI.stringValue());
80
81 boolean isStrict = extractionParameters.getFlag("any23.microdata.strict");
82 final IRI defaultNamespace;
83 if (!isStrict) {
84 defaultNamespace = RDFUtils.iri(extractionParameters.getProperty("any23.microdata.ns.default"));
85 if (!defaultNamespace.getLocalName().isEmpty()) {
86 throw new IllegalArgumentException("invalid namespace IRI: " + defaultNamespace);
87 }
88 } else {
89
90 defaultNamespace = RDFUtils.iri(parsedDocumentIRI.resolve(EMPTY_FRAG).toString());
91 }
92
93
94 final Map<ItemScope, Resource> mappings = new HashMap<>();
95 for (ItemScope itemScope : itemScopes) {
96 Resource subject = processType(itemScope, parsedDocumentIRI, out, mappings, defaultNamespace);
97
98
99
100 out.writeTriple(documentIRI, MICRODATA_ITEM, subject);
101 }
102 }
103
104
105
106
107
108 private Resource processType(ItemScope itemScope, ParsedIRI documentIRI, ExtractionResult out,
109 Map<ItemScope, Resource> mappings, IRI defaultNamespace) throws ExtractionException {
110 Resource subject = mappings.computeIfAbsent(itemScope,
111 scope -> createSubjectForItemId(documentIRI, scope.getItemId()));
112
113 List<IRI> itemScopeTypes = itemScope.getTypes();
114 if (!itemScopeTypes.isEmpty()) {
115 defaultNamespace = getNamespaceIRI(itemScopeTypes.get(0));
116 for (IRI type : itemScopeTypes) {
117 out.writeTriple(subject, RDF.TYPE, type);
118 }
119 }
120 for (Map.Entry<String, List<ItemProp>> itemProps : itemScope.getProperties().entrySet()) {
121 String propName = itemProps.getKey();
122 IRI predicate = getPredicate(defaultNamespace, propName);
123 if (predicate == null) {
124 continue;
125 }
126 for (ItemProp itemProp : itemProps.getValue()) {
127 try {
128 processProperty(subject, predicate, itemProp, documentIRI, mappings, out, defaultNamespace);
129 } catch (URISyntaxException e) {
130 throw new ExtractionException(
131 "Error while processing on subject '" + subject + "' the itemProp: '" + itemProp + "' ");
132 }
133 }
134 }
135 return subject;
136 }
137
138 private static Resource createSubjectForItemId(ParsedIRI documentIRI, String itemId) {
139 if (itemId == null) {
140 return RDFUtils.bnode();
141 }
142 try {
143 return toAbsoluteIRI(documentIRI, itemId);
144 } catch (URISyntaxException e) {
145 return RDFUtils.bnode();
146 }
147 }
148
149 private void processProperty(Resource subject, IRI predicate, ItemProp itemProp, ParsedIRI documentIRI,
150 Map<ItemScope, Resource> mappings, ExtractionResult out, IRI defaultNamespace)
151 throws URISyntaxException, ExtractionException {
152
153 Value value;
154 Object propValue = itemProp.getValue().getContent();
155 ItemPropValue.Type propType = itemProp.getValue().getType();
156 if (itemProp.getValue().literal != null) {
157 value = itemProp.getValue().literal;
158 } else if (propType.equals(ItemPropValue.Type.Nested)) {
159 value = processType((ItemScope) propValue, documentIRI, out, mappings, defaultNamespace);
160 } else if (propType.equals(ItemPropValue.Type.Link)) {
161 value = toAbsoluteIRI(documentIRI, (String) propValue);
162
163 if (predicate.stringValue().equals("http://schema.org/additionalType")) {
164 if (itemProp.reverse) {
165 out.writeTriple((Resource) value, RDF.TYPE, subject);
166 } else {
167 out.writeTriple(subject, RDF.TYPE, value);
168 }
169 }
170 } else {
171 throw new RuntimeException(
172 "Invalid Type '" + propType + "' for ItemPropValue with name: '" + predicate + "'");
173 }
174 if (itemProp.reverse) {
175 out.writeTriple((Resource) value, predicate, subject);
176 } else {
177 out.writeTriple(subject, predicate, value);
178 }
179 }
180
181 private static final String hcardPrefix = "http://microformats.org/profile/hcard";
182 private static final IRI hcardNamespaceIRI = RDFUtils.iri("http://microformats.org/profile/hcard#");
183
184 private static IRI getNamespaceIRI(IRI itemType) {
185
186 return itemType.stringValue().startsWith(hcardPrefix) ? hcardNamespaceIRI : itemType;
187 }
188
189 private static IRI getPredicate(IRI namespaceIRI, String localName) {
190 return toAbsoluteIRI(localName).orElseGet(
191 () -> namespaceIRI == null ? null : RDFUtils.iri(namespaceIRI.getNamespace(), localName.trim()));
192 }
193
194 private static Optional<IRI> toAbsoluteIRI(String urlString) {
195 if (urlString != null) {
196 try {
197 ParsedIRI iri = ParsedIRI.create(urlString.trim());
198 if (iri.isAbsolute()) {
199 return Optional.of(RDFUtils.iri(iri.toString()));
200 }
201 } catch (RuntimeException e) {
202
203 }
204 }
205 return Optional.empty();
206 }
207
208 private static IRI toAbsoluteIRI(ParsedIRI documentIRI, String part) throws URISyntaxException {
209 try {
210 return RDFUtils.iri(documentIRI.resolve(part.trim()));
211 } catch (RuntimeException e) {
212 if (e.getCause() instanceof URISyntaxException) {
213 throw (URISyntaxException) e.getCause();
214 } else {
215 throw new URISyntaxException(String.valueOf(part),
216 e.getClass().getName() + (e.getMessage() != null ? ": " + e.getMessage() : ""));
217 }
218 }
219 }
220
221 private void notifyError(MicrodataParserException[] errors, ExtractionResult out) {
222 for (MicrodataParserException mpe : errors) {
223 out.notifyIssue(IssueReport.IssueLevel.ERROR, mpe.toJSON(), mpe.getErrorLocationBeginRow(),
224 mpe.getErrorLocationBeginCol());
225 }
226 }
227
228 }