1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.any23.extractor.yaml;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import org.apache.any23.extractor.ExtractionContext;
22 import org.apache.any23.extractor.ExtractionException;
23 import org.apache.any23.extractor.ExtractionParameters;
24 import org.apache.any23.extractor.ExtractionResult;
25 import org.apache.any23.extractor.Extractor;
26 import org.apache.any23.extractor.ExtractorDescription;
27 import org.apache.any23.rdf.RDFUtils;
28 import org.apache.any23.vocab.YAML;
29 import org.eclipse.rdf4j.model.Resource;
30 import org.eclipse.rdf4j.model.IRI;
31 import org.eclipse.rdf4j.model.vocabulary.RDF;
32 import org.eclipse.rdf4j.model.vocabulary.RDFS;
33 import org.slf4j.Logger;
34 import org.slf4j.LoggerFactory;
35 import org.yaml.snakeyaml.Yaml;
36 import org.yaml.snakeyaml.constructor.SafeConstructor;
37
38
39
40
41 public class YAMLExtractor implements Extractor.ContentExtractor {
42
43 private final Logger log = LoggerFactory.getLogger(getClass());
44
45 private static final Yaml yml = new Yaml(new SafeConstructor());
46
47 private static final YAML vocab = YAML.getInstance();
48
49 private final ElementsProcessor ep = ElementsProcessor.getInstance();
50
51 private Resource documentRoot;
52
53 @Override
54 public void setStopAtFirstError(boolean f) {
55 }
56
57 @Override
58 public void run(ExtractionParameters extractionParameters, ExtractionContext context, InputStream in,
59 ExtractionResult out) throws IOException, ExtractionException {
60
61 IRI documentIRI = context.getDocumentIRI();
62 documentRoot = RDFUtils.iri(documentIRI.toString() + "root");
63
64 log.debug("Processing: {}", documentIRI.toString());
65 out.writeNamespace(vocab.PREFIX, vocab.NS);
66 out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
67 out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE);
68
69 out.writeTriple(documentRoot, RDF.TYPE, vocab.root);
70 Iterable<Object> docIterate = yml.loadAll(in);
71
72
73 for (Object p : docIterate) {
74 Resource pageNode = RDFUtils.makeIRI("document", documentIRI, true);
75 out.writeTriple(documentRoot, vocab.contains, pageNode);
76 out.writeTriple(pageNode, RDF.TYPE, vocab.document);
77 ElementsProcessor.ModelHolder rootNode = ep.asModel(documentIRI, p, pageNode);
78
79 if (rootNode == null) {
80 continue;
81 }
82
83 if (!rootNode.getRoot().equals(pageNode)) {
84 out.writeTriple(pageNode, vocab.contains, rootNode.getRoot());
85 }
86
87 log.debug("Subgraph root node: {}", rootNode.getRoot().stringValue());
88
89 rootNode.getModel().forEach((s) -> {
90 out.writeTriple(s.getSubject(), s.getPredicate(), s.getObject());
91 });
92
93 }
94
95 }
96
97 @Override
98 public ExtractorDescription getDescription() {
99 return YAMLExtractorFactory.getDescriptionInstance();
100 }
101
102 }