1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.ExtractorDescription;
25 import org.apache.any23.vocab.XHTML;
26 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
27 import org.eclipse.rdf4j.model.IRI;
28 import org.eclipse.rdf4j.model.ValueFactory;
29 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
30 import org.w3c.dom.Document;
31 import org.w3c.dom.Node;
32
33 import java.io.IOException;
34 import java.util.List;
35
36
37
38
39
40 public class HeadLinkExtractor implements TagSoupDOMExtractor {
41
42 public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
43 ExtractionResult out) throws IOException, ExtractionException {
44 HTMLDocument html = new HTMLDocument(in);
45 ValueFactory vf = SimpleValueFactory.getInstance();
46
47 final List<Node> headLinkNodes = DomUtils.findAll(in,
48 "/HTML/HEAD/LINK[(" + "@type='application/rdf+xml' or " + "@type='text/rdf' or "
49 + "@type='application/x-turtle' or " + "@type='application/turtle' or "
50 + "@type='text/turtle' or " + "@type='text/rdf+n3'" + ") and @href and @rel]");
51 for (Node node : headLinkNodes) {
52 final IRI href = html.resolveIRI(DomUtils.find(node, "@href"));
53 final String rel = DomUtils.find(node, "@rel");
54 out.writeTriple(extractionContext.getDocumentIRI(), vf.createIRI(XHTML.NS + rel), href);
55 final String title = DomUtils.find(node, "@title");
56 if (title != null && !"".equals(title)) {
57 out.writeTriple(href, getDescription().getPrefixes().expand("dcterms:title"), vf.createLiteral(title));
58 }
59 final String type = DomUtils.find(node, "@type");
60 if (type != null && !"".equals(type)) {
61 out.writeTriple(href, getDescription().getPrefixes().expand("dcterms:format"), vf.createLiteral(type));
62 }
63 }
64 }
65
66 @Override
67 public ExtractorDescription getDescription() {
68 return HeadLinkExtractorFactory.getDescriptionInstance();
69 }
70
71 }