1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.ExtractorDescription;
25 import org.apache.any23.extractor.IssueReport;
26 import org.apache.any23.vocab.XHTML;
27 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
28 import org.eclipse.rdf4j.model.IRI;
29 import org.w3c.dom.Document;
30 import org.w3c.dom.Node;
31
32 import java.io.IOException;
33 import java.util.Locale;
34
35
36
37
38
39
40
41 public class LicenseExtractor implements TagSoupDOMExtractor {
42
43 private static final XHTML vXHTML = XHTML.getInstance();
44
45 @Override
46 public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
47 ExtractionResult out) throws IOException, ExtractionException {
48 HTMLDocumentTMLDocument.html#HTMLDocument">HTMLDocument document = new HTMLDocument(in);
49 final IRI documentIRI = extractionContext.getDocumentIRI();
50 for (Node node : DomUtils.findAll(in, "//A[@rel='license']/@href")) {
51 String link = node.getNodeValue();
52 if ("".equals(link)) {
53 out.notifyIssue(IssueReport.IssueLevel.WARNING, String.format(Locale.ROOT,
54 "Invalid license link detected within document %s.", documentIRI.toString()), 0, 0);
55 continue;
56 }
57 out.writeTriple(documentIRI, vXHTML.license, document.resolveIRI(link));
58 }
59 }
60
61 @Override
62 public ExtractorDescription getDescription() {
63 return LicenseExtractorFactory.getDescriptionInstance();
64 }
65
66 }