1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.IssueReport;
21 import org.apache.any23.extractor.ExtractionContext;
22 import org.apache.any23.extractor.ExtractionException;
23 import org.apache.any23.extractor.ExtractionParameters;
24 import org.apache.any23.extractor.ExtractionResult;
25 import org.apache.any23.extractor.Extractor;
26 import org.apache.any23.extractor.ExtractorDescription;
27 import org.apache.any23.extractor.rdf.RDFParserFactory;
28 import org.eclipse.rdf4j.model.IRI;
29 import org.eclipse.rdf4j.rio.RDFParseException;
30 import org.eclipse.rdf4j.rio.RDFParser;
31 import org.w3c.dom.Document;
32 import org.w3c.dom.Node;
33
34 import java.io.IOException;
35 import java.io.StringReader;
36 import java.util.Arrays;
37 import java.util.List;
38 import java.util.Locale;
39
40
41
42
43
44
45
46
47 public class TurtleHTMLExtractor implements Extractor.TagSoupDOMExtractor {
48
49 private RDFParser turtleParser;
50
51 @Override
52 public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
53 ExtractionResult out) throws IOException, ExtractionException {
54 List<Node> scriptNodes;
55 HTMLDocumentocument.html#HTMLDocument">HTMLDocument htmlDocument = new HTMLDocument(in);
56 final IRI documentIRI = extractionContext.getDocumentIRI();
57
58 scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/turtle')]");
59 processScriptNodes(documentIRI, extractionContext, out, scriptNodes);
60
61 scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/n3')]");
62 processScriptNodes(documentIRI, extractionContext, out, scriptNodes);
63
64 scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/plain')]");
65 processScriptNodes(documentIRI, extractionContext, out, scriptNodes);
66 }
67
68 @Override
69 public ExtractorDescription getDescription() {
70 return TurtleHTMLExtractorFactory.getDescriptionInstance();
71 }
72
73
74
75
76
77
78
79
80
81
82
83 private void processScriptNodes(IRI documentIRI, ExtractionContext ec, ExtractionResult er, List<Node> ns) {
84 if (ns.size() > 0 && turtleParser == null) {
85 turtleParser = RDFParserFactory.getInstance().getTurtleParserInstance(true, false, ec, er);
86 }
87 for (Node n : ns) {
88 processScriptNode(turtleParser, documentIRI, n, er);
89 }
90 }
91
92
93
94
95
96
97
98
99
100
101
102
103
104 private void processScriptNode(RDFParser turtleParser, IRI documentIRI, Node n, ExtractionResult er) {
105 final Node idAttribute = n.getAttributes().getNamedItem("id");
106 final String graphName = documentIRI.stringValue()
107 + (idAttribute == null ? "" : "#" + idAttribute.getTextContent());
108 try {
109 turtleParser.parse(new StringReader(n.getTextContent()), graphName);
110 } catch (RDFParseException rdfpe) {
111 er.notifyIssue(IssueReport.IssueLevel.ERROR,
112 String.format(Locale.ROOT, "An error occurred while parsing turtle content within script node: %s",
113 Arrays.toString(DomUtils.getXPathListForNode(n))),
114 rdfpe.getLineNumber(), rdfpe.getColumnNumber());
115 } catch (Exception e) {
116 er.notifyIssue(IssueReport.IssueLevel.ERROR, "An error occurred while processing RDF data.", -1, -1);
117 }
118 }
119
120 }