1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.rdfa;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.IssueReport;
25 import org.apache.any23.extractor.rdf.BaseRDFExtractor;
26 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
27 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
28 import org.eclipse.rdf4j.rio.helpers.RDFaParserSettings;
29 import org.jsoup.Jsoup;
30 import org.jsoup.nodes.Document;
31 import org.jsoup.parser.ParseSettings;
32 import org.jsoup.parser.Parser;
33 import org.semarglproject.rdf.rdfa.RdfaParser;
34 import org.semarglproject.rdf4j.rdf.rdfa.SemarglParserSettings;
35 import org.semarglproject.sink.XmlSink;
36 import org.semarglproject.source.StreamProcessor;
37
38 import java.io.IOException;
39 import java.io.InputStream;
40 import java.io.PrintWriter;
41 import java.io.StringWriter;
42
43
44
45
46 abstract class BaseRDFaExtractor extends BaseRDFExtractor {
47
48 private final short version;
49
50 BaseRDFaExtractor(short version) {
51 super(false, false);
52 this.version = version;
53 }
54
55 @Override
56 public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, InputStream in,
57 ExtractionResult extractionResult) throws IOException, ExtractionException {
58
59 SemarglSink rdfaSink = new SemarglSink(extractionResult, new Any23ValueFactoryWrapper(
60 SimpleValueFactory.getInstance(), extractionResult, extractionContext.getDefaultLanguage()));
61
62 XmlSink xmlSink = RdfaParser.connect(rdfaSink);
63 xmlSink.setProperty(StreamProcessor.PROCESSOR_GRAPH_HANDLER_PROPERTY, rdfaSink);
64 xmlSink.setProperty(RdfaParser.RDFA_VERSION_PROPERTY, version);
65 xmlSink.setProperty(RdfaParser.ENABLE_VOCAB_EXPANSION,
66 RDFaParserSettings.VOCAB_EXPANSION_ENABLED.getDefaultValue());
67 xmlSink.setProperty(RdfaParser.ENABLE_PROCESSOR_GRAPH,
68 SemarglParserSettings.PROCESSOR_GRAPH_ENABLED.getDefaultValue());
69
70 String baseUri = extractionContext.getDocumentIRI().stringValue();
71 xmlSink.setBaseUri(baseUri);
72 Document doc = Jsoup.parse(in, null, baseUri, Parser.htmlParser().settings(ParseSettings.preserveCase));
73 try {
74 xmlSink.startDocument();
75 doc.traverse(new JsoupScanner(xmlSink));
76 xmlSink.endDocument();
77 } catch (Exception e) {
78 extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, toString(e), -1, -1);
79 }
80 }
81
82 @SuppressWarnings("Duplicates")
83 private static String toString(Throwable th) {
84 StringWriter writer = new StringWriter();
85 try (PrintWriter pw = new PrintWriter(writer)) {
86 th.printStackTrace(pw);
87 }
88 String string = writer.toString();
89 if (string.length() > 1024) {
90 return string.substring(0, 1021) + "...";
91 }
92 return string;
93 }
94
95 }