1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.rdfa;
19
20 import org.apache.commons.lang3.ArrayUtils;
21 import org.jsoup.nodes.CDataNode;
22 import org.jsoup.nodes.Comment;
23 import org.jsoup.nodes.Element;
24 import org.jsoup.nodes.Node;
25 import org.jsoup.nodes.TextNode;
26 import org.jsoup.select.NodeVisitor;
27 import org.semarglproject.sink.XmlSink;
28 import org.xml.sax.SAXException;
29 import org.xml.sax.helpers.AttributesImpl;
30 import org.xml.sax.helpers.NamespaceSupport;
31
32 import java.util.ArrayList;
33
34
35
36
37 class JsoupScanner implements NodeVisitor {
38
39 private final NamespaceSupport ns = new NamespaceSupport();
40 private final AttributesImpl attrs = new AttributesImpl();
41 private final String[] nameParts = new String[3];
42
43 private final XmlSink handler;
44
45 JsoupScanner(XmlSink handler) {
46 this.handler = handler;
47 }
48
49 private static String orEmpty(String str) {
50 return str == null ? "" : str;
51 }
52
53 private static final String[] commonHashDelimitedVocabs = { "http://creativecommons.org/ns",
54 "http://www.w3.org/2002/07/owl", "http://www.w3.org/1999/02/22-rdf-syntax-ns", "http://www.w3.org/ns/rdfa",
55 "http://www.w3.org/2000/01/rdf-schema", "http://www.w3.org/1999/xhtml/vocab",
56 "http://www.w3.org/2001/XMLSchema", "http://microformats.org/profile/hcard",
57 "http://www.w3.org/2006/vcard/ns", "http://ogp.me/ns", "http://ogp.me/ns/music", "http://ogp.me/ns/video",
58 "http://ogp.me/ns/article", "http://ogp.me/ns/book", "http://ogp.me/ns/profile",
59 "http://ogp.me/ns/website" };
60
61 private void startElement(Element e) throws SAXException {
62 ns.pushContext();
63
64 attrs.clear();
65 final ArrayList<String> remainingAttrs = new ArrayList<>();
66 for (org.jsoup.nodes.Attribute attr : e.attributes()) {
67 String name = attr.getKey();
68 String value = attr.getValue();
69 if (name.startsWith("xmlns")) {
70 if (name.length() == 5) {
71 ns.declarePrefix("", value);
72 handler.startPrefixMapping("", value);
73 continue;
74 } else if (name.charAt(5) == ':') {
75 String localName = name.substring(6);
76 ns.declarePrefix(localName, value);
77 handler.startPrefixMapping(localName, value);
78 continue;
79 }
80 } else if (name.equalsIgnoreCase("vocab")) {
81
82 name = "vocab";
83 value = value.trim();
84 int len = value.length();
85 char lastChar;
86 if (len != 0 && (lastChar = value.charAt(len - 1)) != '/' && lastChar != '#' && lastChar != ':') {
87 if (ArrayUtils.contains(commonHashDelimitedVocabs, value)) {
88 value += "#";
89 } else {
90 value += "/";
91 }
92 }
93 }
94
95 remainingAttrs.add(name);
96 remainingAttrs.add(value);
97 }
98
99 for (int i = 0, len = remainingAttrs.size(); i < len; i += 2) {
100 String name = remainingAttrs.get(i);
101 String value = remainingAttrs.get(i + 1);
102 String[] parts = ns.processName(name, nameParts, true);
103 if (parts != null) {
104 attrs.addAttribute(orEmpty(parts[0]), orEmpty(parts[1]), parts[2], "CDATA", value);
105 }
106 }
107
108 String qName = e.tagName();
109
110 String[] parts = ns.processName(qName, nameParts, false);
111 if (parts == null) {
112 handler.startElement("", "", qName, attrs);
113 } else {
114 handler.startElement(orEmpty(parts[0]), orEmpty(parts[1]), parts[2], attrs);
115 }
116
117 }
118
119 private void endElement(Element e) throws SAXException {
120
121 String qName = e.tagName();
122 String[] parts = ns.processName(qName, nameParts, false);
123 if (parts == null) {
124 handler.endElement("", "", qName);
125 } else {
126 handler.endElement(orEmpty(parts[0]), orEmpty(parts[1]), parts[2]);
127 }
128
129 for (org.jsoup.nodes.Attribute attr : e.attributes()) {
130 String name = attr.getKey();
131 if (name.startsWith("xmlns")) {
132 if (name.length() == 5) {
133 handler.endPrefixMapping("");
134 } else if (name.charAt(5) == ':') {
135 String localName = name.substring(6);
136 handler.endPrefixMapping(localName);
137 }
138 }
139 }
140
141 ns.popContext();
142 }
143
144 private void handleText(String str) throws SAXException {
145 handler.characters(str.toCharArray(), 0, str.length());
146 }
147
148 private void handleComment(String str) throws SAXException {
149 handler.comment(str.toCharArray(), 0, str.length());
150 }
151
152 @Override
153 public void head(Node node, int depth) {
154 try {
155 if (node instanceof Element) {
156 startElement((Element) node);
157 } else if (node instanceof CDataNode) {
158 handler.startCDATA();
159 handleText(((CDataNode) node).text());
160 } else if (node instanceof TextNode) {
161 handleText(((TextNode) node).text());
162
163
164
165
166 } else if (node instanceof Comment) {
167 handleComment(((Comment) node).getData());
168 }
169 } catch (SAXException e) {
170 sneakyThrow(e);
171 }
172 }
173
174 @Override
175 public void tail(Node node, int depth) {
176 try {
177 if (node instanceof Element) {
178 endElement((Element) node);
179 } else if (node instanceof CDataNode) {
180 handler.endCDATA();
181
182
183
184 }
185 } catch (SAXException e) {
186 sneakyThrow(e);
187 }
188 }
189
190 @SuppressWarnings("unchecked")
191 private static <E extends Throwable> void sneakyThrow(Throwable e) throws E {
192 throw (E) e;
193 }
194 }