1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.jsoup.nodes.Attribute;
21 import org.jsoup.select.NodeTraversor;
22 import org.jsoup.select.NodeVisitor;
23 import org.w3c.dom.Comment;
24 import org.w3c.dom.Document;
25 import org.w3c.dom.Text;
26
27 import java.io.IOException;
28 import java.io.InputStream;
29
30
31
32
33
34
35 abstract class TagSoupParsingConfiguration {
36
37 String name() {
38 return getClass().getSimpleName();
39 }
40
41 abstract Document parse(InputStream input, String documentIRI, String encoding) throws IOException;
42
43 static TagSoupParsingConfiguration getDefault() {
44 return JsoupConfig.instance;
45 }
46
47 private static class JsoupConfig extends TagSoupParsingConfiguration {
48
49 private static final JsoupConfig instance = new JsoupConfig();
50
51 @Override
52 Document parse(InputStream input, String documentIRI, String encoding) throws IOException {
53
54 org.jsoup.nodes.Document document = JsoupUtils.parse(input, documentIRI, encoding);
55
56 return convert(document);
57 }
58
59 private static Document convert(org.jsoup.nodes.Document document) {
60 Document w3cDoc = new org.apache.html.dom.HTMLDocumentImpl();
61
62 org.jsoup.nodes.Element rootEl = document.children().first();
63 if (rootEl != null) {
64 NodeTraversor.traverse(new DocumentConverter(w3cDoc), rootEl);
65 }
66
67 return w3cDoc;
68 }
69
70 private static class DocumentConverter implements NodeVisitor {
71
72 private final Document doc;
73 private org.w3c.dom.Element dest;
74
75 DocumentConverter(Document doc) {
76 this.doc = doc;
77 }
78
79 @Override
80 public void head(org.jsoup.nodes.Node source, int depth) {
81 if (source instanceof org.jsoup.nodes.Element) {
82 org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
83
84 org.w3c.dom.Element el = doc.createElement(sourceEl.tagName());
85 copyAttributes(sourceEl, el);
86 if (dest == null) {
87 doc.appendChild(el);
88 } else {
89 dest.appendChild(el);
90 }
91 dest = el;
92 } else if (source instanceof org.jsoup.nodes.TextNode) {
93 org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
94 Text text = doc.createTextNode(sourceText.getWholeText());
95 dest.appendChild(text);
96 } else if (source instanceof org.jsoup.nodes.Comment) {
97 org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
98 Comment comment = doc.createComment(sourceComment.getData());
99 dest.appendChild(comment);
100 } else if (source instanceof org.jsoup.nodes.DataNode) {
101 org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
102 Text node = doc.createTextNode(stripCDATA(sourceData.getWholeData()));
103 dest.appendChild(node);
104 }
105 }
106
107 @Override
108 public void tail(org.jsoup.nodes.Node source, int depth) {
109 if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof org.w3c.dom.Element) {
110 dest = (org.w3c.dom.Element) dest.getParentNode();
111 }
112 }
113
114 private void copyAttributes(org.jsoup.nodes.Node source, org.w3c.dom.Element el) {
115 for (Attribute attribute : source.attributes()) {
116
117 String key = attribute.getKey().replaceAll("[^-a-zA-Z0-9_:.]", "");
118 if (key.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*"))
119 el.setAttribute(key, attribute.getValue());
120 }
121 }
122 }
123
124 private static String stripCDATA(String string) {
125 return reduceToContent(string, "<![CDATA[", "]]>");
126 }
127
128 private static String reduceToContent(String string, String startMarker, String endMarker) {
129 int i = 0;
130 int startContent = -1;
131 int l1 = startMarker.length();
132
133 int l2;
134 char c;
135 for (l2 = endMarker.length(); i < string.length() - l1 - l2; ++i) {
136 c = string.charAt(i);
137 if (!Character.isWhitespace(c)) {
138 if (c == startMarker.charAt(0) && startMarker.equals(string.substring(i, l1 + i))) {
139 startContent = i + l1;
140 break;
141 }
142
143 return string;
144 }
145 }
146
147 if (startContent != -1) {
148 for (i = string.length() - 1; i > startContent + l2; --i) {
149 c = string.charAt(i);
150 if (!Character.isWhitespace(c)) {
151 if (c == endMarker.charAt(l2 - 1) && endMarker.equals(string.substring(i - l2 + 1, i + 1))) {
152
153 return string.substring(startContent, i - 2);
154 }
155
156 return string;
157 }
158 }
159
160 }
161 return string;
162 }
163
164 }
165
166 }