1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.jsoup.Jsoup;
21 import org.jsoup.nodes.Document;
22 import org.jsoup.parser.Parser;
23
24 import java.io.ByteArrayInputStream;
25 import java.io.IOException;
26 import java.io.InputStream;
27 import java.io.SequenceInputStream;
28 import java.nio.charset.StandardCharsets;
29 import java.util.Arrays;
30
31
32
33
34 public class JsoupUtils {
35
36 public static Document parse(InputStream input, String documentIRI, String encoding) throws IOException {
37
38 if (documentIRI == null) {
39 documentIRI = "";
40 }
41
42
43 if (encoding == null) {
44
45 int c;
46 do {
47 c = input.read();
48 } while (c != -1 && Character.isWhitespace(c));
49
50 if (c != -1) {
51 int capacity = 256;
52 byte[] bytes = new byte[capacity];
53 int length = 0;
54 bytes[length++] = (byte) c;
55
56 if (c == '<') {
57 c = input.read();
58 if (c != -1) {
59 bytes[length++] = (byte) c;
60 if (c == '?') {
61 c = input.read();
62
63 while (c != -1) {
64 if (length == capacity) {
65 capacity *= 2;
66 bytes = Arrays.copyOf(bytes, capacity);
67 }
68 bytes[length++] = (byte) c;
69
70 if (c == '>') {
71 if (length >= 20 && bytes[length - 2] == '?') {
72 String decl = "<" + new String(bytes, 2, length - 4, StandardCharsets.UTF_8)
73 + ">";
74 org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(decl, documentIRI,
75 Parser.xmlParser());
76 for (org.jsoup.nodes.Element el : doc.children()) {
77 if ("xml".equalsIgnoreCase(el.tagName())) {
78 String enc = el.attr("encoding");
79 if (enc != null && !enc.isEmpty()) {
80 encoding = enc;
81 break;
82 }
83 }
84 }
85 }
86 break;
87 }
88
89 c = input.read();
90 }
91 }
92 }
93
94 }
95
96 input = new SequenceInputStream(new ByteArrayInputStream(bytes, 0, length), input);
97 }
98
99 }
100
101
102 return Jsoup.parse(input, encoding, documentIRI, Parser.htmlParser());
103 }
104
105 }