1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.rdf;
19
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.PushbackInputStream;
23 import java.util.Arrays;
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
/**
 * An {@link InputStream} filter that rewrites a lenient, JSON-like byte stream
 * on the fly while it is being read one byte at a time.
 *
 * <p>The following transformations are applied by {@link #read()}:
 * <ul>
 * <li>the markers {@code <![CDATA[} and {@code ]]>} are removed wherever they
 * occur (the bytes between them are processed like any others);</li>
 * <li>comments introduced by {@code #} or by two slashes are dropped up to the
 * end of the line, and block comments (slash-star to star-slash) are dropped
 * except for the line breaks they contain;</li>
 * <li>strings delimited by single quotes are emitted with double-quote
 * delimiters (the string content itself is passed through unchanged);</li>
 * <li>{@code ,} and {@code ;} are both treated as a separator that is only
 * emitted, as {@code ,}, once the next value actually starts; a separator that
 * is followed by {@code }} or {@code ]} is therefore dropped;</li>
 * <li>after {@code }} or {@code ]} a comma is emitted before the next value
 * even if the input did not contain one;</li>
 * <li>several ASCII control/blank bytes and the UTF-8 encodings of a number of
 * Unicode space characters are emitted as a plain ASCII space.</li>
 * </ul>
 *
 * <p>The instance keeps parsing state in plain fields without any
 * synchronization.
 */
class JsonCleaningInputStream extends InputStream {

    // Values of currentState. 0 means "outside of any comment or string".
    // Any other value than the two below is the quote character (' or ")
    // that opened the string currently being read.
    private static final int EOL_COMMENT = 1;
    private static final int MULTILINE_COMMENT = 2;

    // Values of needsComma. 0 means "no comma pending".
    // NEEDS_COMMA: a comma is pending and no line break was seen since.
    // Positive values: a comma is pending and a line break was seen; the value
    // is the number of blank bytes (the line break plus the whitespace that
    // followed it) to replay after the comma. It starts at
    // NEEDS_COMMA_AND_NEWLINE and is capped at MAX_BLANK_PUSHBACK.
    private static final int NEEDS_COMMA = -1;
    private static final int NEEDS_COMMA_AND_NEWLINE = 1;

    // True when the previous byte inside a string was an unescaped backslash.
    private boolean inEscape;
    // True between a CDATA start marker and its end marker.
    private boolean inCDATA;
    private int needsComma;
    private int currentState;

    // Template for the blanks replayed after a deferred comma:
    // one '\n' followed by spaces.
    private static final int MAX_BLANK_PUSHBACK = 128;
    private static final byte[] BLANK_PUSHBACK = new byte[MAX_BLANK_PUSHBACK];

    static {
        Arrays.fill(BLANK_PUSHBACK, (byte) ' ');
        BLANK_PUSHBACK[0] = '\n';
    }

    // The wrapped stream; the pushback buffer is used both for look-ahead and
    // for replaying blanks after a deferred comma.
    private final PushbackInputStream in;

    /**
     * Creates a cleaning stream that reads from the given stream.
     *
     * @param in the stream to read the raw bytes from
     */
    JsonCleaningInputStream(InputStream in) {
        this.in = new PushbackInputStream(in, 256);
    }

    /**
     * Pushes back a single byte unless it is the end-of-stream marker.
     *
     * @param in the stream to push the byte back to
     * @param c the value returned by a previous {@code read()}, possibly -1
     * @throws IOException if the pushback fails
     */
    private static void unread(PushbackInputStream in, int c) throws IOException {
        if (c != -1) {
            in.unread(c);
        }
    }

    /**
     * Checks whether the upcoming bytes of the stream equal the given sequence.
     * On a match the bytes stay consumed; otherwise everything that was read
     * is pushed back so that the stream position is unchanged.
     *
     * @param in the stream to inspect
     * @param next the expected byte values, in order
     * @return {@code true} if the sequence was found (and consumed)
     * @throws IOException if reading or pushing back fails
     */
    private static boolean isNextOrUnread(PushbackInputStream in, int... next) throws IOException {
        // index of the last expected byte that has been matched so far
        int i = -1;
        for (int test : next) {
            int c = in.read();
            if (c != test) {
                // push back in reverse order: the mismatching byte first,
                // then the already matched prefix from its end to its start
                unread(in, c);
                while (i >= 0) {
                    in.unread(next[i--]);
                }
                return false;
            }
            i++;
        }
        return true;
    }

    /**
     * Closes the wrapped stream.
     *
     * <p>Without this override the inherited {@link InputStream#close()} would
     * do nothing and the wrapped stream would stay open when this stream is
     * closed.
     *
     * @throws IOException if closing the wrapped stream fails
     */
    @Override
    public void close() throws IOException {
        in.close();
    }

    /**
     * Returns the next byte of the cleaned stream.
     *
     * @return the next byte, or -1 once the wrapped stream is exhausted
     * @throws IOException if reading from the wrapped stream fails
     */
    @Override
    public int read() throws IOException {
        PushbackInputStream in = this.in;

        // Each iteration consumes one raw byte; "continue" means the byte
        // produced no output.
        for (;;) {
            int c = in.read();

            if (c == -1) {
                return c;
            }

            // CDATA markers are swallowed regardless of the current state.
            if (inCDATA) {
                if (c == ']' && isNextOrUnread(in, ']', '>')) {
                    inCDATA = false;
                    continue;
                }
            } else {
                if (c == '<' && isNextOrUnread(in, '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) {
                    inCDATA = true;
                    continue;
                }
            }

            int ctx = currentState;
            switch (ctx) {
                case 0:
                    break;
                case EOL_COMMENT:
                    if (c == '\r' || c == '\n') {
                        // end of the comment
                        currentState = 0;
                        if (needsComma != 0) {
                            // hold the line break back until the comma is out
                            needsComma = NEEDS_COMMA_AND_NEWLINE;
                            continue;
                        }
                        return c;
                    }
                    continue;
                case MULTILINE_COMMENT:
                    if (c == '\r' || c == '\n') {
                        // line breaks inside the comment are kept
                        if (needsComma != 0) {
                            needsComma = NEEDS_COMMA_AND_NEWLINE;
                            continue;
                        }
                        return c;
                    } else if (c == '*' && isNextOrUnread(in, '/')) {
                        // end of the comment
                        currentState = 0;
                    }
                    continue;
                default:
                    // inside a string; ctx is the opening quote character
                    if (inEscape) {
                        // this byte is escaped: pass it through as is
                        inEscape = false;
                    } else if (c == '\\') {
                        inEscape = true;
                    } else if (c == ctx) {
                        // closing quote, always emitted as a double quote
                        currentState = 0;
                        return '"';
                    }
                    return c;
            }

            // Outside of comments and strings. Leaving the labeled block via
            // "break $whitespace" means that c (plus any continuation bytes
            // consumed below) was whitespace; falling out of the switch means
            // that c starts, or is part of, a token.
            $whitespace: {
                switch (c) {
                    case '#':
                        currentState = EOL_COMMENT;
                        continue;
                    case '/':
                        int next = in.read();
                        if (next == '/') {
                            currentState = EOL_COMMENT;
                            continue;
                        } else if (next == '*') {
                            currentState = MULTILINE_COMMENT;
                            continue;
                        }
                        // a lone slash is an ordinary token byte
                        unread(in, next);
                        break;
                    case ',':
                    case ';':
                        // not emitted now; remembered until the next token
                        needsComma = NEEDS_COMMA;
                        continue;
                    case '}':
                    case ']':
                        // A separator pending before the closer is discarded
                        // (no comma is emitted here), and a comma becomes
                        // pending for whatever token follows the closer.
                        needsComma = NEEDS_COMMA;
                        return c;
                    case '\r':
                    case '\n':
                        if (needsComma != 0) {
                            // (re)start counting the blanks to replay
                            needsComma = NEEDS_COMMA_AND_NEWLINE;
                            continue;
                        }
                        return c;

                    case 0x09:
                    case 0x0b:
                    case 0x0c:
                    case 0x1c:
                    case 0x1d:
                    case 0x1e:
                    case 0x1f:
                    case 0x20:
                        break $whitespace;
                    case 0xc2:
                        // C2 A0 = U+00A0
                        if (isNextOrUnread(in, 0xa0)) {
                            break $whitespace;
                        }
                        break;
                    case 0xe1:
                        // E1 9A 80 = U+1680, E1 A0 8E = U+180E
                        if (isNextOrUnread(in, 0x9a, 0x80) || isNextOrUnread(in, 0xa0, 0x8e)) {
                            break $whitespace;
                        }
                        break;
                    case 0xe2:
                        int c1 = in.read();
                        if (c1 == 0x80) {
                            int c2 = in.read();
                            // E2 80 80..8A = U+2000..U+200A, E2 80 AF = U+202F,
                            // E2 80 A8 = U+2028, E2 80 A9 = U+2029
                            if (c2 >= 0x80 && c2 <= 0x8a || c2 == 0xaf

                                    || c2 == 0xa8 || c2 == 0xa9) {
                                break $whitespace;
                            }
                            // not whitespace: restore the two look-ahead bytes
                            unread(in, c2);
                            in.unread(0x80);
                        } else if (c1 == 0x81) {
                            int c2 = in.read();
                            // E2 81 9F = U+205F
                            if (c2 == 0x9f) {
                                break $whitespace;
                            }
                            unread(in, c2);
                            in.unread(0x81);
                        } else {
                            unread(in, c1);
                        }
                        break;
                    case 0xe3:
                        // E3 80 80 = U+3000
                        if (isNextOrUnread(in, 0x80, 0x80)) {
                            break $whitespace;
                        }
                        break;
                    default:
                        break;
                }

                // c is a token byte.
                int nc = needsComma;
                if (nc != 0) {
                    // Emit the deferred comma now and arrange for the blanks
                    // and then c itself to be read again afterwards.
                    in.unread(c);
                    if (nc == NEEDS_COMMA) {
                        in.unread(' ');
                    } else {
                        // one line break followed by nc - 1 spaces
                        in.unread(BLANK_PUSHBACK, 0, nc);
                    }
                    needsComma = 0;
                    return ',';
                } else if (c == '"' || c == '\'') {
                    // opening quote: remember which character closes the
                    // string and emit a double quote
                    currentState = c;
                    return '"';
                }
                return c;
            }

            // c was whitespace.
            int nc = needsComma;
            if (nc != 0) {
                // Swallow it while a comma is pending; after a line break it
                // is counted so that it can be replayed behind the comma, up
                // to the size of BLANK_PUSHBACK.
                if (nc != NEEDS_COMMA && nc != MAX_BLANK_PUSHBACK) {
                    needsComma = nc + 1;
                }
                continue;
            }

            return ' ';

        }

    }
}