1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import java.io.IOException;
21 import java.io.InputStream;
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/**
 * Filtering {@link InputStream} that rewrites self-closing span tags
 * (<code>&lt;span ... /&gt;</code>) read from the wrapped stream into an explicit
 * open/close pair (<code>&lt;span ... &gt;&lt;/span&gt;</code>). All other bytes are
 * passed through unchanged.
 *
 * <p>Detection works byte by byte and compares against ASCII characters; the
 * opening sequence is matched case-insensitively, while the injected closing
 * tag is always the lowercase <code>&lt;/span&gt;</code>.</p>
 *
 * <p>Known limitations: only the double quote character is recognised as an
 * attribute delimiter, and the quote state is toggled by every double quote read
 * from the stream, not only by those located inside a span tag.</p>
 */
public class SpanCloserInputStream extends InputStream {

    private static final String TRAILING_SEQUENCE_OPEN = "<span";
    private static final char TRAILING_SEQUENCE_CLOSE = '>';
    private static final String CLOSE_SEQUENCE = "</span>";

    /**
     * Marker meaning "no byte has been pushed back". It must differ from
     * <code>-1</code> because an end-of-stream result can itself be pushed back.
     */
    private static final int NO_PENDING = -2;

    private final InputStream wrapped;

    /** Number of characters of {@link #TRAILING_SEQUENCE_OPEN} matched so far. */
    private int trailingSequenceOpenMatch = 0;
    /** Index of the next character of {@link #CLOSE_SEQUENCE} to emit. */
    private int closeSequenceIndex = 0;
    /** <code>true</code> once the opening sequence has been fully read. */
    private boolean trailingSequenceOpenDetected = false;
    /** <code>true</code> once the '&gt;' ending the detected span tag has been read. */
    private boolean trailingSequenceCloseDetected = false;
    /** <code>true</code> when the detected span tag ended with "/&gt;". */
    private boolean inlineDetected = false;
    /** Toggled on every double quote read from the stream. */
    private boolean betweenQuotes = false;
    /** Byte read ahead while inspecting a '/', or {@link #NO_PENDING}. */
    private int pending = NO_PENDING;

    /**
     * Creates a filter on top of the given stream.
     *
     * @param is the stream to read from.
     */
    public SpanCloserInputStream(InputStream is) {
        wrapped = is;
    }

    /**
     * Returns the next byte of the filtered content.
     *
     * @return the next byte, or <code>-1</code> when the wrapped stream is exhausted.
     * @throws IOException if the wrapped stream fails.
     */
    @Override
    public int read() throws IOException {
        if (trailingSequenceOpenDetected && inlineDetected && trailingSequenceCloseDetected) {
            // A self-closing span has just been emitted as "<span ...>":
            // feed the explicit closing tag one character per call.
            final int ret = CLOSE_SEQUENCE.charAt(closeSequenceIndex);
            closeSequenceIndex++;
            if (closeSequenceIndex >= CLOSE_SEQUENCE.length()) {
                resetDetector();
            }
            return ret;
        } else if (trailingSequenceOpenDetected && trailingSequenceCloseDetected) {
            // Regular (non self-closing) span open tag: nothing to inject.
            resetDetector();
        }

        int c = nextRawByte();
        if (c == '"') {
            betweenQuotes = !betweenQuotes;
        } else if (c == '/' && !betweenQuotes && trailingSequenceOpenDetected && !trailingSequenceCloseDetected) {
            // The slash marks a self-closing tag only when it is immediately
            // followed by '>'. Otherwise (e.g. an unquoted attribute value such
            // as data=b/c, or end of stream) the slash is ordinary content and
            // the byte read ahead is kept for the next call.
            final int next = wrapped.read();
            if (next == TRAILING_SEQUENCE_CLOSE) {
                inlineDetected = true;
                c = next;
            } else {
                pending = next;
            }
        }

        if (!trailingSequenceOpenDetected && checkOpenTrailingSequence(c)) {
            trailingSequenceOpenDetected = true;
            trailingSequenceCloseDetected = false;
        } else if (c == TRAILING_SEQUENCE_CLOSE && trailingSequenceOpenDetected) {
            trailingSequenceCloseDetected = true;
        }
        return c;
    }

    /**
     * Closes the wrapped stream. Without this override the no-op
     * {@link InputStream#close()} would leave the wrapped stream open.
     *
     * @throws IOException if the wrapped stream fails to close.
     */
    @Override
    public void close() throws IOException {
        wrapped.close();
    }

    /**
     * Returns the pushed-back byte if there is one, otherwise reads from the
     * wrapped stream.
     */
    private int nextRawByte() throws IOException {
        if (pending != NO_PENDING) {
            final int c = pending;
            pending = NO_PENDING;
            return c;
        }
        return wrapped.read();
    }

    /**
     * Advances the matcher of the opening sequence with the given byte.
     *
     * @param c the byte just read (may be <code>-1</code>).
     * @return <code>true</code> when <code>c</code> completes the opening sequence.
     */
    private boolean checkOpenTrailingSequence(int c) {
        if (TRAILING_SEQUENCE_OPEN.charAt(trailingSequenceOpenMatch) == Character.toLowerCase(c)) {
            trailingSequenceOpenMatch++;
            if (trailingSequenceOpenMatch == TRAILING_SEQUENCE_OPEN.length()) {
                trailingSequenceOpenMatch = 0;
                return true;
            }
        } else {
            // Restart the match. The mismatching byte may itself begin a new
            // occurrence (e.g. the second '<' of "<<span"), so it is re-tested
            // against the first character instead of being discarded.
            trailingSequenceOpenMatch = (c == TRAILING_SEQUENCE_OPEN.charAt(0)) ? 1 : 0;
        }
        return false;
    }

    /**
     * Restores the detection state to its initial values. The pushed-back byte,
     * if any, is stream content and is deliberately left untouched.
     */
    private void resetDetector() {
        trailingSequenceOpenMatch = 0;
        closeSequenceIndex = 0;
        trailingSequenceOpenDetected = false;
        trailingSequenceCloseDetected = false;
        inlineDetected = false;
        betweenQuotes = false;
    }

}