1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.csv;
19
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
30
31
32
33
34
35
36
37
38 public class CSVReaderBuilder {
39
40 private static final String DEFAULT_FIELD_DELIMITER = ",";
41
42 private static final String DEFAULT_COMMENT_DELIMITER = "#";
43
44 private static final char[] popularDelimiters = { '\t', '|', ',', ';' };
45
46 private static DefaultConfiguration defaultConfiguration = DefaultConfiguration.singleton();
47
48 private static final CSVFormat[] strategies;
49
50 static {
51 strategies = new CSVFormat[popularDelimiters.length + 1];
52 strategies[0] = CSVFormat.DEFAULT;
53 int index = 1;
54 for (char dlmt : popularDelimiters) {
55 strategies[index++] = CSVFormat.DEFAULT.withDelimiter(dlmt);
56 }
57 }
58
59
60
61
62
63
64
65
66
67
68
69
70
71 public static CSVParser build(InputStream is) throws IOException {
72 CSVFormat bestStrategy = getBestStrategy(is);
73 if (bestStrategy == null)
74 bestStrategy = getCSVStrategyFromConfiguration();
75 return new CSVParser(new InputStreamReader(is, StandardCharsets.UTF_8), bestStrategy);
76 }
77
78
79
80
81
82
83
84
85
86
87
88
89
90 public static boolean isCSV(InputStream is) throws IOException {
91 return getBestStrategy(is) != null;
92 }
93
94 private static CSVFormat getBestStrategy(InputStream is) throws IOException {
95 for (CSVFormat strategy : strategies) {
96 if (testStrategy(is, strategy)) {
97 return strategy;
98 }
99 }
100 return null;
101 }
102
103 private static CSVFormat getCSVStrategyFromConfiguration() {
104 char fieldDelimiter = getCharValueFromConfiguration("any23.extraction.csv.field", DEFAULT_FIELD_DELIMITER);
105 char commentDelimiter = getCharValueFromConfiguration("any23.extraction.csv.comment",
106 DEFAULT_COMMENT_DELIMITER);
107 return CSVFormat.DEFAULT.withDelimiter(fieldDelimiter).withCommentMarker(commentDelimiter);
108 }
109
110 private static char getCharValueFromConfiguration(String property, String defaultValue) {
111 String delimiter = defaultConfiguration.getProperty(property, defaultValue);
112 if (delimiter.length() != 1) {
113 throw new RuntimeException(property + " value must be a single character");
114 }
115 return delimiter.charAt(0);
116 }
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133 private static boolean testStrategy(InputStream is, CSVFormat strategy) throws IOException {
134 final int MIN_COLUMNS = 2;
135
136 is.mark(Integer.MAX_VALUE);
137 try {
138 @SuppressWarnings("resource")
139 final Iterator<CSVRecord> rows = new CSVParser(new InputStreamReader(is, StandardCharsets.UTF_8), strategy)
140 .iterator();
141 int linesToCheck = 5;
142 int headerColumnCount = -1;
143 while (linesToCheck > 0 && rows.hasNext()) {
144 int rowLength = rows.next().size();
145 if (rowLength < MIN_COLUMNS) {
146 return false;
147 }
148 if (headerColumnCount == -1) {
149 headerColumnCount = rowLength;
150 } else {
151 if (rowLength < headerColumnCount) {
152 return false;
153 } else if (rowLength - 1 > headerColumnCount) {
154 return false;
155 }
156 }
157 linesToCheck--;
158 }
159 return true;
160 } finally {
161 is.reset();
162 }
163 }
164
165 }