1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.csv;
19
20 import static java.lang.Character.toUpperCase;
21
22 import org.apache.any23.extractor.ExtractionContext;
23 import org.apache.any23.extractor.ExtractionException;
24 import org.apache.any23.extractor.ExtractionParameters;
25 import org.apache.any23.extractor.ExtractionResult;
26 import org.apache.any23.extractor.Extractor;
27 import org.apache.any23.extractor.ExtractorDescription;
28 import org.apache.any23.rdf.RDFUtils;
29 import org.apache.any23.vocab.CSV;
30 import org.apache.commons.csv.CSVParser;
31 import org.apache.commons.csv.CSVRecord;
32 import org.eclipse.rdf4j.model.IRI;
33 import org.eclipse.rdf4j.model.Value;
34 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
35 import org.eclipse.rdf4j.model.vocabulary.RDF;
36 import org.eclipse.rdf4j.model.vocabulary.RDFS;
37 import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
38
39 import java.io.IOException;
40 import java.io.InputStream;
41 import java.util.StringTokenizer;
42 import java.util.Iterator;
43 import java.util.Locale;
44
45
46
47
48
49
50
51
52
53 public class CSVExtractor implements Extractor.ContentExtractor {
54
55 private CSVParser csvParser;
56
57 private IRI[] headerIRIs;
58
59 private CSV csv = CSV.getInstance();
60
61
62
63
64 @Override
65 public void setStopAtFirstError(boolean f) {
66
67 }
68
69
70
71
72 @Override
73 public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, InputStream in,
74 ExtractionResult out) throws IOException, ExtractionException {
75 final IRI documentIRI = extractionContext.getDocumentIRI();
76
77
78 csvParser = CSVReaderBuilder.build(in);
79 Iterator<CSVRecord> rows = csvParser.iterator();
80
81
82 CSVRecord header = rows.hasNext() ? rows.next() : null;
83 headerIRIs = processHeader(header, documentIRI);
84
85
86 writeHeaderPropertiesMetadata(header, out);
87
88 int index = 0;
89 while (rows.hasNext()) {
90 CSVRecord nextLine = rows.next();
91 IRI rowSubject = RDFUtils.iri(documentIRI.toString(), "row/" + index);
92
93 out.writeTriple(rowSubject, RDF.TYPE, csv.rowType);
94
95 produceRowStatements(rowSubject, nextLine, out);
96
97 out.writeTriple(documentIRI, csv.row, rowSubject);
98
99 out.writeTriple(rowSubject, csv.rowPosition,
100 SimpleValueFactory.getInstance().createLiteral(String.valueOf(index)));
101 index++;
102 }
103
104 addTableMetadataStatements(documentIRI, out, index, headerIRIs.length);
105 }
106
107
108
109
110
111
112
113
114 private boolean isInteger(String number) {
115 try {
116 Integer.valueOf(number);
117 return true;
118 } catch (NumberFormatException e) {
119 return false;
120 }
121 }
122
123
124
125
126
127
128
129
130 private boolean isFloat(String number) {
131 try {
132 Float.valueOf(number);
133 return true;
134 } catch (NumberFormatException e) {
135 return false;
136 }
137 }
138
139
140
141
142
143
144
145 private void writeHeaderPropertiesMetadata(CSVRecord header, ExtractionResult out) {
146 int index = 0;
147 for (IRI singleHeader : headerIRIs) {
148 if (index > headerIRIs.length) {
149 break;
150 }
151 String headerString = header.get(index);
152 if (!RDFUtils.isAbsoluteIRI(headerString)) {
153 out.writeTriple(singleHeader, RDFS.LABEL, SimpleValueFactory.getInstance().createLiteral(headerString));
154 }
155 out.writeTriple(singleHeader, csv.columnPosition,
156 SimpleValueFactory.getInstance().createLiteral(String.valueOf(index), XMLSchema.INTEGER));
157 index++;
158 }
159 }
160
161
162
163
164
165
166
167
168
169
170 private IRI[] processHeader(CSVRecord header, IRI documentIRI) {
171 if (header == null)
172 return new IRI[0];
173
174 IRI[] result = new IRI[header.size()];
175 int index = 0;
176 for (String h : header) {
177 String candidate = h.trim();
178 if (RDFUtils.isAbsoluteIRI(candidate)) {
179 result[index] = SimpleValueFactory.getInstance().createIRI(candidate);
180 } else {
181 result[index] = normalize(candidate, documentIRI);
182 }
183 index++;
184 }
185 return result;
186 }
187
188 private IRI normalize(String toBeNormalized, IRI documentIRI) {
189 String newToBeNormalized = toBeNormalized.trim().toLowerCase(Locale.ROOT).replace("?", "").replace("&", "");
190
191 StringBuilder result = new StringBuilder(documentIRI.toString());
192
193 StringTokenizer tokenizer = new StringTokenizer(newToBeNormalized, " ");
194 while (tokenizer.hasMoreTokens()) {
195 String current = tokenizer.nextToken();
196
197 result.append(toUpperCase(current.charAt(0))).append(current.substring(1));
198 }
199
200 return SimpleValueFactory.getInstance().createIRI(result.toString());
201 }
202
203
204
205
206
207
208
209
210
211 private void produceRowStatements(IRI rowSubject, CSVRecord values, ExtractionResult out) {
212 int index = 0;
213 for (String cell : values) {
214 if (index >= headerIRIs.length) {
215
216 break;
217 }
218 if ("".equals(cell)) {
219 index++;
220 continue;
221 }
222 IRI predicate = headerIRIs[index];
223 Value object = getObjectFromCell(cell);
224 out.writeTriple(rowSubject, predicate, object);
225 index++;
226 }
227 }
228
229 private Value getObjectFromCell(String cell) {
230 Value object;
231 String newCell = cell.trim();
232 if (RDFUtils.isAbsoluteIRI(newCell)) {
233 object = SimpleValueFactory.getInstance().createIRI(newCell);
234 } else {
235 IRI datatype = XMLSchema.STRING;
236 if (isInteger(newCell)) {
237 datatype = XMLSchema.INTEGER;
238 } else if (isFloat(newCell)) {
239 datatype = XMLSchema.FLOAT;
240 }
241 object = SimpleValueFactory.getInstance().createLiteral(newCell, datatype);
242 }
243 return object;
244 }
245
246
247
248
249
250
251
252
253
254
255 private void addTableMetadataStatements(IRI documentIRI, ExtractionResult out, int numberOfRows,
256 int numberOfColumns) {
257 out.writeTriple(documentIRI, csv.numberOfRows,
258 SimpleValueFactory.getInstance().createLiteral(String.valueOf(numberOfRows), XMLSchema.INTEGER));
259 out.writeTriple(documentIRI, csv.numberOfColumns,
260 SimpleValueFactory.getInstance().createLiteral(String.valueOf(numberOfColumns), XMLSchema.INTEGER));
261 }
262
263
264
265
266 @Override
267 public ExtractorDescription getDescription() {
268 return CSVExtractorFactory.getDescriptionInstance();
269 }
270 }