1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.plugin.officescraper;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.Extractor;
25 import org.apache.any23.extractor.ExtractorDescription;
26 import org.apache.any23.rdf.RDFUtils;
27 import org.apache.any23.vocab.Excel;
28 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
29 import org.apache.poi.ss.usermodel.Cell;
30 import org.apache.poi.ss.usermodel.CellType;
31 import org.apache.poi.ss.usermodel.Row;
32 import org.apache.poi.ss.usermodel.Sheet;
33 import org.apache.poi.ss.usermodel.Workbook;
34 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
35 import org.eclipse.rdf4j.model.IRI;
36 import org.eclipse.rdf4j.model.vocabulary.RDF;
37
38 import java.io.IOException;
39 import java.io.InputStream;
40
41
42
43
44
45
46
47
48
49
50 public class ExcelExtractor implements Extractor.ContentExtractor {
51
52 private static final Excel excel = Excel.getInstance();
53
54 private boolean stopAtFirstError = false;
55
56 public ExcelExtractor() {}
57
58 public boolean isStopAtFirstError() {
59 return stopAtFirstError;
60 }
61
62 @Override
63 public void setStopAtFirstError(boolean f) {
64 stopAtFirstError = f;
65 }
66
67 @Override
68 public ExtractorDescription getDescription() {
69 return ExcelExtractorFactory.getDescriptionInstance();
70 }
71
72 @Override
73 public void run(
74 ExtractionParameters extractionParameters,
75 ExtractionContext context,
76 InputStream in,
77 ExtractionResult er
78 ) throws IOException, ExtractionException {
79 try {
80 final IRI documentIRI = context.getDocumentIRI();
81 final Workbook workbook = createWorkbook(documentIRI, in);
82 processWorkbook(documentIRI, workbook, er);
83 } catch (Exception e) {
84 throw new ExtractionException("An error occurred while extracting MS Excel content.", e);
85 }
86 }
87
88
89 private Workbook createWorkbook(IRI document, InputStream is) throws IOException {
90 final String documentIRI = document.toString();
91 if (documentIRI.endsWith(".xlsx")) {
92 return new XSSFWorkbook(is);
93 } else if (documentIRI.endsWith("xls")) {
94 return new HSSFWorkbook(is);
95 } else {
96 throw new IllegalArgumentException("Unsupported extension for resource [" + documentIRI + "]");
97 }
98 }
99
100 private void processWorkbook(IRI documentIRI, Workbook wb, ExtractionResult er) {
101 for (int sheetIndex = 0; sheetIndex < wb.getNumberOfSheets(); sheetIndex++) {
102 final Sheet sheet = wb.getSheetAt(sheetIndex);
103 final IRI sheetIRI = getSheetIRI(documentIRI, sheet);
104 er.writeTriple(documentIRI, excel.containsSheet, sheetIRI);
105 er.writeTriple(sheetIRI, RDF.TYPE, excel.sheet);
106 writeSheetMetadata(sheetIRI, sheet, er);
107 for (Row row : sheet) {
108 final IRI rowIRI = getRowIRI(sheetIRI, row);
109 er.writeTriple(sheetIRI, excel.containsRow, rowIRI);
110 er.writeTriple(rowIRI, RDF.TYPE, excel.row);
111 writeRowMetadata(rowIRI, row, er);
112 for (Cell cell : row) {
113 writeCell(rowIRI, cell, er);
114 }
115 }
116 }
117 }
118
119 private void writeSheetMetadata(IRI sheetIRI, Sheet sheet, ExtractionResult er) {
120 final String sheetName = sheet.getSheetName();
121 final int firstRowNum = sheet.getFirstRowNum();
122 final int lastRowNum = sheet.getLastRowNum();
123 er.writeTriple(sheetIRI, excel.sheetName, RDFUtils.literal(sheetName));
124 er.writeTriple(sheetIRI, excel.firstRow, RDFUtils.literal(firstRowNum));
125 er.writeTriple(sheetIRI, excel.lastRow, RDFUtils.literal(lastRowNum));
126 }
127
128 private void writeRowMetadata(IRI rowIRI, Row row, ExtractionResult er) {
129 final int firstCellNum = row.getFirstCellNum();
130 final int lastCellNum = row.getLastCellNum();
131 er.writeTriple(rowIRI, excel.firstCell , RDFUtils.literal(firstCellNum));
132 er.writeTriple(rowIRI, excel.lastCell , RDFUtils.literal(lastCellNum ));
133 }
134
135 private void writeCell(IRI rowIRI, Cell cell, ExtractionResult er) {
136 final IRI cellType = cellTypeToType(cell.getCellType());
137 if (cellType == null)
138 return;
139 final IRI cellIRI = getCellIRI(rowIRI, cell);
140 er.writeTriple(rowIRI, excel.containsCell, cellIRI);
141 er.writeTriple(cellIRI, RDF.TYPE, excel.cell);
142 er.writeTriple(
143 cellIRI,
144 excel.cellValue,
145 RDFUtils.literal(cell.getStringCellValue(), cellType)
146 );
147 }
148
149 private IRI getSheetIRI(IRI documentIRI, Sheet sheet) {
150 return RDFUtils.iri(documentIRI.toString() + "/sheet/" + sheet.getSheetName());
151 }
152
153 private IRI getRowIRI(IRI sheetIRI, Row row) {
154 return RDFUtils.iri(sheetIRI.toString() + "/" + row.getRowNum());
155 }
156
157 private IRI getCellIRI(IRI rowIRI, Cell cell) {
158 return RDFUtils.iri(rowIRI +
159 String.format("/%d/", cell.getColumnIndex()));
160 }
161
162 private IRI cellTypeToType(CellType cellType) {
163 final String postfix;
164 if (cellType == null) {
165 postfix = null;
166 } else {
167 switch (cellType) {
168 case STRING:
169 postfix = "string";
170 break;
171 case BOOLEAN:
172 postfix = "boolean";
173 break;
174 case NUMERIC:
175 postfix = "numeric";
176 break;
177 default:
178 postfix = null;
179 }
180 }
181 return postfix == null ? null : RDFUtils.iri(excel.getNamespace().toString() + postfix);
182 }
183
184
185 }