1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.servlet;
19
20 import java.io.ByteArrayOutputStream;
21 import java.io.IOException;
22 import java.io.PrintStream;
23 import java.nio.charset.Charset;
24 import java.security.cert.CertificateException;
25 import java.util.ArrayList;
26 import java.util.Collection;
27 import java.util.List;
28 import java.util.stream.Collectors;
29 import javax.servlet.ServletOutputStream;
30 import javax.servlet.http.HttpServletResponse;
31 import org.apache.any23.Any23;
32 import org.apache.any23.ExtractionReport;
33 import org.apache.any23.configuration.Settings;
34 import org.apache.any23.extractor.ExtractionException;
35 import org.apache.any23.extractor.ExtractionParameters;
36 import org.apache.any23.extractor.Extractor;
37 import org.apache.any23.extractor.IssueReport;
38 import org.apache.any23.filter.IgnoreAccidentalRDFa;
39 import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
40 import org.apache.any23.source.DocumentSource;
41 import org.apache.any23.validator.SerializationException;
42 import org.apache.any23.validator.XMLValidationReportSerializer;
43 import org.apache.any23.writer.CompositeTripleHandler;
44 import org.apache.any23.writer.CountingTripleHandler;
45 import org.apache.any23.writer.FormatWriter;
46 import org.apache.any23.writer.TripleWriterFactory;
47 import org.apache.any23.writer.ReportingTripleHandler;
48 import org.apache.any23.writer.TripleHandler;
49 import org.apache.any23.writer.TripleHandlerException;
50 import org.apache.any23.writer.WriterFactory;
51 import org.apache.any23.writer.WriterFactoryRegistry;
52
53
54
55
56
57 class WebResponder {
58
59 private static final WriterFactoryRegistry writerRegistry = WriterFactoryRegistry.getInstance();
60
61
62
63
64 private final Any23 runner;
65
66
67
68
69 private Servlet any23servlet;
70
71
72
73
74 private HttpServletResponse response;
75
76
77
78
79 private TripleHandler rdfWriter = null;
80
81
82
83
84 private ReportingTripleHandler reporter = null;
85
86
87
88
89 private String outputMediaType = null;
90
91
92
93
94 private ByteArrayOutputStream byteOutStream = new ByteArrayOutputStream();
95
96 public WebResponder(Servlet any23servlet, HttpServletResponse response) {
97 this.any23servlet = any23servlet;
98 this.response = response;
99 this.runner = new Any23();
100 runner.setHTTPUserAgent("Apache Any23 Servlet http://any23.org/");
101 }
102
103 protected Any23 getRunner() {
104 return runner;
105 }
106
107 public void runExtraction(
108 DocumentSource in,
109 ExtractionParameters eps,
110 String format,
111 boolean report, boolean annotate
112 ) throws IOException {
113 if (in == null)
114 return;
115 if (!initRdfWriter(format, report, annotate))
116 return;
117 ExtractionReport er = null;
118 try {
119 er = runner.extract(eps, in, rdfWriter);
120 rdfWriter.close();
121 if (! er.hasMatchingExtractors() ) {
122 sendError(
123 415,
124 "No suitable extractor found for this media type",
125 null,
126 er,
127 report
128 );
129 return;
130 }
131 } catch (IOException ioe) {
132
133 if (ioe.getCause() instanceof CertificateException) {
134 final String errMsg = "Could not fetch input, IO Error.";
135 any23servlet.log(errMsg, ioe.getCause());
136 sendError(502, errMsg, ioe, null, report);
137 return;
138 }
139 any23servlet.log("Could not fetch input", ioe);
140 sendError(502, "Could not fetch input.", ioe, null, report);
141 return;
142 } catch (ExtractionException e) {
143 if (rdfWriter != null) {
144 try {
145 rdfWriter.close();
146 } catch (TripleHandlerException the) {
147 throw new RuntimeException("Error while closing TripleHandler", the);
148 }
149 }
150
151
152
153 String extractionError = "Failed to fully parse input. The extraction result, at the bottom "
154 + "of this response, if any, will contain extractions only up until the extraction error.";
155 any23servlet.log(extractionError, e);
156 sendError(502, extractionError, e, er, report);
157 return;
158 } catch (Exception e) {
159 any23servlet.log("Internal error", e);
160 sendError(500, "Internal error.", e, null, report);
161 return;
162 }
163
164
165 any23servlet.log("Extraction complete, " + reporter.getTotalTriples() + " triples");
166
167
168 response.setContentType(outputMediaType);
169 response.setStatus(200);
170
171 final String charsetEncoding = er.getEncoding();
172 if (Charset.isSupported(charsetEncoding)) {
173 response.setCharacterEncoding(er.getEncoding());
174 } else {
175 response.setCharacterEncoding("UTF-8");
176 }
177
178 final ServletOutputStream sos = response.getOutputStream();
179 final byte[] data = byteOutStream.toByteArray();
180 if(report) {
181 final PrintStream ps = new PrintStream(sos);
182 try {
183 printHeader(ps);
184 printResponse(reporter, er, data, ps);
185 } catch (Exception e) {
186 throw new RuntimeException("An error occurred while serializing the output response.", e);
187 } finally {
188 ps.close();
189 }
190 } else {
191 sos.write(data);
192 }
193 }
194
195 public void sendError(int code, String msg, boolean report) throws IOException {
196 sendError(code, msg, null, null, report);
197 }
198
199 private void printHeader(PrintStream ps) {
200 ps.println("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>");
201 }
202
203 private void printResponse(ReportingTripleHandler rth, ExtractionReport er, byte[] data, PrintStream ps) {
204 ps.println("<response>");
205 printExtractors(rth, ps);
206 printReport(null, null, er, ps);
207 printData(data, ps);
208 ps.println("</response>");
209 }
210
211 private void printExtractors(ReportingTripleHandler rth, PrintStream ps) {
212 ps.println("<extractors>");
213 for (String extractor : rth.getExtractorNames()) {
214 ps.print("<extractor>");
215 ps.print(extractor);
216 ps.println("</extractor>");
217 }
218 ps.println("</extractors>");
219 }
220
221 private void printIssueReport(ExtractionReport er, PrintStream ps) {
222 ps.println("<issueReport>");
223 for(Extractor<?> extractor : er.getMatchingExtractors()) {
224 final String name = extractor.getDescription().getExtractorName();
225 final Collection<IssueReport.Issue> extractorIssues = er.getExtractorIssues(name);
226 if(extractorIssues.isEmpty())
227 continue;
228 ps.println( String.format("<extractorIssues extractor=\"%s\">", name));
229 for(IssueReport.Issue issue : er.getExtractorIssues(name)) {
230 ps.println(
231 String.format(
232 "<issue level=\"%s\" row=\"%d\" col=\"%d\">%s</issue>",
233 issue.getLevel().toString(),
234 issue.getRow(),
235 issue.getCol(),
236 issue.getMessage()
237 )
238 );
239 }
240 ps.println("</extractorIssues>");
241 }
242 ps.println("</issueReport>");
243
244 }
245
246 private void printReport(String msg, Throwable e, ExtractionReport er, PrintStream ps) {
247 XMLValidationReportSerializer#XMLValidationReportSerializer">XMLValidationReportSerializer reportSerializer = new XMLValidationReportSerializer();
248 ps.println("<report>");
249
250
251 if(msg != null) {
252 ps.printf("<message>%s</message>%n", msg);
253 } else {
254 ps.print("<message/>\n");
255 }
256
257
258 if(e != null) {
259 ps.println("<error>");
260 ps.println("<![CDATA[");
261 e.printStackTrace(ps);
262 ps.println("]]>");
263 ps.println("</error>");
264 } else {
265 ps.println("<error/>");
266 }
267
268
269 printIssueReport(er, ps);
270
271
272 try {
273 reportSerializer.serialize(er.getValidationReport(), ps);
274 } catch (SerializationException se) {
275 ps.println("An error occurred while serializing error.");
276 se.printStackTrace(ps);
277 }
278 ps.println("</report>");
279 }
280
281 private void printData(byte[] data, PrintStream ps) {
282 ps.println("<data>");
283 ps.println("<![CDATA[");
284 try {
285 ps.write(data);
286 } catch (IOException ioe) {
287 ps.println("An error occurred while serializing data.");
288 ioe.printStackTrace(ps);
289 }
290 ps.println("]]>");
291 ps.println("</data>");
292 }
293
294 private void sendError(int code, String msg, Exception e, ExtractionReport er, boolean report)
295 throws IOException {
296 response.setStatus(code);
297 response.setContentType("text/plain");
298 final ServletOutputStream sos = response.getOutputStream();
299 final PrintStream ps = new PrintStream(sos);
300 final byte[] data = byteOutStream.toByteArray();
301 if (report) {
302 try {
303 printHeader(ps);
304 printReport(msg, e, er, ps);
305 } finally {
306 ps.close();
307 }
308 } else {
309 ps.println(msg);
310 if (e != null) {
311 ps.println("================================================================");
312 e.printStackTrace(ps);
313 ps.println("================================================================");
314 printData(data, ps);
315 }
316 }
317 }
318
319 private boolean initRdfWriter(String format, boolean report, boolean annotate) throws IOException {
320 final WriterFactory factory = getFormatWriter(format);
321 if (!(factory instanceof TripleWriterFactory)) {
322 sendError(
323 400,
324 "Invalid format '" + format + "', try one of: "
325 + writerRegistry.getWriters().stream()
326 .filter(f -> f instanceof TripleWriterFactory)
327 .map(WriterFactory::getIdentifier).collect(Collectors.toList()),
328 null,
329 null,
330 report
331 );
332 return false;
333 }
334 TripleHandler fw = ((TripleWriterFactory) factory).getTripleWriter(byteOutStream, Settings.of());
335 if (fw instanceof FormatWriter) {
336 ((FormatWriter)fw).setAnnotated(annotate);
337 }
338 outputMediaType = ((TripleWriterFactory) factory).getTripleFormat().getMimeType();
339 List<TripleHandler> tripleHandlers = new ArrayList<>();
340 tripleHandlers.add(new IgnoreAccidentalRDFa(fw));
341 tripleHandlers.add(new CountingTripleHandler());
342 rdfWriter = new CompositeTripleHandler(tripleHandlers);
343 reporter = new ReportingTripleHandler(rdfWriter);
344 rdfWriter = new IgnoreAccidentalRDFa(
345 new IgnoreTitlesOfEmptyDocuments(reporter),
346 true
347 );
348 return true;
349 }
350
351 private WriterFactory getFormatWriter(String format) throws IOException {
352 final String finalFormat;
353
354 if ("rdf".equals(format) || "xml".equals(format) || "rdfxml".equals(format)) {
355 finalFormat = "rdfxml";
356 } else if ("turtle".equals(format) || "ttl".equals(format)) {
357 finalFormat = "turtle";
358 } else if ("n3".equals(format)) {
359 finalFormat = "turtle";
360 } else if ("n-triples".equals(format) || "ntriples".equals(format) || "nt".equals(format)) {
361 finalFormat = "ntriples";
362 } else if("nquads".equals(format) || "n-quads".equals(format) || "nq".equals(format)) {
363 finalFormat = "nquads";
364 } else if("trix".equals(format)) {
365 finalFormat = "trix";
366 } else if("json".equals(format)) {
367 finalFormat = "json";
368 } else if("jsonld".equals(format)){
369 finalFormat = "jsonld";
370 } else {
371 return null;
372 }
373 return writerRegistry.getWriterByIdentifier(finalFormat);
374 }
375
376 }