1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.servlet;
19
20 import org.apache.any23.configuration.DefaultConfiguration;
21 import org.apache.any23.extractor.ExtractionParameters;
22 import org.apache.any23.extractor.ExtractorRegistry;
23 import org.apache.any23.extractor.ExtractorRegistryImpl;
24 import org.apache.any23.http.HTTPClient;
25 import org.apache.any23.plugin.Any23PluginManager;
26 import org.apache.any23.servlet.conneg.Any23Negotiator;
27 import org.apache.any23.servlet.conneg.MediaRangeSpec;
28 import org.apache.any23.source.ByteArrayDocumentSource;
29 import org.apache.any23.source.DocumentSource;
30 import org.apache.any23.source.HTTPDocumentSource;
31 import org.apache.any23.source.StringDocumentSource;
32 import org.eclipse.rdf4j.rio.RDFFormat;
33 import org.slf4j.Logger;
34 import org.slf4j.LoggerFactory;
35
36 import javax.servlet.ServletException;
37 import javax.servlet.http.HttpServlet;
38 import javax.servlet.http.HttpServletRequest;
39 import javax.servlet.http.HttpServletResponse;
40
41 import java.io.File;
42 import java.io.IOException;
43 import java.net.URI;
44 import java.net.URISyntaxException;
45 import java.util.regex.Pattern;
46
47 import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;
48
49
50
51
52
53
54
55
56 public class Servlet extends HttpServlet {
57
58 private static final Logger LOG = LoggerFactory.getLogger(Servlet.class);
59
60 public static final String DEFAULT_BASE_IRI = "http://any23.org/tmp/";
61
62 private static final long serialVersionUID = 8207685628715421336L;
63
64 private static final Pattern schemeAndSingleSlashRegex =
65 Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:/[^/]");
66
67
68 private static final Pattern schemeRegex =
69 Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:");
70
71 @Override
72 protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws IOException, ServletException {
73 final WebResponder.html#WebResponder">WebResponder responder = new WebResponder(this, resp);
74 final String format = getFormatFromRequestOrNegotiation(req);
75 final boolean report = isReport(req);
76 final boolean annotate = isAnnotated(req);
77 final boolean openie = isOpenIE(req);
78 if (format == null) {
79 try {
80 responder.sendError(406, "Client accept header does not include a supported output format", report);
81 return;
82 } catch (IOException e) {
83 LOG.error("Unable to send error for null request format.", e);
84 }
85 }
86 final String uri = getInputIRIFromRequest(req);
87 if (uri == null) {
88 try {
89 responder.sendError(404, "Missing IRI in GET request. Try /format/http://example.com/myfile", report);
90 return;
91 } catch (Exception e) {
92 LOG.error("Unable to send error for null request IRI.", e);
93 }
94 }
95 if (openie) {
96 Any23PluginManager pManager = Any23PluginManager.getInstance();
97
98
99
100
101 File webappClasspath = new File(getClass().getClassLoader().getResource("").getPath());
102 File openIEJarPath = new File(webappClasspath.getParentFile().getPath() + "/lib/apache-any23-openie");
103 boolean loadedJars = pManager.loadJARDir(openIEJarPath);
104 if (loadedJars) {
105 ExtractorRegistry r = ExtractorRegistryImpl.getInstance();
106 try {
107 pManager.getExtractors().forEachRemaining(r::register);
108 } catch (IOException e) {
109 LOG.error("Error during dynamic classloading of JARs from OpenIE runtime directory {}", openIEJarPath.toString(), e);
110 }
111 LOG.info("Successful dynamic classloading of JARs from OpenIE runtime directory {}", openIEJarPath.toString());
112 }
113 }
114 final ExtractionParameters eps = getExtractionParameters(req);
115 try {
116 responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report, annotate);
117 } catch (IOException e) {
118 LOG.error("Unable to run extraction on HTTPDocumentSource.", e);
119 }
120 }
121
122 @Override
123 protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws IOException {
124 final WebResponder.html#WebResponder">WebResponder responder = new WebResponder(this, resp);
125 final boolean report = isReport(req);
126 final boolean annotate = isAnnotated(req);
127 final boolean openie = isOpenIE(req);
128 if (req.getContentType() == null) {
129 responder.sendError(400, "Invalid POST request, no Content-Type for the message body specified", report);
130 return;
131 }
132 final String uri = getInputIRIFromRequest(req);
133 final String format = getFormatFromRequestOrNegotiation(req);
134 if (format == null) {
135 responder.sendError(406, "Client accept header does not include a supported output format", report);
136 return;
137 }
138 if (openie) {
139 Any23PluginManager pManager = Any23PluginManager.getInstance();
140 pManager.loadJARDir(new File(getClass().getResource("apache-any23-openie").getPath()));
141 }
142 final ExtractionParameters eps = getExtractionParameters(req);
143 if ("application/x-www-form-urlencoded".equals(getContentTypeHeader(req))) {
144 if (uri != null) {
145 log("Attempting conversion to '" + format + "' from IRI <" + uri + ">");
146 responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report, annotate);
147 return;
148 }
149 if (req.getParameter("body") == null) {
150 responder.sendError(400, "Invalid POST request, parameter 'uri' or 'body' required", report);
151 return;
152 }
153 String type = null;
154 if (req.getParameter("type") != null && !"".equals(req.getParameter("type"))) {
155 type = req.getParameter("type");
156 }
157 log("Attempting conversion to '" + format + "' from body parameter");
158 responder.runExtraction(
159 new StringDocumentSource(req.getParameter("body"), Servlet.DEFAULT_BASE_IRI, type),
160 eps,
161 format,
162 report, annotate
163 );
164 return;
165 }
166 log("Attempting conversion to '" + format + "' from POST body");
167 responder.runExtraction(
168 new ByteArrayDocumentSource(
169 req.getInputStream(),
170 Servlet.DEFAULT_BASE_IRI,
171 getContentTypeHeader(req)
172 ),
173 eps,
174 format,
175 report, annotate
176 );
177 }
178
179 private String getFormatFromRequestOrNegotiation(HttpServletRequest request) {
180 String fromRequest = getFormatFromRequest(request);
181 if (fromRequest != null && !"".equals(fromRequest) && !"best".equals(fromRequest)) {
182 return fromRequest;
183 }
184 MediaRangeSpec result = Any23Negotiator.getNegotiator().getBestMatch(request.getHeader("Accept"));
185 if (result == null) {
186 return null;
187 } else if (RDFFormat.N3.hasMIMEType(result.getMediaType())) {
188 return "n3";
189 } else if (RDFFormat.NQUADS.hasMIMEType(result.getMediaType())) {
190 return "nq";
191 } else if (RDFFormat.RDFXML.hasMIMEType(result.getMediaType())) {
192 return "rdf";
193 } else if (RDFFormat.NTRIPLES.hasMIMEType(result.getMediaType())) {
194 return "nt";
195 } else if (RDFFormat.JSONLD.hasMIMEType(result.getMediaType())) {
196 return "ld+json";
197 } else {
198 return "turtle";
199 }
200 }
201
202 private String getFormatFromRequest(HttpServletRequest request) {
203 if (request.getPathInfo() == null)
204 return "best";
205 String[] args = request.getPathInfo().split("/", 3);
206 if (args.length < 2 || "".equals(args[1])) {
207 if (request.getParameter("format") == null) {
208 return "best";
209 } else {
210 return request.getParameter("format");
211 }
212 }
213 return args[1];
214 }
215
216 private String getInputIRIFromRequest(HttpServletRequest request) {
217 if (request.getPathInfo() == null)
218 return null;
219 String[] args = request.getPathInfo().split("/", 3);
220 if (args.length < 3) {
221 if (request.getParameter("uri") != null) {
222 return request.getParameter("uri").trim();
223 }
224 if (request.getParameter("url") != null) {
225 return request.getParameter("url").trim();
226 }
227 return null;
228 }
229 String uri = args[2];
230 if (request.getQueryString() != null) {
231 uri = uri + "?" + request.getQueryString();
232 }
233 if (!hasScheme(uri)) {
234 uri = "http://" + uri;
235 } else if (hasOnlySingleSlashAfterScheme(uri)) {
236
237
238
239
240 uri = uri.replaceFirst(":/", "://");
241 }
242 return uri.trim();
243 }
244
245
246 private boolean hasScheme(String uri) {
247 return schemeRegex.matcher(uri).find();
248 }
249
250 private boolean hasOnlySingleSlashAfterScheme(String uri) {
251 return schemeAndSingleSlashRegex.matcher(uri).find();
252 }
253
254 private String getContentTypeHeader(HttpServletRequest req) {
255 String cType = "Content-Type";
256 if (req.getHeader(cType) == null)
257 return null;
258 if ("".equals(req.getHeader(cType)))
259 return null;
260 String contentType = req.getHeader(cType);
261
262 int index = contentType.indexOf(';');
263 if (index == -1)
264 return contentType;
265 return contentType.substring(0, index);
266 }
267
268 private DocumentSource createHTTPDocumentSource(WebResponder responder, String uri, boolean report)
269 throws IOException {
270 try {
271 if (!isValidIRI(uri)) {
272 throw new URISyntaxException(uri, "@@@");
273 }
274 return createHTTPDocumentSource(responder.getRunner().getHTTPClient(), uri);
275 } catch (URISyntaxException ex) {
276 LOG.error("Invalid IRI detected", ex);
277 responder.sendError(400, "Invalid input IRI " + uri, report);
278 return null;
279 }
280 }
281
282 protected DocumentSource createHTTPDocumentSource(HTTPClient httpClient, String uri)
283 throws IOException, URISyntaxException {
284 return new HTTPDocumentSource(httpClient, uri);
285 }
286
287 private boolean isValidIRI(String s) {
288 try {
289 URI uri = new URI(s);
290 if (!"http".equals(uri.getScheme()) && !"https".equals(uri.getScheme())) {
291 return false;
292 }
293 } catch (Exception e) {
294 return false;
295 }
296 return true;
297 }
298
299 private ValidationMode getValidationMode(HttpServletRequest request) {
300 final String parameter = "validation-mode";
301 final String validationMode = request.getParameter(parameter);
302 if (validationMode == null)
303 return ValidationMode.NONE;
304 if ("none".equalsIgnoreCase(validationMode))
305 return ValidationMode.NONE;
306 if ("validate".equalsIgnoreCase(validationMode))
307 return ValidationMode.VALIDATE;
308 if ("validate-fix".equalsIgnoreCase(validationMode))
309 return ValidationMode.VALIDATE_AND_FIX;
310 throw new IllegalArgumentException(
311 String.format("Invalid value '%s' for '%s' parameter.", validationMode, parameter)
312 );
313 }
314
315 private ExtractionParameters getExtractionParameters(HttpServletRequest request) {
316 final ValidationMode mode = getValidationMode(request);
317 return new ExtractionParameters(DefaultConfiguration.singleton(), mode);
318 }
319
320 private boolean isReport(HttpServletRequest request) {
321 return request.getParameter("report") != null;
322 }
323
324 private boolean isAnnotated(HttpServletRequest request) {
325 return request.getParameter("annotate") != null;
326 }
327
328 private boolean isOpenIE(HttpServletRequest request) {
329 return request.getParameter("openie") != null;
330 }
331
332 }