1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.rdf;
19
20 import org.apache.any23.util.MathUtils;
21 import org.apache.any23.util.StringUtils;
22 import org.eclipse.rdf4j.model.BNode;
23 import org.eclipse.rdf4j.model.IRI;
24 import org.eclipse.rdf4j.model.Literal;
25 import org.eclipse.rdf4j.model.Resource;
26 import org.eclipse.rdf4j.model.Statement;
27 import org.eclipse.rdf4j.model.Value;
28 import org.eclipse.rdf4j.model.ValueFactory;
29 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
30 import org.eclipse.rdf4j.model.vocabulary.RDF;
31 import org.eclipse.rdf4j.rio.RDFFormat;
32 import org.eclipse.rdf4j.rio.RDFParser;
33 import org.eclipse.rdf4j.rio.RDFParserRegistry;
34 import org.eclipse.rdf4j.rio.RDFWriter;
35 import org.eclipse.rdf4j.rio.Rio;
36 import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
37 import org.eclipse.rdf4j.rio.helpers.StatementCollector;
38 import org.slf4j.Logger;
39 import org.slf4j.LoggerFactory;
40
41 import javax.xml.datatype.DatatypeConfigurationException;
42 import javax.xml.datatype.DatatypeFactory;
43 import javax.xml.datatype.XMLGregorianCalendar;
44 import java.io.ByteArrayInputStream;
45 import java.io.IOException;
46 import java.io.InputStream;
47 import java.io.OutputStream;
48 import java.io.Writer;
49 import java.net.URISyntaxException;
50 import java.nio.charset.StandardCharsets;
51 import java.text.ParseException;
52 import java.text.SimpleDateFormat;
53 import java.util.Collection;
54 import java.util.Date;
55 import java.util.GregorianCalendar;
56 import java.util.Locale;
57 import java.util.Optional;
58 import java.util.TimeZone;
59
60
61
62
63
64
65
66
67 public class RDFUtils {
68
// Running counter read and incremented by makeIRI(String, IRI, boolean) (when addId is set)
// and by makeIRI(); it is a plain static int with no synchronization.
69 private static int nodeId = 0;
70
// Shared RDF4J value factory that the iri/literal/bnode/triple/quad helpers of this class delegate to.
71 private static final ValueFactory valueFactory = SimpleValueFactory.getInstance();
72
// Class logger; used by isAbsoluteIRI to trace rejected inputs.
73 private static final Logger LOG = LoggerFactory.getLogger(RDFUtils.class);
74
// Zero-length array passed to Collection.toArray by parseRDF to obtain a typed Statement[].
75 private static final Statement[] EMPTY_STATEMENTS = new Statement[0];
76
// Private constructor: every member of this class is static, so it is never instantiated.
77 private RDFUtils() {
78 }
79
80
81
82
83
84
85
86
87
88
89
90
91 public static String fixAbsoluteIRI(String uri) {
92 String fixed = fixIRIWithException(uri);
93 if (!fixed.matches("[a-zA-Z0-9]+:/.*"))
94 throw new IllegalArgumentException("not a absolute org.eclipse.rdf4j.model.IRI: " + uri);
95
96 if (fixed.matches("https?://[a-zA-Z0-9.-]+(:[0-9+])?")) {
97 fixed = fixed + "/";
98 }
99 return fixed;
100 }
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118 public static String getXSDDate(String dateToBeParsed, String format)
119 throws ParseException, DatatypeConfigurationException {
120 SimpleDateFormat simpleDateFormat = new SimpleDateFormat(format, Locale.ROOT);
121 Date date = simpleDateFormat.parse(dateToBeParsed);
122 GregorianCalendar gc = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
123 gc.setTime(date);
124 XMLGregorianCalendar xml = DatatypeFactory.newInstance().newXMLGregorianCalendar(gc);
125 xml.setTimezone(0);
126 return xml.toString();
127 }
128
129
130
131
132
133
134
135
136
137 public static String toXSDDateTime(Date date) {
138 SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
139 String s = simpleDateFormat.format(date);
140 StringBuilder sb = new StringBuilder(s);
141 sb.insert(22, ':');
142 return sb.toString();
143 }
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159 public static String fixIRIWithException(String unescapedIRI) {
160 if (unescapedIRI == null)
161 throw new IllegalArgumentException("org.eclipse.rdf4j.model.IRI was null");
162
163
164 String escapedIRI = unescapedIRI.trim();
165
166
167 escapedIRI = escapedIRI.replaceAll(" ", "%20");
168
169
170 escapedIRI = escapedIRI.replaceAll("\n", "");
171
172
173 if (escapedIRI.startsWith("\\") || escapedIRI.startsWith("\""))
174 escapedIRI = escapedIRI.substring(1);
175
176 if (escapedIRI.endsWith("\\") || escapedIRI.endsWith("\""))
177 escapedIRI = escapedIRI.substring(0, escapedIRI.length() - 1);
178
179
180 if (escapedIRI.matches("^[a-zA-Z0-9]+:/?/?$"))
181 throw new IllegalArgumentException("no authority in org.eclipse.rdf4j.model.IRI: " + unescapedIRI);
182
183
184 if (escapedIRI.matches("^javascript:"))
185 throw new IllegalArgumentException("org.eclipse.rdf4j.model.IRI starts with javascript: " + unescapedIRI);
186
187
188
189
190
191 escapedIRI = escapedIRI.replaceAll(">.*$", "");
192
193
194 if (escapedIRI.matches("[<>\\[\\]|\\*\\{\\}\"\\\\]"))
195 throw new IllegalArgumentException("Invalid character in org.eclipse.rdf4j.model.IRI: " + unescapedIRI);
196
197 return escapedIRI;
198 }
199
200
201
202
203
204
205
206
207
208 public static org.eclipse.rdf4j.model.IRI iri(String iri) {
209 return valueFactory.createIRI(iri);
210 }
211
212
213
214
215
216
217
218
219
220
221
222 public static org.eclipse.rdf4j.model.IRI iri(String namespace, String localName) {
223 return valueFactory.createIRI(namespace, localName);
224 }
225
226
227
228
229
230
231
232
233
234 public static Literal literal(String s) {
235 return valueFactory.createLiteral(s);
236 }
237
238
239
240
241
242
243
244
245
246 public static Literal literal(boolean b) {
247 return valueFactory.createLiteral(b);
248 }
249
250
251
252
253
254
255
256
257
258 public static Literal literal(byte b) {
259 return valueFactory.createLiteral(b);
260 }
261
262
263
264
265
266
267
268
269
270 public static Literal literal(short s) {
271 return valueFactory.createLiteral(s);
272 }
273
274
275
276
277
278
279
280
281
282 public static Literal literal(int i) {
283 return valueFactory.createLiteral(i);
284 }
285
286
287
288
289
290
291
292
293
294 public static Literal literal(long l) {
295 return valueFactory.createLiteral(l);
296 }
297
298
299
300
301
302
303
304
305
306 public static Literal literal(float f) {
307 return valueFactory.createLiteral(f);
308 }
309
310
311
312
313
314
315
316
317
318 public static Literal literal(double d) {
319 return valueFactory.createLiteral(d);
320 }
321
322
323
324
325
326
327
328
329
330
331
332 public static Literal literal(String s, String l) {
333 if (l == null) {
334
335 return valueFactory.createLiteral(s);
336 } else {
337 return valueFactory.createLiteral(s, l);
338 }
339 }
340
341
342
343
344
345
346
347
348
349
350
351 public static Literal literal(String s, org.eclipse.rdf4j.model.IRI datatype) {
352 return valueFactory.createLiteral(s, datatype);
353 }
354
355
356
357
358
359
360
361
362
363
364 public static BNode bnode(String id) {
365 return valueFactory.createBNode(id);
366 }
367
368
369
370
371 public static BNode bnode() {
372 return valueFactory.createBNode();
373 }
374
375
376
377
378
379
380
381
382
383
384 public static BNode getBNode(String id) {
385 return valueFactory.createBNode("node" + MathUtils.md5(id));
386 }
387
388
389
390
391
392
393
394
395
396
397
398
399
400 public static Statement triple(Resource s, org.eclipse.rdf4j.model.IRI p, Value o) {
401 return valueFactory.createStatement(s, p, o);
402 }
403
404
405
406
407
408
409
410
411
412
413
414
415
416 public static Statement triple(String s, String p, String o) {
417 return valueFactory.createStatement((Resource) toValue(s), (org.eclipse.rdf4j.model.IRI) toValue(p),
418 toValue(o));
419 }
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435 public static Statement quad(Resource s, org.eclipse.rdf4j.model.IRI p, Value o, Resource g) {
436 return valueFactory.createStatement(s, p, o, g);
437 }
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453 public static Statement quad(String s, String p, String o, String g) {
454 return valueFactory.createStatement((Resource) toValue(s), (org.eclipse.rdf4j.model.IRI) toValue(p), toValue(o),
455 (Resource) toValue(g));
456 }
457
458
459
460
461
462
463
464
465
466
467 public static Value toValue(String s) {
468 if ("a".equals(s))
469 return RDF.TYPE;
470 if (s.matches("[a-z0-9]+:.*")) {
471 return PopularPrefixes.get().expand(s);
472 }
473 return valueFactory.createLiteral(s);
474 }
475
476
477
478
479
480
481
482 public static Collection<RDFFormat> getFormats() {
483 return RDFParserRegistry.getInstance().getKeys();
484 }
485
486
487
488
489
490
491
492
493
494
495
496
497 public static RDFParser getParser(RDFFormat format) {
498 return Rio.createParser(format);
499 }
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514 public static RDFWriter getWriter(RDFFormat format, Writer writer) {
515 return Rio.createWriter(format, writer);
516 }
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531 public static RDFWriter getWriter(RDFFormat format, OutputStream os) {
532 return Rio.createWriter(format, os);
533 }
534
535
536
537
538
539
540
541
542
543
544
545
546 public static Optional<RDFFormat> getFormatByExtension(String ext) {
547 if (!ext.startsWith("."))
548 ext = "." + ext;
549 return Rio.getParserFormatForFileName(ext);
550 }
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568 public static Statement[] parseRDF(RDFFormat format, InputStream is, String baseIRI) throws IOException {
569 final StatementCollector handler = new StatementCollector();
570 final RDFParser parser = getParser(format);
571 parser.getParserConfig().set(BasicParserSettings.VERIFY_DATATYPE_VALUES, true);
572 parser.setPreserveBNodeIDs(true);
573 parser.setRDFHandler(handler);
574 parser.parse(is, baseIRI);
575 return handler.getStatements().toArray(EMPTY_STATEMENTS);
576 }
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592 public static Statement[] parseRDF(RDFFormat format, InputStream is) throws IOException {
593 return parseRDF(format, is, "");
594 }
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610 public static Statement[] parseRDF(RDFFormat format, String in) throws IOException {
611 return parseRDF(format, new ByteArrayInputStream(in.getBytes(StandardCharsets.UTF_8)));
612 }
613
614
615
616
617
618
619
620
621
622
623
624
625 public static Statement[] parseRDF(String resource) throws IOException {
626 final int extIndex = resource.lastIndexOf('.');
627 if (extIndex == -1)
628 throw new IllegalArgumentException("Error while detecting the extension in resource name " + resource);
629 final String extension = resource.substring(extIndex + 1);
630 return parseRDF(getFormatByExtension(extension).orElseThrow(Rio.unsupportedFormat(extension)),
631 RDFUtils.class.getResourceAsStream(resource));
632 }
633
634
635
636
637
638
639
640
641
642 public static boolean isAbsoluteIRI(String href) {
643 try {
644 SimpleValueFactory.getInstance().createIRI(href.trim());
645 new java.net.URI(href.trim());
646 return true;
647 } catch (IllegalArgumentException e) {
648 LOG.trace("Error processing href: {}", href, e);
649 return false;
650 } catch (URISyntaxException e) {
651 LOG.trace("Error interpreting href: {} as URI.", href, e);
652 return false;
653 }
654 }
655
656
657
658
659
660
661
662
663
664
665 public static Resource makeIRI(IRI docUri) {
666 return makeIRI("node", docUri);
667 }
668
669
670
671
672
673
674
675
676
677
678
679
680
681 public static Resource makeIRI(String type, IRI docIRI) {
682 return makeIRI(type, docIRI, false);
683 }
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703 public static Resource makeIRI(String type, IRI docIRI, boolean addId) {
704
705
706
707 String newType = StringUtils.implementJavaNaming(type);
708
709 String iriString;
710 if (docIRI.toString().endsWith("/") || docIRI.toString().endsWith("#")) {
711 iriString = docIRI.toString() + newType;
712 } else {
713 iriString = docIRI.toString() + "#" + newType;
714 }
715
716 if (addId) {
717 iriString = iriString + "_" + Integer.toString(nodeId);
718 }
719
720 Resource node = RDFUtils.iri(iriString);
721 if (addId) {
722 nodeId++;
723 }
724 return node;
725 }
726
727
728
729
730
731
732
733
734
735
736
737
738 public static Value makeIRI(String inString) {
739 if (RDFUtils.isAbsoluteIRI(inString)) {
740 return RDFUtils.iri(inString);
741 } else {
742 return RDFUtils.literal(inString);
743 }
744 }
745
746 public static Value makeIRI() {
747 BNode bnode = bnode(Integer.toString(nodeId));
748 nodeId++;
749 return bnode;
750 }
751
752 }