1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.AbstractAny23TestBase;
21 import org.apache.any23.extractor.IssueReport;
22 import org.apache.any23.extractor.IssueReport.Issue;
23 import org.apache.any23.extractor.IssueReport.IssueLevel;
24 import org.apache.any23.extractor.ExtractionException;
25 import org.apache.any23.extractor.ExtractorFactory;
26 import org.apache.any23.extractor.SingleDocumentExtraction;
27 import org.apache.any23.extractor.SingleDocumentExtractionReport;
28 import org.apache.any23.rdf.RDFUtils;
29 import org.apache.any23.vocab.SINDICE;
30 import org.apache.any23.writer.RepositoryWriter;
31 import org.junit.After;
32 import org.junit.Assert;
33 import org.junit.Before;
34 import org.eclipse.rdf4j.common.iteration.Iterations;
35 import org.eclipse.rdf4j.model.BNode;
36 import org.eclipse.rdf4j.model.Literal;
37 import org.eclipse.rdf4j.model.Resource;
38 import org.eclipse.rdf4j.model.Statement;
39 import org.eclipse.rdf4j.model.IRI;
40 import org.eclipse.rdf4j.model.Value;
41 import org.eclipse.rdf4j.repository.RepositoryConnection;
42 import org.eclipse.rdf4j.repository.RepositoryException;
43 import org.eclipse.rdf4j.repository.RepositoryResult;
44 import org.eclipse.rdf4j.repository.sail.SailRepository;
45 import org.eclipse.rdf4j.rio.RDFFormat;
46 import org.eclipse.rdf4j.rio.RDFHandlerException;
47 import org.eclipse.rdf4j.rio.RDFParseException;
48 import org.eclipse.rdf4j.rio.Rio;
49 import org.eclipse.rdf4j.sail.Sail;
50 import org.eclipse.rdf4j.sail.memory.MemoryStore;
51 import org.slf4j.Logger;
52 import org.slf4j.LoggerFactory;
53
54 import java.io.ByteArrayOutputStream;
55 import java.io.IOException;
56 import java.io.PrintStream;
57 import java.io.StringWriter;
58 import java.lang.invoke.MethodHandles;
59 import java.nio.charset.StandardCharsets;
60 import java.util.ArrayList;
61 import java.util.Collection;
62 import java.util.Collections;
63 import java.util.List;
64 import java.util.Locale;
65 import java.util.Map;
66
67
68
69
70 public abstract class AbstractExtractorTestCase extends AbstractAny23TestBase {
71
72 private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
73
74
75
76
77 protected static IRI baseIRI = RDFUtils.iri("http://bob.example.com/");
78
79
80
81
82 protected RepositoryConnection conn;
83
84
85
86
87 private SingleDocumentExtractionReport report;
88
89 private Sail store;
90
91 private SailRepository repository;
92
93
94
95
96 public AbstractExtractorTestCase() {
97 super();
98 }
99
100
101
102
103 protected abstract ExtractorFactory<?> getExtractorFactory();
104
105
106
107
108
109
110
111 @Before
112 public void setUp() throws Exception {
113 super.setUp();
114 store = new MemoryStore();
115 repository = new SailRepository(store);
116 repository.init();
117 conn = repository.getConnection();
118 }
119
120
121
122
123
124
125
126
127 @After
128 public void tearDown() throws RepositoryException {
129 try {
130 conn.close();
131 } finally {
132 repository.shutDown();
133 }
134 conn = null;
135 report = null;
136 store = null;
137 repository = null;
138 }
139
140
141
142
143 protected RepositoryConnection getConnection() {
144 return conn;
145 }
146
147
148
149
150 protected SingleDocumentExtractionReport getReport() {
151 return report;
152 }
153
154
155
156
157
158
159
160
161
162 protected Collection<IssueReport.Issue> getIssues(String extractorName) {
163 for (Map.Entry<String, Collection<IssueReport.Issue>> issueEntry : report.getExtractorToIssues().entrySet()) {
164 if (issueEntry.getKey().equals(extractorName)) {
165 return issueEntry.getValue();
166 }
167 }
168 return Collections.emptyList();
169 }
170
171
172
173
174
175
176 protected Collection<IssueReport.Issue> getIssues() {
177 return getIssues(getExtractorFactory().getExtractorName());
178 }
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194 protected void extract(String resource) throws ExtractionException, IOException {
195 SingleDocumentExtraction ex = new SingleDocumentExtraction(
196 new HTMLFixture(copyResourceToTempFile(resource)).getOpener(baseIRI.stringValue()),
197 getExtractorFactory(), new RepositoryWriter(conn));
198 ex.setMIMETypeDetector(null);
199 report = ex.run();
200 }
201
202
203
204
205
206
207
208
209
210 protected void assertExtract(String resource, boolean assertNoIssues) {
211 try {
212 extract(resource);
213 if (assertNoIssues)
214 assertNoIssues();
215 } catch (ExtractionException ex) {
216 throw new RuntimeException(ex);
217 } catch (IOException ex) {
218 throw new RuntimeException(ex);
219 }
220 }
221
222
223
224
225
226
227
228
229 protected void assertExtract(String resource) {
230 assertExtract(resource, true);
231 }
232
233
234
235
236
237
238
239
240
241
242
243
244
245 protected void assertContains(IRI p, Resource o) throws RepositoryException {
246 assertContains(null, p, o);
247 }
248
249
250
251
252
253
254
255
256
257
258
259
260
261 protected void assertContains(IRI p, String o) throws RepositoryException {
262 assertContains(null, p, RDFUtils.literal(o));
263 }
264
265
266
267
268
269
270
271
272
273
274
275
276
277 protected void assertNotContains(IRI p, Resource o) throws RepositoryException {
278 assertNotContains(null, p, o);
279 }
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295 protected void assertContains(Resource s, IRI p, Value o) throws RepositoryException {
296 Assert.assertTrue(
297 getFailedExtractionMessage() + String.format(Locale.ROOT, "Cannot find triple (%s %s %s)", s, p, o),
298 conn.hasStatement(s, p, o, false));
299 }
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315 protected void assertNotContains(Resource s, IRI p, String o) throws RepositoryException {
316 Assert.assertFalse(getFailedExtractionMessage(), conn.hasStatement(s, p, RDFUtils.literal(o), false));
317 }
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333 protected void assertNotContains(Resource s, IRI p, Resource o) throws RepositoryException {
334 Assert.assertFalse(getFailedExtractionMessage(), conn.hasStatement(s, p, o, false));
335 }
336
337
338
339
340
341
342
343
344 protected void assertModelNotEmpty() throws RepositoryException {
345 Assert.assertFalse("The model is expected to not be empty." + getFailedExtractionMessage(), conn.isEmpty());
346 }
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362 protected void assertNotContains(Resource s, IRI p, Literal o) throws RepositoryException {
363 Assert.assertFalse(getFailedExtractionMessage(), conn.hasStatement(s, p, o, false));
364 }
365
366
367
368
369
370
371
372
373 protected void assertModelEmpty() throws RepositoryException {
374 Assert.assertTrue(getFailedExtractionMessage(), conn.isEmpty());
375 }
376
377
378
379
380 protected void assertNoIssues() {
381 for (Map.Entry<String, Collection<IssueReport.Issue>> entry : report.getExtractorToIssues().entrySet()) {
382 if (entry.getValue().size() > 0) {
383 log.debug("Unexpected issue for extractor " + entry.getKey() + " : " + entry.getValue());
384 }
385 for (Issue nextIssue : entry.getValue()) {
386 if (nextIssue.getLevel() == IssueLevel.ERROR || nextIssue.getLevel() == IssueLevel.FATAL) {
387 Assert.fail("Unexpected issue for extractor " + entry.getKey() + " : " + entry.getValue());
388 }
389 }
390 }
391 }
392
393
394
395
396
397
398
399
400
401 protected void assertIssue(IssueReport.IssueLevel level, String issueRegex) {
402 final Collection<IssueReport.Issue> issues = getIssues(getExtractorFactory().getExtractorName());
403 boolean found = false;
404 for (IssueReport.Issue issue : issues) {
405 if (issue.getLevel() == level && issue.getMessage().matches(issueRegex)) {
406 found = true;
407 break;
408 }
409 }
410 Assert.assertTrue(String.format(Locale.ROOT, "Cannot find issue with level %s matching expression '%s'", level,
411 issueRegex), found);
412 }
413
414
415
416
417
418
419
420
421
422
423
424 public void assertContainsModel(Statement[] statements) throws RepositoryException {
425 for (Statement statement : statements) {
426 assertContains(statement);
427 }
428 }
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446 public void assertContainsModel(String modelResource)
447 throws RDFHandlerException, IOException, RDFParseException, RepositoryException {
448 getConnection().remove(null, SINDICE.getInstance().date, (Value) null, (Resource) null);
449 getConnection().remove(null, SINDICE.getInstance().size, (Value) null, (Resource) null);
450 assertContainsModel(RDFUtils.parseRDF(modelResource));
451 }
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469 protected void assertStatementsSize(Resource s, IRI p, Value o, int expected)
470 throws RDFHandlerException, RepositoryException {
471 int statementsSize = getStatementsSize(s, p, o);
472 if (statementsSize != expected) {
473 final ByteArrayOutputStream baos = new ByteArrayOutputStream();
474 PrintStream ps = new PrintStream(baos, true, StandardCharsets.UTF_8);
475 getConnection().exportStatements(s, p, o, true, Rio.createWriter(RDFFormat.NQUADS, ps));
476 }
477
478 Assert.assertEquals("Unexpected number of matching statements.", expected, statementsSize);
479 }
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495 protected void assertStatementsSize(IRI p, Value o, int expected) throws RDFHandlerException, RepositoryException {
496 assertStatementsSize(null, p, o, expected);
497 }
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513 protected void assertStatementsSize(IRI p, String o, int expected) throws RDFHandlerException, RepositoryException {
514 assertStatementsSize(p, o == null ? null : RDFUtils.literal(o), expected);
515 }
516
517
518
519
520
521
522
523
524
525
526
527
528
529 protected void assertNotFound(Resource s, IRI p) throws RepositoryException {
530 RepositoryResult<Statement> statements = conn.getStatements(s, p, null, true);
531 try {
532 Assert.assertFalse("Expected no statements.", statements.hasNext());
533 } finally {
534 statements.close();
535 }
536 }
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552 protected Resource findExactlyOneBlankSubject(IRI p, Value o) throws RepositoryException {
553 RepositoryResult<Statement> it = conn.getStatements(null, p, o, false);
554 try {
555 Assert.assertTrue(getFailedExtractionMessage(), it.hasNext());
556 Statement stmt = it.next();
557 Resource result = stmt.getSubject();
558 Assert.assertTrue(getFailedExtractionMessage(), result instanceof BNode);
559 Assert.assertFalse(getFailedExtractionMessage(), it.hasNext());
560 return result;
561 } finally {
562 it.close();
563 }
564 }
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580 protected Value findExactlyOneObject(Resource s, IRI p) throws RepositoryException {
581 RepositoryResult<Statement> it = conn.getStatements(s, p, null, false);
582 try {
583 Assert.assertTrue(getFailedExtractionMessage(), it.hasNext());
584 return it.next().getObject();
585 } finally {
586 it.close();
587 }
588 }
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604 protected List<Resource> findSubjects(IRI p, Value o) throws RepositoryException {
605 RepositoryResult<Statement> it = conn.getStatements(null, p, o, false);
606 List<Resource> subjects = new ArrayList<Resource>();
607 try {
608 Statement statement;
609 while (it.hasNext()) {
610 statement = it.next();
611 subjects.add(statement.getSubject());
612 }
613 } finally {
614 it.close();
615 }
616 return subjects;
617 }
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633 protected List<Value> findObjects(Resource s, IRI p) throws RepositoryException {
634 RepositoryResult<Statement> it = conn.getStatements(s, p, null, false);
635 List<Value> objects = new ArrayList<Value>();
636 try {
637 Statement statement;
638 while (it.hasNext()) {
639 statement = it.next();
640 objects.add(statement.getObject());
641 }
642 } finally {
643 it.close();
644 }
645 return objects;
646 }
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661 protected Value findObject(Resource s, IRI p) throws RepositoryException {
662 RepositoryResult<Statement> statements = conn.getStatements(s, p, null, true);
663 try {
664 Assert.assertTrue("Expected at least a statement.", statements.hasNext());
665 return (statements.next().getObject());
666 } finally {
667 statements.close();
668 }
669 }
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685 protected Resource findObjectAsResource(Resource s, IRI p) throws RepositoryException {
686 final Value v = findObject(s, p);
687 try {
688 return (Resource) v;
689 } catch (ClassCastException cce) {
690 Assert.fail("Expected resource object, found: " + v.getClass().getSimpleName());
691 throw new IllegalStateException();
692 }
693 }
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709 protected String findObjectAsLiteral(Resource s, IRI p) throws RepositoryException {
710 return findObject(s, p).stringValue();
711 }
712
713
714
715
716
717
718
719
720
721
722 protected String dumpModelToTurtle() throws RepositoryException {
723 StringWriter w = new StringWriter();
724 try {
725 conn.export(Rio.createWriter(RDFFormat.TURTLE, w));
726 return w.toString();
727 } catch (RDFHandlerException ex) {
728 throw new RuntimeException(ex);
729 }
730 }
731
732
733
734
735
736
737
738
739
740
741 protected String dumpModelToNQuads() throws RepositoryException {
742 StringWriter w = new StringWriter();
743 try {
744 conn.export(Rio.createWriter(RDFFormat.NQUADS, w));
745 return w.toString();
746 } catch (RDFHandlerException ex) {
747 throw new RuntimeException(ex);
748 }
749 }
750
751
752
753
754
755
756
757
758
759
760 protected String dumpModelToRDFXML() throws RepositoryException {
761 StringWriter w = new StringWriter();
762 try {
763 conn.export(Rio.createWriter(RDFFormat.RDFXML, w));
764 return w.toString();
765 } catch (RDFHandlerException ex) {
766 throw new RuntimeException(ex);
767 }
768 }
769
770
771
772
773
774
775
776
777
778
779 protected List<Statement> dumpAsListOfStatements() throws RepositoryException {
780 return Iterations.asList(conn.getStatements(null, null, null, false));
781 }
782
783
784
785
786
787
788
789
790 protected String dumpHumanReadableTriples() throws RepositoryException {
791 StringBuilder sb = new StringBuilder();
792 RepositoryResult<Statement> result = conn.getStatements(null, null, null, false);
793 while (result.hasNext()) {
794 Statement statement = result.next();
795 sb.append(String.format(Locale.ROOT, "%s %s %s %s\n", statement.getSubject(), statement.getPredicate(),
796 statement.getObject(), statement.getContext()));
797
798 }
799 return sb.toString();
800 }
801
802
803
804
805
806
807
808
809
810
811
812
813
814 protected void assertContains(Statement statement) throws RepositoryException {
815 Assert.assertTrue("Cannot find statement " + statement + " in model.",
816 conn.hasStatement(statement.getSubject() instanceof BNode ? null : statement.getSubject(),
817 statement.getPredicate(), statement.getObject() instanceof BNode ? null : statement.getObject(),
818 false));
819 }
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835 protected void assertContains(Resource s, IRI p, String l) throws RepositoryException {
836 assertContains(s, p, RDFUtils.literal(l));
837 }
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855 protected void assertContains(Resource s, IRI p, String l, String lang) throws RepositoryException {
856 assertContains(s, p, RDFUtils.literal(l, lang));
857 }
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875 protected RepositoryResult<Statement> getStatements(Resource s, IRI p, Value o) throws RepositoryException {
876 return conn.getStatements(s, p, o, false);
877 }
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895 protected int getStatementsSize(Resource s, IRI p, Value o) throws RepositoryException {
896 RepositoryResult<Statement> result = getStatements(s, p, o);
897 int count = 0;
898 try {
899 while (result.hasNext()) {
900 result.next();
901 count++;
902 }
903 } finally {
904 result.close();
905 }
906 return count;
907 }
908
909 private String getFailedExtractionMessage() throws RepositoryException {
910 return "Assertion failed! Extracted triples:\n" + dumpModelToNQuads();
911 }
912
913 }