1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.any23.plugin.extractor.openie;
18
19 import java.io.IOException;
20 import java.util.List;
21
22 import javax.xml.transform.TransformerConfigurationException;
23 import javax.xml.transform.TransformerFactoryConfigurationError;
24
25 import org.apache.any23.extractor.Extractor;
26 import org.apache.any23.extractor.IssueReport;
27 import org.apache.any23.extractor.ExtractionContext;
28 import org.apache.any23.extractor.ExtractorDescription;
29 import org.apache.any23.plugin.Author;
30 import org.apache.any23.rdf.RDFUtils;
31 import org.apache.any23.util.StreamUtils;
32 import org.apache.tika.Tika;
33 import org.apache.tika.exception.TikaException;
34 import org.eclipse.rdf4j.model.IRI;
35 import org.eclipse.rdf4j.model.Resource;
36 import org.eclipse.rdf4j.model.Value;
37 import org.eclipse.rdf4j.model.vocabulary.RDF;
38 import org.eclipse.rdf4j.model.vocabulary.RDFS;
39 import org.apache.any23.extractor.ExtractionException;
40 import org.apache.any23.extractor.ExtractionParameters;
41 import org.apache.any23.extractor.ExtractionResult;
42
43 import org.slf4j.Logger;
44 import org.slf4j.LoggerFactory;
45 import org.w3c.dom.Document;
46
47 import edu.knowitall.openie.Argument;
48 import edu.knowitall.openie.Instance;
49 import edu.knowitall.openie.OpenIE;
50 import edu.knowitall.tool.parse.ClearParser;
51 import edu.knowitall.tool.postag.ClearPostagger;
52 import edu.knowitall.tool.srl.ClearSrl;
53 import edu.knowitall.tool.tokenize.ClearTokenizer;
54 import scala.collection.JavaConversions;
55 import scala.collection.Seq;
56
57
58
59
60
61
62 @Author(name="Lewis John McGibbney (lewismc@apache.org)")
63 public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor {
64
65 private static final Logger LOG = LoggerFactory.getLogger(OpenIEExtractor.class);
66
67
68
69
70 public OpenIEExtractor() {
71
72 }
73
74
75
76
77 @Override
78 public ExtractorDescription getDescription() {
79 return OpenIEExtractorFactory.getDescriptionInstance();
80 }
81
82 @Override
83 public void run(ExtractionParameters extractionParameters,
84 ExtractionContext context, Document in, ExtractionResult out)
85 throws IOException, ExtractionException {
86
87 Runtime runtime = Runtime.getRuntime();
88 long maxMemory = runtime.maxMemory();
89
90 runtime.gc();
91 long usedMemory = Math.max(0L, runtime.totalMemory() - runtime.freeMemory());
92 long availableMemory = maxMemory - usedMemory;
93 if (availableMemory < 4294967296L) {
94 out.notifyIssue(IssueReport.IssueLevel.FATAL,
95 "Not enough heap space available to perform OpenIE extraction: "
96 + (availableMemory/1048576L) + "/" + (maxMemory / 1048576L)
97 + " MB. Requires 4096 MB.", -1, -1);
98 LOG.error("Increase JVM heap size when running OpenIE extractor. max=" + maxMemory + "; available=" + availableMemory);
99 return;
100 }
101
102 IRI documentIRI = context.getDocumentIRI();
103 RDFUtils.iri(documentIRI.toString() + "root");
104 out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
105 out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE);
106 LOG.debug("Processing: {}", documentIRI.toString());
107
108 OpenIE openIE = new OpenIE(
109 new ClearParser(
110 new ClearPostagger(
111 new ClearTokenizer())), new ClearSrl(), false, false);
112
113 Seq<Instance> extractions = null;
114 Tika tika = new Tika();
115 try {
116 extractions = openIE.extract(tika.parseToString(StreamUtils.documentToInputStream(in)));
117 } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) {
118 LOG.error("Encountered error during OpenIE extraction.", e);
119 } catch (TikaException e) {
120 LOG.error("Encountered error whilst parsing InputStream with Tika.", e);
121 }
122
123 List<Instance> listExtractions = JavaConversions.seqAsJavaList(extractions);
124
125
126
127
128
129
130 String thresholdString;
131 try {
132 thresholdString = extractionParameters.getProperty("any23.extraction.openie.confidence.threshold");
133 } catch (RuntimeException e) {
134 thresholdString = null;
135 }
136 double threshold = thresholdString == null ? 0.5 : Double.parseDouble(thresholdString);
137 for(Instance instance : listExtractions) {
138 if (instance.confidence() > threshold) {
139 List<Argument> listArg2s = JavaConversions.seqAsJavaList(instance.extr().arg2s());
140 for(Argument argument : listArg2s) {
141 Resource subject = RDFUtils.makeIRI(instance.extr().arg1().text(), documentIRI);
142 IRI predicate = (IRI) RDFUtils.makeIRI(instance.extr().rel().text(), documentIRI);
143 Value object = RDFUtils.toValue(argument.text());
144 out.writeTriple(subject, predicate, object);
145 }
146 }
147 }
148 }
149 }