1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.ExtractorDescription;
25 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
26 import org.apache.any23.vocab.DCTerms;
27 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
28 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
29 import org.w3c.dom.Document;
30
31 import java.io.IOException;
32
33
34
35
36
37
38 public class TitleExtractor implements TagSoupDOMExtractor {
39
40 private static final DCTerms vDCTERMS = DCTerms.getInstance();
41
42 @Override
43 public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
44 ExtractionResult out) throws IOException, ExtractionException {
45 final Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(SimpleValueFactory.getInstance(),
46 out, extractionContext.getDefaultLanguage());
47
48 try {
49 String title = DomUtils.find(in, "/HTML/HEAD/TITLE/text()").trim();
50 if (title != null && (title.length() != 0)) {
51 out.writeTriple(extractionContext.getDocumentIRI(), vDCTERMS.title, valueFactory.createLiteral(title));
52 }
53 } finally {
54 valueFactory.setIssueReport(null);
55 }
56 }
57
58 @Override
59 public ExtractorDescription getDescription() {
60 return TitleExtractorFactory.getDescriptionInstance();
61 }
62
63 }