1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html.microformats2;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractionResult;
22 import org.apache.any23.extractor.ExtractorDescription;
23 import org.apache.any23.extractor.TagSoupExtractionResult;
24 import org.apache.any23.vocab.HResume;
25 import org.eclipse.rdf4j.model.BNode;
26 import org.eclipse.rdf4j.model.Resource;
27 import org.eclipse.rdf4j.model.vocabulary.RDF;
28 import org.w3c.dom.Node;
29 import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
30 import org.apache.any23.extractor.html.HTMLDocument;
31 import org.apache.any23.extractor.html.DomUtils;
32 import java.util.List;
33
34
35
36
37
38
39 public class HResumeExtractor extends EntityBasedMicroformatExtractor {
40
41 private static final HResume vResume = HResume.getInstance();
42
43 private static final String[] resumeFields = { "name", "summary", "contact", "education", "experience", "skill",
44 "affiliation" };
45
46 @Override
47 public ExtractorDescription getDescription() {
48 return HResumeExtractorFactory.getDescriptionInstance();
49 }
50
51 @Override
52 public String getBaseClassName() {
53 return Microformats2Prefixes.CLASS_PREFIX + "resume";
54 }
55
56 @Override
57 protected void resetExtractor() {
58
59 }
60
61 @Override
62 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
63 if (null == node)
64 return false;
65 BNode person = getBlankNodeFor(node);
66 out.writeTriple(person, RDF.TYPE, vResume.Resume);
67 final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
68
69 addName(fragment, person);
70 addSummary(fragment, person);
71 addSkills(fragment, person);
72
73 addExperiences(fragment, person);
74 addEducations(fragment, person);
75
76 addAffiliations(fragment, person);
77 addContacts(fragment, person);
78
79 final TagSoupExtractionResult../org/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
80 tser.addResourceRoot(DomUtils.getXPathListForNode(node), person, this.getClass());
81
82 return true;
83 }
84
85 private void addContacts(HTMLDocument doc, Resource entry) throws ExtractionException {
86 List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[2]
87 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card");
88 if (nodes.isEmpty())
89 return;
90 HCardExtractorFactoryormats2/HCardExtractorFactory.html#HCardExtractorFactory">HCardExtractorFactory factory = new HCardExtractorFactory();
91 HCardExtractor extractor = factory.createExtractor();
92 for (Node node : nodes) {
93 BNode contact = valueFactory.createBNode();
94 addIRIProperty(contact, RDF.TYPE, vResume.contact);
95 extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), contact, getCurrentExtractionResult());
96 }
97 }
98
99 private void addAffiliations(HTMLDocument doc, Resource entry) throws ExtractionException {
100 List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[6]
101 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card");
102 if (nodes.isEmpty())
103 return;
104 HCardExtractorFactoryormats2/HCardExtractorFactory.html#HCardExtractorFactory">HCardExtractorFactory factory = new HCardExtractorFactory();
105 HCardExtractor extractor = factory.createExtractor();
106 for (Node node : nodes) {
107 BNode affiliation = valueFactory.createBNode();
108 addIRIProperty(affiliation, RDF.TYPE, vResume.affiliation);
109 extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), affiliation,
110 getCurrentExtractionResult());
111 }
112 }
113
114 private void addName(HTMLDocument doc, Resource person) {
115 HTMLDocument.TextField name = doc.getSingularTextField(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[0]);
116 conditionallyAddStringProperty(name.source(), person, vResume.name, name.value());
117 }
118
119 private void addSummary(HTMLDocument doc, Resource person) {
120 HTMLDocument.TextField summary = doc
121 .getSingularTextField(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[1]);
122 conditionallyAddStringProperty(summary.source(), person, vResume.summary, summary.value());
123 }
124
125 private void addSkills(HTMLDocument doc, Resource person) {
126 final HTMLDocument.TextField[] skills = doc
127 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[5]);
128 for (HTMLDocument.TextField skill : skills) {
129 conditionallyAddStringProperty(skill.source(), person, vResume.skill, skill.value());
130 }
131
132 }
133
134 private void addExperiences(HTMLDocument doc, Resource person) throws ExtractionException {
135 List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[4]
136 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "event");
137 if (nodes.isEmpty())
138 return;
139 HEventExtractorFactoryrmats2/HEventExtractorFactory.html#HEventExtractorFactory">HEventExtractorFactory factory = new HEventExtractorFactory();
140 HEventExtractor extractor = factory.createExtractor();
141 for (Node node : nodes) {
142 BNode event = valueFactory.createBNode();
143 addIRIProperty(event, RDF.TYPE, vResume.experience);
144 extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), event, getCurrentExtractionResult());
145 }
146 }
147
148 private void addEducations(HTMLDocument doc, Resource person) throws ExtractionException {
149 List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[3]
150 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "event");
151 if (nodes.isEmpty())
152 return;
153 HEventExtractorFactoryrmats2/HEventExtractorFactory.html#HEventExtractorFactory">HEventExtractorFactory factory = new HEventExtractorFactory();
154 HEventExtractor extractor = factory.createExtractor();
155 for (Node node : nodes) {
156 BNode event = valueFactory.createBNode();
157 addIRIProperty(event, RDF.TYPE, vResume.education);
158 extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), event, getCurrentExtractionResult());
159 }
160 }
161 }