1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionResult;
21 import org.apache.any23.extractor.ExtractorDescription;
22 import org.apache.any23.extractor.TagSoupExtractionResult;
23 import org.apache.any23.vocab.DOAC;
24 import org.apache.any23.vocab.FOAF;
25 import org.eclipse.rdf4j.model.BNode;
26 import org.eclipse.rdf4j.model.Resource;
27 import org.eclipse.rdf4j.model.vocabulary.RDF;
28 import org.w3c.dom.Node;
29
30 import java.util.List;
31
32
33
34
35
36
37 public class HResumeExtractor extends EntityBasedMicroformatExtractor {
38
39 private static final FOAF vFOAF = FOAF.getInstance();
40 private static final DOAC vDOAC = DOAC.getInstance();
41
42 @Override
43 public ExtractorDescription getDescription() {
44 return HResumeExtractorFactory.getDescriptionInstance();
45 }
46
47 @Override
48 public String getBaseClassName() {
49 return "hresume";
50 }
51
52 @Override
53 protected void resetExtractor() {
54
55 }
56
57 @Override
58 protected boolean extractEntity(Node node, ExtractionResult out) {
59 if (null == node)
60 return false;
61 BNode person = getBlankNodeFor(node);
62
63 out.writeTriple(person, RDF.TYPE, vFOAF.Person);
64 final HTMLDocument fragment = new HTMLDocument(node);
65 addSummary(fragment, person);
66 addContact(fragment, person);
67 addExperiences(fragment, person);
68 addEducations(fragment, person);
69 addAffiliations(fragment, person);
70 addSkills(fragment, person);
71
72 final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
73 tser.addResourceRoot(DomUtils.getXPathListForNode(node), person, this.getClass());
74
75 return true;
76 }
77
78 private void addSummary(HTMLDocument doc, Resource person) {
79 HTMLDocument.TextField summary = doc.getSingularTextField("summary");
80 conditionallyAddStringProperty(summary.source(), person, vDOAC.summary, summary.value());
81 }
82
83 private void addContact(HTMLDocument doc, Resource person) {
84 List<Node> nodes = doc.findAllByClassName("contact");
85 if (nodes.size() > 0)
86 addBNodeProperty(nodes.get(0), person, vFOAF.isPrimaryTopicOf, getBlankNodeFor(nodes.get(0)));
87 }
88
89 private void addExperiences(HTMLDocument doc, Resource person) {
90 List<Node> nodes = doc.findAllByClassName("experience");
91 for (Node node : nodes) {
92 BNode exp = valueFactory.createBNode();
93 if (addExperience(exp, new HTMLDocument(node)))
94 addBNodeProperty(node, person, vDOAC.experience, exp);
95 }
96 }
97
98 private boolean addExperience(Resource exp, HTMLDocument document) {
99 final Node documentNode = document.getDocument();
100 String check = "";
101
102 HTMLDocument.TextField value = document.getSingularTextField("title");
103 check += value;
104 conditionallyAddStringProperty(value.source(), exp, vDOAC.title, value.value().trim());
105
106 value = document.getSingularTextField("dtstart");
107 check += value;
108 conditionallyAddStringProperty(documentNode, exp, vDOAC.start_date, value.value().trim());
109
110 value = document.getSingularTextField("dtend");
111 check += value;
112 conditionallyAddStringProperty(documentNode, exp, vDOAC.end_date, value.value().trim());
113
114 value = document.getSingularTextField("summary");
115 check += value;
116 conditionallyAddStringProperty(documentNode, exp, vDOAC.organization, value.value().trim());
117
118 return !"".equals(check);
119 }
120
121 private void addEducations(HTMLDocument doc, Resource person) {
122 List<Node> nodes = doc.findAllByClassName("education");
123 for (Node node : nodes) {
124 BNode exp = valueFactory.createBNode();
125 if (addExperience(exp, new HTMLDocument(node)))
126 addBNodeProperty(node, person, vDOAC.education, exp);
127 }
128 }
129
130 private void addAffiliations(HTMLDocument doc, Resource person) {
131 List<Node> nodes = doc.findAllByClassName("affiliation");
132 for (Node node : nodes) {
133 addBNodeProperty(node, person, vDOAC.affiliation, getBlankNodeFor(node));
134 }
135 }
136
137 private void addSkills(HTMLDocument doc, Resource person) {
138 List<Node> nodes;
139
140
141 nodes = doc.findAllByClassName("skill");
142 for (Node node : nodes) {
143 conditionallyAddStringProperty(node, person, vDOAC.skill, extractSkillValue(node));
144 }
145
146 nodes = doc.findAllByClassName("skills");
147 for (Node node : nodes) {
148 String nodeText = node.getTextContent();
149 String[] skills = nodeText.split(",");
150 for (String skill : skills) {
151 conditionallyAddStringProperty(node, person, vDOAC.skill, skill.trim());
152 }
153 }
154 }
155
156 private String extractSkillValue(Node n) {
157 String name = n.getNodeName();
158 String skill = null;
159 if ("A".equals(name) && DomUtils.hasAttribute(n, "rel", "tag")) {
160 skill = n.getAttributes().getNamedItem("href").getTextContent();
161 } else {
162 skill = n.getTextContent();
163 }
164 return skill;
165 }
166
167 }