1   package eu.fbk.dh.tint.heideltime.annotator;
2   
3   import de.unihd.dbs.heideltime.standalone.DocumentType;
4   import de.unihd.dbs.heideltime.standalone.HeidelTimeStandalone;
5   import edu.stanford.nlp.ling.CoreAnnotation;
6   import edu.stanford.nlp.ling.CoreAnnotations;
7   import edu.stanford.nlp.ling.CoreLabel;
8   import edu.stanford.nlp.pipeline.Annotation;
9   import edu.stanford.nlp.pipeline.Annotator;
10  import eu.fbk.utils.core.PropertiesUtils;
11  import org.w3c.dom.Document;
12  import org.w3c.dom.Element;
13  import org.w3c.dom.NodeList;
14  
15  import javax.xml.parsers.DocumentBuilder;
16  import javax.xml.parsers.DocumentBuilderFactory;
17  import java.io.ByteArrayInputStream;
18  import java.io.InputStream;
19  import java.text.DateFormat;
20  import java.text.SimpleDateFormat;
21  import java.util.*;
22  
23  /**
24   * Created by alessio on 10/08/16.
25   */
26  
27  public class HeidelTimeAnnotator implements Annotator {
28  
29      HeidelTimeStandalone tagger;
30      static DateFormat format = new SimpleDateFormat("yyyy-MM-dd", Locale.ENGLISH);
31  
32      class TimexObject {
33  
34          private int start;
35          private int end;
36          private String timexType;
37          private String timexValue;
38  
39          public int getStart() {
40              return start;
41          }
42  
43          public void setStart(int start) {
44              this.start = start;
45          }
46  
47          public int getEnd() {
48              return end;
49          }
50  
51          public void setEnd(int end) {
52              this.end = end;
53          }
54  
55          public String getTimexType() {
56              return timexType;
57          }
58  
59          public void setTimexType(String timexType) {
60              this.timexType = timexType;
61          }
62  
63          public String getTimexValue() {
64              return timexValue;
65          }
66  
67          public void setTimexValue(String timexValue) {
68              this.timexValue = timexValue;
69          }
70  
71          public TimexObject(int start, int end, String timexType, String timexValue) {
72              this.start = start;
73              this.end = end;
74              this.timexType = timexType;
75              this.timexValue = timexValue;
76          }
77      }
78  
79      public HeidelTimeAnnotator(String annotatorName, Properties props) {
80  
81          // Todo: load an instance for each type
82          // Todo: add document creation datetime
83  
84          String configFile = props.getProperty(annotatorName + ".config", null);
85          String dtString = props.getProperty(annotatorName + ".type", "news");
86  
87          DocumentType documentType;
88          try {
89              documentType = DocumentType.valueOf(dtString.toUpperCase());
90          } catch (Exception e) {
91              documentType = DocumentType.NEWS;
92          }
93  
94          if (configFile == null) {
95              Properties convertedProperties = PropertiesUtils.dotConvertedProperties(props, annotatorName);
96              tagger = HeidelTimeModel.getInstance(convertedProperties, documentType).getTagger();
97          } else {
98              tagger = HeidelTimeModel.getInstance(configFile, documentType).getTagger();
99          }
100     }
101 
102     /**
103      * Given an Annotation, perform a task on this Annotation.
104      *
105      * @param annotation
106      */
107     @Override public void annotate(Annotation annotation) {
108         String text = annotation.get(CoreAnnotations.TextAnnotation.class);
109         if (text != null) {
110 
111             try {
112                 Date documentDate = new Date();
113 
114                 try {
115                     String creationDate = annotation.get(CoreAnnotations.DocDateAnnotation.class);
116                     documentDate = format.parse(creationDate);
117                 } catch (Exception e) {
118                     // ignored
119                 }
120                 String process = tagger.process(text, documentDate);
121 
122                 DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
123                 DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
124 
125                 InputStream textStream = new ByteArrayInputStream(process.getBytes());
126 
127                 Document doc = dBuilder.parse(textStream);
128                 doc.getDocumentElement().normalize();
129 
130                 Map<Integer, TimexObject> timexes = new HashMap<>();
131                 List<TimexObject> finalTimexes = new ArrayList<>();
132                 NodeList entries = doc.getElementsByTagName("*");
133 
134                 for (int i = 1; i < entries.getLength(); i++) {
135                     Element element = (Element) entries.item(i);
136                     if (element.getNodeName().equals("heideltime:Timex3")) {
137                         int begin = Integer.parseInt(element.getAttribute("begin"));
138                         int end = Integer.parseInt(element.getAttribute("end"));
139 
140                         String timexType = element.getAttribute("timexType");
141                         String timexValue = element.getAttribute("timexValue");
142 
143                         TimexObject timexObject = new TimexObject(begin, end, timexType, timexValue);
144                         finalTimexes.add(timexObject);
145                         timexes.put(begin, timexObject);
146                     }
147                 }
148 
149                 List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
150                 TimexObject timexObject = null;
151 
152                 for (CoreLabel token : tokens) {
153                     int begin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
154                     int end = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
155 
156                     if (timexObject != null && end > timexObject.getEnd()) {
157                         timexObject = null;
158                     }
159                     if (timexes.containsKey(begin)) {
160                         timexObject = timexes.get(begin);
161                     }
162 
163                     if (timexObject != null) {
164                         token.set(CoreAnnotations.NamedEntityTagAnnotation.class, timexObject.getTimexType());
165                         token.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class,
166                                 timexObject.getTimexValue());
167                         token.set(CoreAnnotations.ValueAnnotation.class,
168                                 text.substring(timexObject.getStart(), timexObject.getEnd()));
169                     }
170                 }
171 
172                 annotation.set(HeidelTimeAnnotations.TimexesAnnotation.class, finalTimexes);
173             } catch (Exception e) {
174                 e.printStackTrace();
175             }
176 
177         }
178     }
179 
180     /**
181      * Returns a set of requirements for which tasks this annotator can
182      * provide.  For example, the POS annotator will return "pos".
183      */
184     @Override public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
185         return Collections.emptySet();
186     }
187 
188     /**
189      * Returns the set of tasks which this annotator requires in order
190      * to perform.  For example, the POS annotator will return
191      * "tokenize", "ssplit".
192      */
193     @Override public Set<Class<? extends CoreAnnotation>> requires() {
194         return Collections.singleton(CoreAnnotations.TokensAnnotation.class);
195     }
196 }