1   package eu.fbk.dh.tint.upos;
2   
3   import edu.stanford.nlp.ling.CoreAnnotation;
4   import edu.stanford.nlp.ling.CoreAnnotations;
5   import edu.stanford.nlp.ling.CoreLabel;
6   import edu.stanford.nlp.pipeline.Annotation;
7   import edu.stanford.nlp.pipeline.Annotator;
8   import edu.stanford.nlp.util.ArraySet;
9   import eu.fbk.utils.corenlp.CustomAnnotations;
10  
11  import java.util.*;
12  
13  public class UPosAnnotator implements Annotator {
14  
15      static Map<String, String> uposMap = new HashMap<>();
16      static String DEFAULT_UPOS = "X";
17  
18      static {
19          uposMap.put("A", "ADJ");
20          uposMap.put("AP", "DET");
21          uposMap.put("B", "ADV");
22          uposMap.put("BN", "ADV");
23          uposMap.put("CC", "CCONJ");
24          uposMap.put("CS", "SCONJ");
25          uposMap.put("DD", "DET");
26          uposMap.put("DE", "DET");
27          uposMap.put("DI", "DET");
28          uposMap.put("DQ", "DET");
29          uposMap.put("DR", "DET");
30          uposMap.put("E", "ADP");
31          uposMap.put("FB", "PUNCT");
32          uposMap.put("FC", "PUNCT");
33          uposMap.put("FF", "PUNCT");
34          uposMap.put("FS", "PUNCT");
35          uposMap.put("I", "INTJ");
36          uposMap.put("N", "NUM");
37          uposMap.put("NO", "ADJ");
38          uposMap.put("PART", "PART");
39          uposMap.put("PC", "PRON");
40          uposMap.put("PD", "PRON");
41          uposMap.put("PE", "PRON");
42          uposMap.put("PI", "PRON");
43          uposMap.put("PP", "PRON");
44          uposMap.put("PQ", "PRON");
45          uposMap.put("PR", "PRON");
46          uposMap.put("RD", "DET");
47          uposMap.put("RI", "DET");
48          uposMap.put("S", "NOUN");
49          uposMap.put("SP", "PROPN");
50          uposMap.put("SYM", "SYM");
51          uposMap.put("T", "DET");
52          uposMap.put("V", "VERB");
53          uposMap.put("VA", "AUX");
54          uposMap.put("VM", "AUX");
55      }
56  
57      @Override
58      public void annotate(Annotation annotation) {
59          for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
60              String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
61  
62              String[] parts = pos.split("\\+");
63              StringBuffer upos = new StringBuffer();
64              for (String part : parts) {
65                  String thisPos = uposMap.getOrDefault(part, DEFAULT_UPOS);
66                  upos.append("+").append(thisPos);
67              }
68              token.set(CustomAnnotations.UPosAnnotation.class, upos.substring(1));
69          }
70  
71      }
72  
73      @Override
74      public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
75          return Collections.singleton(CustomAnnotations.UPosAnnotation.class);
76      }
77  
78      @Override
79      public Set<Class<? extends CoreAnnotation>> requires() {
80          return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
81                  CoreAnnotations.PartOfSpeechAnnotation.class,
82                  CoreAnnotations.TokensAnnotation.class
83          )));
84      }
85  }