1   package eu.fbk.dh.tint.digimorph.annotator;
2   
3   import com.google.common.collect.SortedSetMultimap;
4   import com.google.common.collect.TreeMultimap;
5   import edu.stanford.nlp.ling.CoreAnnotation;
6   import edu.stanford.nlp.ling.CoreAnnotations;
7   import edu.stanford.nlp.ling.CoreLabel;
8   import edu.stanford.nlp.pipeline.Annotation;
9   import edu.stanford.nlp.pipeline.Annotator;
10  import edu.stanford.nlp.util.ArraySet;
11  import edu.stanford.nlp.util.CoreMap;
12  import eu.fbk.fcw.utils.ConllToken;
13  import eu.fbk.utils.core.PropertiesUtils;
14  import eu.fbk.utils.corenlp.CustomAnnotations;
15  
16  import java.util.*;
17  
18  /**
19   * Created by giovannimoretti on 19/05/16.
20   *
21   * @version 0.42a
22   */
23  public class DigiLemmaAnnotator implements Annotator {
24  
25      private static Map<String, String> pos_morpho_mapping = new HashMap<>();
26      private static Map<String, String> guessMap = new HashMap<>();
27      private static boolean DEFAULT_USE_GUESSER = true;
28      private static boolean DEFAULT_FEATURES = true;
29  
30      private boolean useGuesser, extractFeatures;
31      private GuessModel guesser;
32  
33      static private final String auxiliary = "VA";
34      static private final String verb = "V";
35      static private final String pNoun = "SP";
36      static private final Set<String> betweenAuxAndVerb = new HashSet<>(Arrays.asList("B", "BN"));
37  
38      static {
39          pos_morpho_mapping.put("A", "+adj");
40          pos_morpho_mapping.put("AP", "+adj");
41          pos_morpho_mapping.put("B", "+adv");
42          pos_morpho_mapping.put("BN", "+adv");
43          pos_morpho_mapping.put("C", "+conj");
44          pos_morpho_mapping.put("CC", "+conj");
45          pos_morpho_mapping.put("CS", "+conj");
46          pos_morpho_mapping.put("DD", "+adj");
47          pos_morpho_mapping.put("DE", "+adj");
48          pos_morpho_mapping.put("DI", "+adj");
49          pos_morpho_mapping.put("DQ", "+adj");
50          pos_morpho_mapping.put("DR", "+adj");
51          pos_morpho_mapping.put("DT", "+adj");
52          pos_morpho_mapping.put("E", "+prep");
53          pos_morpho_mapping.put("E+RD", "+prep");
54          pos_morpho_mapping.put("I", "+inter");
55          pos_morpho_mapping.put("N", "+adj");
56          pos_morpho_mapping.put("NO", "+adj");
57          pos_morpho_mapping.put("PC", "+pron");
58          pos_morpho_mapping.put("PD", "+pron");
59          pos_morpho_mapping.put("PE", "+pron");
60          pos_morpho_mapping.put("PI", "+pron");
61          pos_morpho_mapping.put("PP", "+pron");
62          pos_morpho_mapping.put("PQ", "+pron");
63          pos_morpho_mapping.put("PR", "+pron");
64          pos_morpho_mapping.put("RD", "+art");
65          pos_morpho_mapping.put("RI", "+art");
66          pos_morpho_mapping.put("S", "+n+");
67          pos_morpho_mapping.put("SP", "+n+");
68          pos_morpho_mapping.put("T", "+adj");
69          pos_morpho_mapping.put("V", "+v+");
70          pos_morpho_mapping.put("VA", "+v+");
71          pos_morpho_mapping.put("VA", "+v+");
72          pos_morpho_mapping.put("VM", "+v+");
73          pos_morpho_mapping.put("VM", "+v+");
74          pos_morpho_mapping.put("V+PC", "+v+");
75          guessMap.put("A", "ADJ");
76          guessMap.put("S", "NOUN");
77          guessMap.put("V", "VERB");
78          guessMap.put("B", "ADV");
79      }
80  
81      public DigiLemmaAnnotator(String annotatorName, Properties prop) {
82          useGuesser = PropertiesUtils.getBoolean(prop.getProperty(annotatorName + ".use_guesser"), DEFAULT_USE_GUESSER);
83          extractFeatures = PropertiesUtils.getBoolean(prop.getProperty(annotatorName + ".extract_features"), DEFAULT_FEATURES);
84  
85          //todo: the model is unique
86          if (useGuesser || extractFeatures) {
87              guesser = GuessModelInstance.getInstance().getModel();
88          }
89      }
90  
91      public void annotate(Annotation annotation) {
92          if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
93              for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
94  
95                  String last_valuable_genre = "";
96                  Boolean valid_aux = false;
97  
98                  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
99                  for (CoreLabel token : tokens) {
100 
101 
102                     String[] morph_fatures = token.get(DigiMorphAnnotations.MorphoAnnotation.class).split("\\s+");
103                     String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
104                     boolean isGuessable = guessMap.containsKey(pos);
105 
106                     boolean chosenGuess = true;
107                     String chosenLemma = morph_fatures[0];
108                     String chosenMorpho = "";
109                     String chosenFeaturesString = "";
110                     SortedSetMultimap<String, String> chosenFeatures = TreeMultimap.create();
111 
112                     if (!pos.equals(verb)) {
113                         if (pos.equals(auxiliary) || (valid_aux && betweenAuxAndVerb.contains(pos))) {
114                             valid_aux = true;
115                         } else {
116                             valid_aux = false;
117                         }
118                     }
119 
120                     if (!pos.equals(pNoun)) {
121 
122                         if (morph_fatures.length > 1) {
123 
124                             // One possible candidate
125                             if (morph_fatures.length == 2) {
126 
127                                 String finalMorpho = morph_fatures[1];
128 
129                                 if (finalMorpho.contains("+art") || finalMorpho.equals("+adj")) {
130                                     if (finalMorpho.contains("+m+")) {
131                                         last_valuable_genre = "m";
132                                     } else {
133                                         last_valuable_genre = "f";
134                                     }
135                                 }
136 
137                                 String featMapped = pos_morpho_mapping.get(pos);
138                                 boolean shouldBeGuessed = featMapped == null || !finalMorpho.contains(featMapped);
139 
140                                 if (isGuessable && useGuesser && shouldBeGuessed) {
141                                     GuessModel.Token guess = guesser.guess(token.word(), guessMap.get(pos));
142                                     chosenLemma = guess.lemma;
143                                 } else {
144                                     chosenLemma = finalMorpho.split("\\+")[0].split("~")[0];
145                                     chosenMorpho = finalMorpho;
146                                     if (!shouldBeGuessed) {
147                                         chosenGuess = false;
148                                     }
149                                 }
150                             }
151 
152                             // More candidates
153                             else {
154                                 // woking with multiple features element
155 
156                                 String featMapped = pos_morpho_mapping.get(pos);
157 
158                                 String possibleCandidate = "";
159                                 String firstCandidate = "";
160 
161                                 if (featMapped != null) {
162                                     for (String feature : morph_fatures) {
163                                         if (feature.contains(featMapped)) {
164                                             if (firstCandidate.length() == 0) {
165                                                 firstCandidate = feature;
166                                             }
167 
168                                             if (featMapped.equals("+art") || featMapped.equals("+adj")) {
169                                                 if (feature.contains("+m+")) {
170                                                     last_valuable_genre = "m";
171                                                 } else if (feature.contains("+f+")) {
172                                                     last_valuable_genre = "f";
173                                                 }
174                                             }
175 
176                                             if (last_valuable_genre.equals("m") && feature.contains("+m+")) {
177                                                 possibleCandidate = feature;
178                                             } else if (last_valuable_genre.equals("f") && feature.contains("+f+")) {
179                                                 possibleCandidate = feature;
180                                             }
181 
182                                             if (valid_aux && feature.contains("+part+")) {
183                                                 possibleCandidate = feature;
184                                                 valid_aux = false;
185                                             }
186 
187                                         }
188                                     }
189 
190                                     chosenGuess = false;
191 
192                                     if (possibleCandidate.length() > 0) {
193                                         chosenMorpho = possibleCandidate;
194                                         chosenLemma = possibleCandidate.split("\\+")[0].split("~")[0];
195                                     } else {
196                                         if (firstCandidate.length() > 0) {
197                                             chosenMorpho = firstCandidate;
198                                             chosenLemma = firstCandidate.split("\\+")[0].split("~")[0];
199                                         } else {
200                                             chosenGuess = true;
201                                             chosenLemma = token.word();
202                                             chosenMorpho = "";
203                                         }
204                                     }
205 
206                                 }
207                             }
208                         }
209                     }
210 
211                     if (isGuessable && chosenGuess && useGuesser) {
212                         GuessModel.Token guess = guesser.guess(token.word(), guessMap.get(pos));
213                         chosenFeaturesString = guess.feats;
214                         chosenFeatures = ConllToken.featureStringToAnnotation(guess.feats);
215                         chosenLemma = guess.lemma;
216                     }
217 
218                     if (!chosenGuess) {
219                         chosenFeaturesString = guesser.getMorphoFeats(chosenMorpho, pos);
220                         if (chosenFeaturesString != null) {
221                             chosenFeatures = ConllToken.featureStringToAnnotation(chosenFeaturesString);
222                         }
223                     }
224 
225                     token.set(CoreAnnotations.LemmaAnnotation.class, chosenLemma);
226                     token.set(DigiMorphAnnotations.SelectedMorphoAnnotation.class, chosenMorpho);
227                     token.set(DigiMorphAnnotations.GuessedLemmaAnnotation.class, chosenGuess);
228                     if (extractFeatures) {
229                         token.set(CoreAnnotations.FeaturesAnnotation.class, chosenFeaturesString);
230                         token.set(CustomAnnotations.FeaturesAnnotation.class, chosenFeatures.asMap());
231                     }
232                 }
233             }
234         }
235 
236     }
237 
238     /**
239      * Returns a set of requirements for which tasks this annotator can
240      * provide.  For example, the POS annotator will return "pos".
241      */
242     @Override
243     public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
244         return Collections.singleton(CoreAnnotations.LemmaAnnotation.class);
245     }
246 
247     /**
248      * Returns the set of tasks which this annotator requires in order
249      * to perform.  For example, the POS annotator will return
250      * "tokenize", "ssplit".
251      */
252     @Override
253     public Set<Class<? extends CoreAnnotation>> requires() {
254         return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
255                 CoreAnnotations.PartOfSpeechAnnotation.class,
256                 DigiMorphAnnotations.MorphoAnnotation.class,
257                 CoreAnnotations.TokensAnnotation.class,
258                 CoreAnnotations.SentencesAnnotation.class
259         )));
260     }
261 }