1   package eu.fbk.dh.tint.digimorph.annotator;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.io.Resources;
5   import com.googlecode.concurrenttrees.radix.ConcurrentRadixTree;
6   import com.googlecode.concurrenttrees.radix.RadixTree;
7   import com.googlecode.concurrenttrees.radix.node.concrete.DefaultCharArrayNodeFactory;
8   import eu.fbk.dh.tint.digimorph.DigiMorph;
9   import eu.fbk.utils.core.FrequencyHashSet;
10  import org.mapdb.SortedTableMap;
11  import org.slf4j.Logger;
12  import org.slf4j.LoggerFactory;
13  
14  import java.io.IOException;
15  import java.net.URL;
16  import java.util.*;
17  
18  public class GuessModel {
19  
20      class Token {
21          String form;
22          String lemma;
23          String feats;
24  
25          @Override
26          public String toString() {
27              return "Token{" +
28                      "form='" + form + '\'' +
29                      ", lemma='" + lemma + '\'' +
30                      ", feats='" + feats + '\'' +
31                      '}';
32          }
33  
34          public Token(String form, String lemma, String feats) {
35              this.form = form;
36              this.lemma = lemma;
37              this.feats = feats;
38          }
39      }
40  
41      private static Set<String> absAdvs = new HashSet<>();
42      static {
43          absAdvs.add("ottimamente");
44          absAdvs.add("pessimamente");
45          absAdvs.add("massimamente");
46      }
47  
48      private static final Logger LOGGER = LoggerFactory.getLogger(GuessModel.class);
49      private HashSet<String> allowedTags = new HashSet<>();
50      private Map<String, RadixTree<LinkedList<String>>> trees = new HashMap<>();
51      private Map<String, String> featMappings = new HashMap<>();
52  
53      public String getMorphoFeatsForContentWords(String featString) {
54          String subToken = featString.replaceAll("^[^~]*~", "");
55          subToken = subToken.replaceAll("^[^+]*\\+", "");
56          return featMappings.get(subToken);
57      }
58  
59      public void addSexMorpho(Set<String> set, String sex) {
60          if (sex.equals("m")) {
61              set.add("Gender=Masc");
62          } else if (sex.equals("f")) {
63              set.add("Gender=Fem");
64          }
65      }
66  
67      public void addNumMorpho(Set<String> set, String num) {
68          if (num.equals("sing")) {
69              set.add("Number=Sing");
70          } else if (num.equals("plur")) {
71              set.add("Number=Plur");
72          }
73      }
74  
75      public void addPersMorpho(Set<String> set, String pers) {
76          if (pers.equals("1") || pers.equals("2") || pers.equals("3")) {
77              set.add("Person=" + pers);
78          }
79      }
80  
81      public void addTypeMorpho(Set<String> set, String type) {
82          if (type.equals("sup")) {
83              set.add("Degree=Abs");
84          }
85          if (type.equals("cmp")) {
86              set.add("Degree=Cmp");
87          }
88      }
89  
90      public String getMorphoFeats(String featString, String pos) {
91          Set<String> featureSet = new TreeSet<>();
92          String[] parts = featString.split("\\+");
93          if (parts.length > 1) {
94              switch (parts[1]) {
95                  case "adj":
96                      addSexMorpho(featureSet, parts[2]);
97                      addNumMorpho(featureSet, parts[3]);
98                      if (parts.length > 4) {
99                          addTypeMorpho(featureSet, parts[4]);
100                     }
101                     break;
102                 case "art":
103                     addSexMorpho(featureSet, parts[2]);
104                     addNumMorpho(featureSet, parts[3]);
105                     break;
106                 case "adv":
107                     // todo: add Cmp for adverbs?
108                     // It seems that it is not used in UD, but maybe it should be.
109                     // see https://it.wikipedia.org/wiki/Gradi_e_alterazioni_degli_avverbi
110                     if (parts[0].endsWith("issimo")) {
111                         featureSet.add("Degree=Abs");
112                     }
113                     if (absAdvs.contains(parts[0].toLowerCase())) {
114                         featureSet.add("Degree=Abs");
115                     }
116                     break;
117                 case "pron":
118                     addSexMorpho(featureSet, parts[3]);
119                     addPersMorpho(featureSet, parts[4]);
120                     addNumMorpho(featureSet, parts[5]);
121                     break;
122             }
123         }
124         switch (pos) {
125             case "A":
126             case "V":
127             case "VA":
128             case "VM":
129             case "S":
130                 return getMorphoFeatsForContentWords(featString);
131             case "AP":
132                 featureSet.add("Poss=Yes");
133                 featureSet.add("PronType=Prs");
134                 break;
135             case "BN":
136                 featureSet.add("PronType=Neg");
137                 break;
138             case "DD":
139                 featureSet.add("PronType=Dem");
140                 break;
141             case "DE":
142                 featureSet.add("PronType=Exc");
143                 break;
144             case "DI":
145                 featureSet.add("PronType=Ind");
146                 break;
147             case "DQ":
148                 featureSet.add("PronType=Int");
149                 break;
150             case "DR":
151                 featureSet.add("PronType=Rel");
152                 break;
153             case "I":
154                 switch (parts[0].toLowerCase()) {
155                     case "si":
156                     case "sì":
157                     case "si'":
158                         featureSet.add("Polarity=Pos");
159                         break;
160                     case "no":
161                         featureSet.add("Polarity=Neg");
162                         break;
163                 }
164                 break;
165             case "N":
166                 featureSet.add("NumType=Card");
167                 break;
168             case "NO":
169                 featureSet.add("NumType=Ord");
170                 break;
171             case "PC":
172                 featureSet.add("Clitic=Yes");
173                 featureSet.add("PronType=Prs");
174                 break;
175             case "PD":
176                 featureSet.add("PronType=Dem");
177                 break;
178             case "PE":
179             case "PP":
180                 featureSet.add("PronType=Prs");
181                 break;
182             case "PI":
183                 featureSet.add("PronType=Ind");
184                 break;
185             case "PQ":
186                 featureSet.add("PronType=Int");
187                 break;
188             case "PR":
189                 featureSet.add("PronType=Rel");
190                 break;
191             case "RD":
192                 featureSet.add("Definite=Def");
193                 featureSet.add("PronType=Art");
194                 break;
195             case "RI":
196                 featureSet.add("Definite=Ind");
197                 featureSet.add("PronType=Art");
198                 break;
199             case "SW":
200                 featureSet.add("Foreign=Yes");
201                 break;
202             case "T":
203                 featureSet.add("PronType=Tot");
204                 break;
205         }
206 
207         StringBuffer buffer = new StringBuffer();
208         int i = 0;
209         for (String s : featureSet) {
210             buffer.append(s);
211             if (++i < featureSet.size()) {
212                 buffer.append("|");
213             }
214         }
215         if (buffer.length() == 0) {
216             buffer.append("_");
217         }
218 
219         return buffer.toString();
220     }
221 
222     public GuessModel() {
223 
224         HashMap<String, String> uMap = new HashMap<>();
225         uMap.put("v", "VERB");
226         uMap.put("adv", "ADV");
227         uMap.put("adj", "ADJ");
228         uMap.put("n", "NOUN");
229         allowedTags.add("VERB");
230         allowedTags.add("NOUN");
231         allowedTags.add("ADJ");
232         allowedTags.add("ADV");
233 
234         URL adjResource = Resources.getResource("feat-mappings.txt");
235         try {
236             for (String line : Resources.readLines(adjResource, Charsets.UTF_8)) {
237                 line = line.trim();
238                 if (line.length() == 0) {
239                     continue;
240                 }
241                 String[] parts = line.split("\\s+");
242                 if (parts.length != 2) {
243                     continue;
244                 }
245                 featMappings.put(parts[0], parts[1]);
246             }
247 
248         } catch (IOException e) {
249             e.printStackTrace();
250         }
251 
252         for (String allowedTag : allowedTags) {
253             trees.put(allowedTag, new ConcurrentRadixTree<>(new DefaultCharArrayNodeFactory()));
254         }
255 
256         DigiMorph digiMorph = new DigiMorph();
257         SortedTableMap<String, String> map = digiMorph.getMap();
258         Iterator<String> gmIterator = map.keyIterator();
259         while (gmIterator.hasNext()) {
260             String key = gmIterator.next();
261             String value = map.get(key).trim();
262             String[] words = value.split("[\\s/]+");
263             for (String word : words) {
264                 String[] parts = word.split("\\+");
265 
266                 if (parts.length < 2) {
267                     continue;
268                 }
269                 String ePos = parts[1];
270                 if (ePos.length() == 0) {
271                     continue;
272                 }
273                 if (!uMap.keySet().contains(ePos)) {
274                     continue;
275                 }
276 
277                 // Feats
278                 String feats = getMorphoFeatsForContentWords(word);
279                 if (feats == null) {
280                     continue;
281                 }
282 
283                 String token = key.toLowerCase();
284                 String lemma = parts[0].toLowerCase();
285                 String reverse_token = new StringBuilder(token).reverse().toString();
286 
287                 LinkedList<String> features = new LinkedList<>();
288                 features.add(token);
289                 features.add(lemma);
290                 features.add(feats);
291 
292                 trees.get(uMap.get(ePos)).put(reverse_token, features);
293             }
294         }
295     }
296 
297     public Token guess(String token, String pos) {
298         String reverse_sample_query = new StringBuilder(token).reverse().toString();
299         FrequencyHashSet<String> values = new FrequencyHashSet<>();
300         Iterable<LinkedList<String>> closestForms = trees.get(pos).getValuesForClosestKeys(reverse_sample_query);
301         for (LinkedList<String> s : closestForms) {
302             values.add(s.get(2));
303         }
304 
305         String guess = values.mostFrequent();
306         String guessed_lemma = token;
307         for (LinkedList<String> closestForm : closestForms) {
308             String feat = closestForm.get(2);
309             if (feat.equals(guess)) {
310                 String lemma = closestForm.get(1).toLowerCase();
311                 String form = closestForm.get(0).toLowerCase();
312 
313                 int min = Math.min(form.length(), lemma.length());
314 
315                 for (int i = 0; i < min; i++) {
316                     char charForm = form.charAt(i);
317                     char charLemma = lemma.charAt(i);
318                     if (charForm != charLemma || i == min - 1) {
319                         String postfix = lemma.substring(i);
320                         int length = token.length() - form.length();
321                         String prefix = token.substring(0, i + length);
322                         guessed_lemma = prefix + postfix;
323                         break;
324                     }
325                 }
326                 break;
327             }
328         }
329 
330         return new Token(token, guessed_lemma, guess);
331     }
332 
333     public static void main(String[] args) {
334         GuessModel model = new GuessModel();
335 //        System.out.println(model.guess("smerdazzi", "NOUN"));
336         System.out.println(model.guess("sparacchio", "VERB"));
337     }
338 }