1   package eu.fbk.dh.tint.resources.morpho;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.collect.HashMultimap;
5   import com.google.common.io.Files;
6   import eu.fbk.utils.core.CommandLine;
7   import eu.fbk.utils.core.FrequencyHashSet;
8   import org.slf4j.Logger;
9   import org.slf4j.LoggerFactory;
10  
11  import java.io.BufferedWriter;
12  import java.io.File;
13  import java.io.FileWriter;
14  import java.util.ArrayList;
15  import java.util.HashMap;
16  import java.util.HashSet;
17  import java.util.List;
18  import java.util.regex.Matcher;
19  import java.util.regex.Pattern;
20  
21  /**
22   * Created by alessio on 18/05/16.
23   */
24  
25  public class MorphItEaglesConverter {
26  
27      private static final Logger LOGGER = LoggerFactory.getLogger(MorphItEaglesConverter.class);
28      private static Pattern morphoType = Pattern.compile("^([A-Z0-9-]+):?");
29      private static Pattern fstanPattern = Pattern.compile("([^~]+~)?[^+]+(\\+.*)");
30  
31      static HashMap<String, String> noLemmaTypes = new HashMap<>();
32      static HashMultimap<String, String> manuallyMapped = HashMultimap.create();
33      static HashSet<String> skipTypes = new HashSet<>();
34  
35      static {
36          noLemmaTypes.put("PON", "[PUNCT]");
37          noLemmaTypes.put("SENT", "[PUNCT]");
38          noLemmaTypes.put("SMI", "[SMILE]");
39          noLemmaTypes.put("SYM", "[SYMBOL]");
40          noLemmaTypes.put("ABL", "[ABL]");
41  
42          skipTypes.add("AUX");
43          skipTypes.add("CAU");
44          skipTypes.add("MOD");
45          skipTypes.add("TALE");
46          skipTypes.add("ASP");
47          skipTypes.add("DET-WH");
48          skipTypes.add("CE");
49          skipTypes.add("CI");
50          skipTypes.add("SI");
51          skipTypes.add("NPR");
52          skipTypes.add("WH-CHE");
53          skipTypes.add("ART-M");
54          skipTypes.add("ART-F");
55  
56          manuallyMapped.put("PRO-DEMO-F-P", "+pron+_+m+3+plur+dim");
57          manuallyMapped.put("PRO-PERS-1-M-S", "+pron+nom+_+1+sing+strong");
58          manuallyMapped.put("PRO-PERS-1-M-P", "+pron+nom+_+1+plur+strong");
59          manuallyMapped.put("PRO-PERS-1-F-S", "+pron+nom+_+1+sing+strong");
60          manuallyMapped.put("PRO-PERS-1-F-P", "+pron+nom+_+1+plur+strong");
61          manuallyMapped.put("PRO-PERS-2-F-P", "+pron+nom+_+2+plur+strong");
62          manuallyMapped.put("PRO-PERS-2-M-P", "+pron+nom+_+2+plur+strong");
63          manuallyMapped.put("PRO-PERS-2-F-S", "+pron+nom+_+2+sing+strong");
64          manuallyMapped.put("PRO-PERS-2-M-S", "+pron+nom+_+2+sing+strong");
65          manuallyMapped.put("PRO-PERS-CLI-3-F-P", "+pron+acc+f+3+plur+clit");
66          manuallyMapped.put("PRO-PERS-CLI-3-M-S", "+pron+acc+m+3+sing+clit");
67          manuallyMapped.put("PRO-PERS-CLI-2-F-P", "+pron+acc+f+2+plur+clit");
68          manuallyMapped.put("PRO-PERS-CLI-2-F-S", "+pron+acc+f+2+sing+clit");
69          manuallyMapped.put("PRO-PERS-CLI-1-F-S", "+pron+acc+f+1+sing+clit");
70          manuallyMapped.put("PRO-PERS-CLI-1-F-P", "+pron+acc+f+1+plur+clit");
71          manuallyMapped.put("PRO-PERS-CLI-1-M-P", "+pron+acc+m+1+plur+clit");
72          manuallyMapped.put("PRO-WH-F-P", "+pron+_+_+3+plur+int");
73          manuallyMapped.put("PRO-WH-M-S", "+pron+_+_+3+sing+int");
74          manuallyMapped.put("PRO-WH-F-S", "+pron+_+_+3+sing+int");
75          manuallyMapped.put("PRO-WH-M-P", "+pron+_+_+3+plur+int");
76          manuallyMapped.put("PRO-INDEF-M-P", "+pron+_+m+3+sing+ind");
77          manuallyMapped.put("PRO-POSS-F-P", "+pron+f+plur+pst+poss");
78          manuallyMapped.put("PRO-POSS-F-S", "+pron+f+sing+pst+poss");
79          manuallyMapped.put("PRO-POSS-M-P", "+pron+m+plur+pst+poss");
80          manuallyMapped.put("PRO-POSS-M-S", "+pron+m+sing+pst+poss");
81          manuallyMapped.put("PRO-INDEF-F-P", "+pron+_+f+3+plur+ind");
82          manuallyMapped.put("PRO-INDEF-M-P", "+pron+_+m+3+plur+ind");
83          manuallyMapped.put("PRO-INDEF-F-S", "+pron+_+f+3+sing+ind");
84          manuallyMapped.put("PRO-INDEF-M-S", "+pron+_+m+3+sing+ind");
85  
86          manuallyMapped.put("VER:part+past+p+f+gli", "+v+part+pass+f+nil+plur/gli~pro+pron+dat+_+3+_");
87          manuallyMapped.put("VER:impr+pres+2+p", "+v+imp+pres+nil+2+plur");
88          manuallyMapped.put("VER:impr+pres+1+p", "+v+imp+pres+nil+1+plur");
89          manuallyMapped.put("VER:sub+pres+1+s", "+v+cong+pres+nil+1+sing");
90          manuallyMapped.put("VER:sub+pres+1+p", "+v+cong+pres+nil+1+plur");
91          manuallyMapped.put("VER:sub+pres+3+s", "+v+cong+pres+nil+3+sing");
92          manuallyMapped.put("VER:impr+pres+2+p+veli",
93                  "+v+imp+pres+nil+2+plur/vi~voi+pron+dat+_+2+plur/li~pro+pron+acc+m+3+plur");
94          manuallyMapped.put("VER:impr+pres+2+s+celo",
95                  "+v+imp+pres+nil+2+sing/ci~pro+pron+dat+_+1+plur/lo~pro+pron+acc+m+3+sing");
96          manuallyMapped
97                  .put("VER:impr+pres+2+s+celo", "+v+imp+pres+nil+2+sing/ci~loc+pron+loc+_+3+_/lo~pro+pron+acc+m+3+sing");
98          manuallyMapped
99                  .put("VER:impr+pres+2+s+celi", "+v+imp+pres+nil+2+sing/ci~loc+pron+loc+_+3+_/lo~pro+pron+acc+m+3+plur");
100         manuallyMapped
101                 .put("VER:impr+pres+2+p+celi", "+v+imp+pres+nil+2+plur/ci~loc+pron+loc+_+3+_/lo~pro+pron+acc+m+3+plur");
102         manuallyMapped.put("VER:impr+pres+2+s+mele",
103                 "+v+imp+pres+nil+2+sing/mi~io+pron+dat+_+1+sing/le~pro+pron+acc+f+3+plur");
104         manuallyMapped.put("VER:part+pres+s+m", "+v+part+pres+nil+nil+sing");
105         manuallyMapped.put("VER:part+past+s+f+ne", "+v+part+pass+f+nil+sing/ne~part+pron+gen+_+3+_");
106         manuallyMapped.put("VER:part+past+s+f+gli", "+v+part+pass+f+nil+sing/gli~pro+pron+dat+_+3+_");
107         manuallyMapped.put("VER:ger+pres+vene",
108                 "+v+gerundio+pres+nil+nil+nil/vi~voi+pron+dat+_+2+plur/ne~part+pron+gen+_+3+_");
109         manuallyMapped.put("VER:ger+pres+celo",
110                 "+v+gerundio+pres+nil+nil+nil/ci~pro+pron+dat+_+1+plur/lo~pro+pron+acc+m+3+sing");
111         manuallyMapped.put("VER:ger+pres+celi",
112                 "+v+gerundio+pres+nil+nil+nil/ci~pro+pron+dat+_+1+plur/lo~pro+pron+acc+m+3+plur");
113         manuallyMapped.put("VER:ger+pres+cela",
114                 "+v+gerundio+pres+nil+nil+nil/ci~pro+pron+dat+_+1+plur/lo~pro+pron+acc+f+3+sing");
115         manuallyMapped.put("VER:part+pres+p+f", "+v+part+pres+nil+nil+plur");
116         manuallyMapped.put("VER:sub+impf+2+s", "+v+cong+imperf+nil+2+sing");
117         manuallyMapped.put("VER:inf+pres+vele",
118                 "+v+gerundio+pres+nil+nil+nil/vi~voi+pron+dat+_+2+plur/le~pro+pron+acc+f+3+plur");
119 
120         manuallyMapped.put("ADJ:comp+f+p", "+adj+_+plur+pst");
121         manuallyMapped.put("ADJ:comp+f+s", "+adj+_+sing+pst");
122         manuallyMapped.put("ADJ:comp+m+s", "+adj+_+sing+pst");
123         manuallyMapped.put("ADJ:comp+m+p", "+adj+_+plur+pst");
124 
125         manuallyMapped.put("DET-NUM-CARD", "+adj+_+_+pst+num");
126         manuallyMapped.put("DET-POSS:m+s", "+adj+m+sing+pst+poss");
127         manuallyMapped.put("DET-POSS:m+p", "+adj+m+plur+pst+poss");
128     }
129 
130     public static void main(String[] args) {
131         final CommandLine cmd = CommandLine
132                 .parser()
133                 .withName("morphit-converter")
134                 .withHeader("Convert Morph-It dataset to be compliant with fstan")
135                 .withOption("i", "input", "input file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
136                 .withOption("o", "output", "output file", "FILE", CommandLine.Type.FILE, true, false, true)
137                 .withOption("f", "fstan-command", "fstan TextPro command", "COMMAND", CommandLine.Type.FILE_EXISTING,
138                         true, false, true)
139                 .withOption("m", "fstan-model", "fstan TextPro model", "MODEL FILE", CommandLine.Type.FILE_EXISTING,
140                         true, false, true)
141                 .withOption(null, "no-fstan", "Do not use fstan for initial population")
142                 .withOption(null, "use-spaces", "Use spaces (instead of tabs) as separator")
143                 .withLogger(LoggerFactory.getLogger("eu.fbk.dh")).parse(args);
144 
145         final File inputPath = cmd.getOptionValue("i", File.class);
146         final File outputPath = cmd.getOptionValue("o", File.class);
147 
148         final String fstanCommand = cmd.getOptionValue("f", String.class);
149         final String fstanModel = cmd.getOptionValue("m", String.class);
150 
151         boolean useFstan = !cmd.hasOption("no-fstan");
152         boolean useSpaces = cmd.hasOption("use-spaces");
153 
154         char separator = '\t';
155         if (useSpaces) {
156             separator = ' ';
157         }
158 
159         FstanRunner runner = new FstanRunner(fstanCommand, fstanModel);
160         HashSet<String> buffer = new HashSet<>();
161 
162         HashSet<String> allForms = new HashSet<>();
163 
164         try {
165             BufferedWriter writer = new BufferedWriter(new FileWriter(outputPath));
166             HashSet<String> types = new HashSet<>();
167 
168             HashMultimap<String, String> typeToForm = HashMultimap.create();
169             FrequencyHashSet<String> formMeanings = new FrequencyHashSet<>();
170 
171             List<String> lines = Files.readLines(inputPath, Charsets.ISO_8859_1);
172             for (String line : lines) {
173                 line = line.trim();
174                 if (line.length() == 0) {
175                     continue;
176                 }
177 
178                 String[] parts = line.split("\\s+");
179                 if (parts.length != 3) {
180                     LOGGER.error("Invalid line: {}", line);
181                     continue;
182                 }
183 
184                 String form = parts[0];
185                 String lemma = parts[1];
186                 String morpho = parts[2];
187 
188                 Matcher matcher = morphoType.matcher(morpho);
189                 if (!matcher.find()) {
190                     LOGGER.warn("Invalid pattern: {}", morpho);
191                     continue;
192                 }
193 
194                 String type = matcher.group(1);
195                 if (noLemmaTypes.containsKey(type)) {
196                     continue;
197                 }
198 
199                 allForms.add(form);
200 
201                 if (skipTypes.contains(type)) {
202                     continue;
203                 }
204                 if (manuallyMapped.keys().contains(morpho)) {
205                     continue;
206                 }
207 
208                 types.add(morpho);
209                 typeToForm.put(morpho, form);
210                 formMeanings.add(form);
211             }
212 
213             for (String form : formMeanings.keySet()) {
214                 if (formMeanings.get(form) == 1) {
215                     buffer.add(form);
216                 }
217             }
218 
219             HashMap<String, String> mappings = new HashMap<>();
220 
221             for (String type : typeToForm.keySet()) {
222 
223                 HashSet<String> nonAmbiguous = new HashSet<>();
224                 for (String form : typeToForm.get(type)) {
225                     if (buffer.contains(form)) {
226                         nonAmbiguous.add(form);
227                     }
228 
229                     if (nonAmbiguous.size() > 1000) {
230                         break;
231                     }
232                 }
233 
234                 ArrayList<String> toFstan = new ArrayList<>(nonAmbiguous);
235                 ArrayList<String[]> run = runner.run(toFstan);
236 
237                 FrequencyHashSet<String> fstanForms = new FrequencyHashSet<>();
238 
239                 for (int i = 0; i < toFstan.size(); i++) {
240                     String[] res = run.get(i);
241 
242                     if (res.length == 1) {
243                         continue;
244                     }
245 
246                     for (int j = 1; j < res.length; j++) {
247                         String okForm = res[j];
248                         Matcher matcher = fstanPattern.matcher(okForm);
249                         if (!matcher.find()) {
250                             LOGGER.error("Error in form: {}", okForm);
251                             continue;
252                         }
253 
254                         String suffix = matcher.group(2);
255                         fstanForms.add(suffix);
256                     }
257                 }
258 
259                 if (fstanForms.size() == 0) {
260                     LOGGER.warn("No forms for: {}", type);
261                     continue;
262                 }
263 
264                 String mf = fstanForms.mostFrequent();
265 
266                 mappings.put(type, mf);
267             }
268 
269             ArrayList<String> allFormsArray = new ArrayList<>(allForms);
270             HashMultimap<String, String> fstanMorpho = HashMultimap.create();
271 
272             if (useFstan) {
273                 LOGGER.info("Running fstan");
274                 ArrayList<String[]> run = runner.run(allFormsArray);
275                 for (int i = 0; i < allFormsArray.size(); i++) {
276                     String form = allFormsArray.get(i);
277                     String[] morphos = run.get(i);
278                     if (morphos.length > 1) {
279                         for (int j = 1; j < morphos.length; j++) {
280                             String morpho = morphos[j];
281                             fstanMorpho.put(form, morpho);
282                         }
283                     }
284                 }
285             }
286 
287             LOGGER.info("Adding unknown forms");
288             lines = Files.readLines(inputPath, Charsets.ISO_8859_1);
289             for (String line : lines) {
290                 line = line.trim();
291                 if (line.length() == 0) {
292                     continue;
293                 }
294 
295                 String[] parts = line.split("\\s+");
296                 if (parts.length != 3) {
297                     LOGGER.error("Invalid line: {}", line);
298                     continue;
299                 }
300 
301                 String form = parts[0];
302                 String lemma = parts[1];
303                 if (fstanMorpho.containsKey(form)) {
304                     for (String s : fstanMorpho.get(form)) {
305                         writer.append(form).append(separator)
306                                 .append(lemma).append(separator)
307                                 .append(s).append("\n");
308                     }
309                     continue;
310                 }
311 
312                 String morpho = parts[2];
313 
314                 Matcher matcher = morphoType.matcher(morpho);
315                 if (!matcher.find()) {
316                     LOGGER.warn("Invalid pattern: {}", morpho);
317                     continue;
318                 }
319 
320                 String type = matcher.group(1);
321                 if (skipTypes.contains(type)) {
322                     continue;
323                 }
324                 if (noLemmaTypes.containsKey(type)) {
325                     writer.append(form).append(separator)
326                             .append(lemma).append(separator)
327                             .append(noLemmaTypes.get(type)).append("\n");
328                     continue;
329                 }
330 
331                 String finalMorpho = null;
332 
333                 if (manuallyMapped.keys().contains(morpho)) {
334                     for (String m : manuallyMapped.get(morpho)) {
335                         finalMorpho = lemma + m;
336                     }
337                 }
338 
339                 if (finalMorpho == null) {
340                     String eaglesMorpho = mappings.get(morpho);
341                     if (eaglesMorpho == null) {
342                         LOGGER.error(morpho);
343                         System.exit(1);
344                     }
345 
346                     finalMorpho = lemma + eaglesMorpho;
347                 }
348 
349                 writer.append(form).append(separator)
350                         .append(lemma).append(separator);
351                 if (finalMorpho.contains("/")) {
352                     writer.append(lemma).append("~");
353                 }
354                 writer.append(finalMorpho).append("\n");
355             }
356 
357             writer.append("il").append(separator).append("il").append(separator).append("il+art+m+sing").append("\n");
358             writer.append("lo").append(separator).append("il").append(separator).append("il+art+m+sing").append("\n");
359             writer.append("la").append(separator).append("la").append(separator).append("la+art+f+sing").append("\n");
360             writer.append("i").append(separator).append("il").append(separator).append("il+art+m+plur").append("\n");
361             writer.append("gli").append(separator).append("il").append(separator).append("il+art+m+plur").append("\n");
362             writer.append("le").append(separator).append("la").append(separator).append("la+art+f+plur").append("\n");
363 
364             writer.append("un").append(separator).append("un").append(separator).append("un+art+m+sing").append("\n");
365             writer.append("uno").append(separator).append("un").append(separator).append("un+art+m+sing").append("\n");
366             writer.append("una").append(separator).append("una").append(separator).append("una+art+f+sing").append("\n");
367 
368             writer.close();
369         } catch (Exception e) {
370 //            logger.error(e.getMessage());
371             e.printStackTrace();
372         }
373     }
374 }