1   package eu.fbk.dh.tint.resources.morpho;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.io.Files;
5   import eu.fbk.utils.core.CommandLine;
6   import org.slf4j.Logger;
7   import org.slf4j.LoggerFactory;
8   
9   import java.io.BufferedWriter;
10  import java.io.File;
11  import java.io.FileWriter;
12  import java.util.HashMap;
13  import java.util.List;
14  import java.util.regex.Matcher;
15  import java.util.regex.Pattern;
16  
17  /**
18   * Created by alessio on 18/05/16.
19   */
20  
21  public class MorphItConverter {
22  
23      private static final Logger LOGGER = LoggerFactory.getLogger(MorphItConverter.class);
24      private static Pattern morphoType = Pattern.compile("^([A-Z-]+):?");
25  
26      static HashMap<String, String> noLemmaTypes = new HashMap<>();
27  
28      static {
29          noLemmaTypes.put("PON", "[PUNCT]");
30          noLemmaTypes.put("SENT", "[PUNCT]");
31          noLemmaTypes.put("SMI", "[SMILE]");
32          noLemmaTypes.put("SYM", "[SYMBOL]");
33      }
34  
35      public static void main(String[] args) {
36          final CommandLine cmd = CommandLine
37                  .parser()
38                  .withName("morphit-converter")
39                  .withHeader("Convert Morph-It dataset to be compliant with fstan")
40                  .withOption("i", "input", "input file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
41                  .withOption("o", "output", "output file", "FILE", CommandLine.Type.FILE, true, false, true)
42                  .withLogger(LoggerFactory.getLogger("eu.fbk.dh")).parse(args);
43  
44          final File inputPath = cmd.getOptionValue("i", File.class);
45          final File outputPath = cmd.getOptionValue("o", File.class);
46  
47          try {
48              BufferedWriter writer = new BufferedWriter(new FileWriter(outputPath));
49  
50              List<String> lines = Files.readLines(inputPath, Charsets.UTF_8);
51              for (String line : lines) {
52                  line = line.trim();
53                  if (line.length() == 0) {
54                      continue;
55                  }
56  
57                  String[] parts = line.split("\\s+");
58                  if (parts.length != 3) {
59                      LOGGER.error("Invalid line: {}", line);
60                      continue;
61                  }
62  
63                  String form = parts[0];
64                  String lemma = parts[1];
65                  String morpho = parts[2];
66  
67                  Matcher matcher = morphoType.matcher(morpho);
68                  if (!matcher.find()) {
69                      LOGGER.warn("Invalid pattern: {}", morpho);
70                      continue;
71                  }
72  
73                  String type = matcher.group(1);
74  
75                  writer.append(form).append(" ");
76                  if (!noLemmaTypes.containsKey(type)) {
77                      writer.append(lemma).append("+");
78                  }
79                  writer.append(morpho).append("\n");
80              }
81  
82              writer.close();
83          } catch (Exception e) {
84              LOGGER.error(e.getMessage());
85          }
86      }
87  }