1   package eu.fbk.dh.tint.resources.ner;
2   
3   import eu.fbk.utils.core.CommandLine;
4   import org.slf4j.Logger;
5   import org.slf4j.LoggerFactory;
6   
7   import java.io.*;
8   import java.nio.charset.Charset;
9   
10  /**
11   * Created by alessio on 20/07/16.
12   */
13  
14  public class ConvertICAB {
15  
16      private static final Logger LOGGER = LoggerFactory.getLogger(ConvertICAB.class);
17  
18      public static void main(String[] args) {
19          try {
20              final CommandLine cmd = CommandLine
21                      .parser()
22                      .withName("./convert-icab")
23                      .withHeader("Convert I-CAB dataset for Stanford training")
24                      .withOption("i", "input", "Input training/test file in IOB2 format", "FILE",
25                              CommandLine.Type.FILE_EXISTING, true, false, true)
26                      .withOption("o", "output-stanford", "Output file for Stanford", "FILE",
27                              CommandLine.Type.FILE, true, false, true)
28                      .withOption("t", "output-text", "Output file text only", "FILE",
29                              CommandLine.Type.FILE, true, false, false)
30                      .withOption("k", "output-text-br", "Output file one-token-per-line", "FILE",
31                              CommandLine.Type.FILE, true, false, false)
32                      .withOption("g", "keep-gpe", "Keep GPE tags (default is to remove them)")
33                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
34  
35              File input = cmd.getOptionValue("input", File.class);
36              File output = cmd.getOptionValue("output-stanford", File.class);
37              File textOut = cmd.getOptionValue("output-text", File.class);
38              File textTok = cmd.getOptionValue("output-text-br", File.class);
39  
40              boolean keepGpe = cmd.hasOption("keep-gpe");
41  
42              BufferedWriter writer = new BufferedWriter(new FileWriter(output));
43              BufferedReader reader = new BufferedReader(
44                      new InputStreamReader(new FileInputStream(input), Charset.forName("ISO-8859-1")));
45              BufferedWriter textWriter = null;
46              BufferedWriter textTokWriter = null;
47              if (textOut != null) {
48                  textWriter = new BufferedWriter(new FileWriter(textOut));
49              }
50              if (textTok != null) {
51                  textTokWriter = new BufferedWriter(new FileWriter(textTok));
52              }
53  
54              String line;
55              while ((line = reader.readLine()) != null) {
56                  line = line.trim();
57  
58                  if (line.length() == 0) {
59                      if (textWriter != null) {
60                          textWriter.write("\n");
61                      }
62                      if (textTokWriter != null) {
63                          textTokWriter.write("\n");
64                      }
65                      writer.write("\n");
66                      continue;
67                  }
68  
69                  String[] parts = line.split("\\s+");
70                  if (parts.length < 2) {
71                      continue;
72                  }
73  
74                  String token = parts[0];
75                  String ner = parts[parts.length - 1];
76                  ner = ner.replaceAll("^[A-Za-z]-", "");
77                  if (!keepGpe && ner.equals("GPE")) {
78                      ner = "LOC";
79                  }
80  
81                  writer.append(token).append("\t").append(ner).append("\n");
82                  if (textWriter != null) {
83                      textWriter.append(token).append(" ");
84                  }
85                  if (textTokWriter != null) {
86                      textTokWriter.append(token).append("\n");
87                  }
88              }
89  
90              if (textWriter != null) {
91                  textWriter.close();
92              }
93              if (textTokWriter != null) {
94                  textTokWriter.close();
95              }
96              reader.close();
97              writer.close();
98          } catch (Exception e) {
99              CommandLine.fail(e);
100         }
101     }
102 }