1   package eu.fbk.dh.tint.resources.pos;
2   
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
11  
12  /**
13   * Created by alessio on 03/05/16.
14   */
15  
16  public class CreateTrainingForStanfordPOS2 {
17  
18      private static final Logger LOGGER = LoggerFactory.getLogger(CreateTrainingForStanfordPOS2.class);
19  
20      /*
21      Tag set: [FF, DD, PP, A, NO, DE, PQ, PR, B, B+PC, E, DI, I, VA+PC, BN, FS, DQ, PC+PC, N, DR, S, T, .$$., V, E+RD, VM+PC, X, SP, CC, SW, V+PC+PC, VA, AP, V+PC, CS, RD, PC, PD, PE, RI, VM, PI, FB, VM+PC+PC, FC]
22      */
23  
24      public static void main(String[] args) {
25  //        String input = args[0];
26  //        String output = args[1];
27  
28          String input = "/Users/alessio/Documents/Resources/universal_treebanks_v2.0/std/it/it-universal-test.conll";
29          String output = "/Users/alessio/Documents/Resources/universal_treebanks_v2.0/std/it/it-universal-test.conll.stanford";
30  
31          try {
32              BufferedWriter writer = new BufferedWriter(new FileWriter(output));
33  
34              List<String> lines = Files.readAllLines((new File(input)).toPath());
35              StringBuffer lineBuffer = new StringBuffer();
36              for (String line : lines) {
37                  String[] parts = line.split("\\s+");
38                  if (parts.length < 5) {
39                      writer.append(lineBuffer.toString().trim());
40                      writer.append("\n");
41                      lineBuffer = new StringBuffer();
42                      continue;
43                  }
44                  String token = parts[1];
45                  String pos = parts[3];
46                  if (!parts[4].equals(parts[3])) {
47                      switch (parts[4]) {
48                      case "AUX":
49                          pos += "-AUX";
50                          break;
51                      case "PNOUN":
52                          pos = parts[4];
53                          break;
54                      default:
55                          LOGGER.error("Error in POS: {}", parts[4]);
56                      }
57                  }
58  
59                  if (token.equals("_")) {
60                      LOGGER.error("Error in token {}", token);
61                      continue;
62                  }
63  
64                  StringBuffer buffer = new StringBuffer();
65                  buffer.append(token);
66                  buffer.append("_");
67                  buffer.append(pos);
68                  buffer.append(" ");
69                  lineBuffer.append(buffer.toString());
70              }
71  
72              writer.append(lineBuffer.toString().trim());
73              writer.append("\n");
74  
75              writer.close();
76  
77          } catch (Exception e) {
78              e.printStackTrace();
79          }
80      }
81  }