1   package eu.fbk.dh.tint.resources.pos;
2   
import eu.fbk.utils.core.CommandLine;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
14  
15  /**
16   * Created by alessio on 03/05/16.
17   */
18  
19  public class CreateTrainingForStanfordPOS {
20  
21      private static final Logger LOGGER = LoggerFactory.getLogger(CreateTrainingForStanfordPOS.class);
22      private static final int DEFAULT_COL = 3;
23  
24      public static void main(String[] args) {
25  
26          try {
27              final CommandLine cmd = CommandLine
28                      .parser()
29                      .withName("./create-pos-training")
30                      .withHeader("Create training for Stanford POS tagger")
31                      .withOption("i", "input", "Input file", "FILE",
32                              CommandLine.Type.FILE_EXISTING, true, false, true)
33                      .withOption("o", "output", "Output file", "FILE",
34                              CommandLine.Type.FILE_EXISTING, true, false, true)
35                      .withOption("t", "only-tokens", "Output file for tokens", "FILE",
36                              CommandLine.Type.FILE_EXISTING, true, false, false)
37                      .withOption("p", "only-pos", "Output file for pos", "FILE",
38                              CommandLine.Type.FILE_EXISTING, true, false, false)
39                      .withOption("x", "text", "Output text", "FILE",
40                              CommandLine.Type.FILE_EXISTING, true, false, false)
41                      .withOption("c", "conll", "Output in CoNLL format", "FILE",
42                              CommandLine.Type.FILE_EXISTING, true, false, false)
43                      .withOption(null, "column", String.format("Column for POS (default %d)", DEFAULT_COL), "NUM",
44                              CommandLine.Type.INTEGER, true, false, false)
45                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
46  
47              File input = cmd.getOptionValue("input", File.class);
48              File output = cmd.getOptionValue("output", File.class);
49              File onlyTokens = cmd.getOptionValue("only-tokens", File.class);
50              File onlyPos = cmd.getOptionValue("only-pos", File.class);
51              File onlyText = cmd.getOptionValue("text", File.class);
52              File conll = cmd.getOptionValue("conll", File.class);
53  
54              Integer column = cmd.getOptionValue("column", Integer.class, DEFAULT_COL);
55  
56              BufferedWriter writer = new BufferedWriter(new FileWriter(output));
57              BufferedWriter tokensWriter = null;
58              BufferedWriter posWriter = null;
59              BufferedWriter textWriter = null;
60              BufferedWriter conllWriter = null;
61  
62              if (onlyTokens != null) {
63                  tokensWriter = new BufferedWriter(new FileWriter(onlyTokens));
64              }
65              if (onlyPos != null) {
66                  posWriter = new BufferedWriter(new FileWriter(onlyPos));
67              }
68              if (onlyText != null) {
69                  textWriter = new BufferedWriter(new FileWriter(onlyText));
70              }
71              if (conll != null) {
72                  conllWriter = new BufferedWriter(new FileWriter(conll));
73              }
74  
75              List<String> lines = Files.readAllLines(input.toPath());
76              StringBuffer lineBuffer = new StringBuffer();
77  
78              String multiToken = null;
79              String multiLemma = null;
80              StringBuffer multiPos = new StringBuffer();
81              Pattern fromPattern = Pattern.compile("^([0-9]+)");
82              Pattern endPattern = Pattern.compile("([0-9]+)$");
83              Integer from = null;
84              Integer end = null;
85  
86              for (String line : lines) {
87                  line = line.trim();
88  
89                  if (line.startsWith("#")) {
90                      continue;
91                  }
92  
93                  if (line.length() == 0) {
94                      writer.append(lineBuffer.toString().trim());
95                      writer.append("\n");
96                      lineBuffer = new StringBuffer();
97  
98                      if (tokensWriter != null) {
99                          tokensWriter.append("<eos>\n");
100                     }
101                     if (posWriter != null) {
102                         posWriter.append("<eos>\n");
103                     }
104                     if (textWriter != null) {
105                         textWriter.append("\n");
106                     }
107                     if (conllWriter != null) {
108                         conllWriter.append("\n");
109                     }
110 
111                     continue;
112                 }
113 
114                 String[] parts = line.split("\\s+");
115 
116                 String id = parts[0];
117                 String token = parts[1];
118                 String lemma = parts[2];
119                 String pos;
120                 try {
121                     pos = parts[column];
122                 } catch (Exception e) {
123                     LOGGER.error("Invalid column");
124                     break;
125                 }
126                 Integer numericId = null;
127 
128                 if (id.contains("-")) {
129                     multiToken = token;
130                     multiLemma = lemma;
131                     multiPos = new StringBuffer();
132                     Matcher matcher;
133 
134                     matcher = fromPattern.matcher(id);
135                     if (matcher.find()) {
136                         from = Integer.parseInt(matcher.group(1));
137                     }
138                     matcher = endPattern.matcher(id);
139                     if (matcher.find()) {
140                         end = Integer.parseInt(matcher.group(1));
141                     }
142 
143                     continue;
144                 }
145 
146                 numericId = Integer.parseInt(id);
147                 if (end != null && from != null) {
148                     if (numericId <= end || numericId >= from) {
149                         if (multiPos.length() > 0) {
150                             multiPos.append("+");
151                         }
152                         multiPos.append(pos);
153                     }
154 
155                     if (numericId.equals(end)) {
156                         StringBuilder buffer = new StringBuilder();
157                         buffer.append(multiToken);
158                         buffer.append("_");
159                         buffer.append(multiPos.toString());
160                         buffer.append(" ");
161                         lineBuffer.append(buffer.toString());
162                         if (tokensWriter != null) {
163                             tokensWriter.append(multiToken).append("\n");
164                         }
165                         if (posWriter != null) {
166                             posWriter.append(multiPos).append("\n");
167                         }
168                         if (textWriter != null) {
169                             textWriter.append(multiToken).append(" ");
170                         }
171                         if (conllWriter != null) {
172                             conllWriter.append(multiToken).append("\t")
173                                     .append(multiLemma).append("\t")
174                                     .append(multiPos).append("\n");
175                         }
176 
177                         multiPos = new StringBuffer();
178                         multiToken = null;
179                         multiLemma = null;
180                         end = null;
181                         from = null;
182                     }
183 
184                     continue;
185                 }
186 
187                 if (token.equals("_")) {
188                     LOGGER.error("Error in token {}", token);
189                     continue;
190                 }
191 
192                 StringBuffer buffer = new StringBuffer();
193                 buffer.append(token);
194                 buffer.append("_");
195                 buffer.append(pos);
196                 buffer.append(" ");
197                 lineBuffer.append(buffer.toString());
198 
199                 if (tokensWriter != null) {
200                     tokensWriter.append(token).append("\n");
201                 }
202                 if (posWriter != null) {
203                     posWriter.append(pos).append("\n");
204                 }
205                 if (textWriter != null) {
206                     textWriter.append(token).append(" ");
207                 }
208                 if (conllWriter != null) {
209                     conllWriter.append(token).append("\t")
210                             .append(lemma).append("\t")
211                             .append(pos).append("\n");
212                 }
213 
214             }
215 
216             writer.append(lineBuffer.toString().trim());
217             writer.append("\n");
218             if (tokensWriter != null) {
219                 tokensWriter.append("\n");
220             }
221             if (posWriter != null) {
222                 posWriter.append("\n");
223             }
224             if (textWriter != null) {
225                 textWriter.append("\n");
226             }
227             if (conllWriter != null) {
228                 conllWriter.append("\n");
229             }
230 
231             writer.close();
232             if (tokensWriter != null) {
233                 tokensWriter.close();
234             }
235             if (posWriter != null) {
236                 posWriter.close();
237             }
238             if (textWriter != null) {
239                 textWriter.close();
240             }
241             if (conllWriter != null) {
242                 conllWriter.close();
243             }
244 
245         } catch (Exception e) {
246             CommandLine.fail(e);
247         }
248     }
249 }