1   package eu.fbk.dh.tint.resources.parse;
2   
3   import eu.fbk.utils.core.CommandLine;
4   import org.slf4j.Logger;
5   import org.slf4j.LoggerFactory;
6   
7   import java.io.BufferedWriter;
8   import java.io.File;
9   import java.io.FileWriter;
10  import java.io.IOException;
11  import java.nio.file.Files;
12  import java.util.ArrayList;
13  import java.util.HashMap;
14  import java.util.HashSet;
15  import java.util.List;
16  import java.util.regex.Matcher;
17  import java.util.regex.Pattern;
18  
19  /**
20   * Created by alessio on 03/05/16.
21   */
22  
23  public class CreateTrainingForStanfordParser {
24  
25      private static final Logger LOGGER = LoggerFactory.getLogger(CreateTrainingForStanfordParser.class);
26      private static final int DEFAULT_POS_COL = 3;
27  
28      public static void main(String[] args) {
29  
30          try {
31  
32              final CommandLine cmd = CommandLine
33                      .parser()
34                      .withName("./create-parse-training")
35                      .withHeader("Create training for Stanford Parser")
36                      .withOption("i", "input", "Input file", "FILE",
37                              CommandLine.Type.FILE_EXISTING, true, false, true)
38                      .withOption("o", "output", "Output file", "FILE",
39                              CommandLine.Type.FILE_EXISTING, true, false, true)
40                      .withOption(null, "column", String.format("Column for POS (default %d)", DEFAULT_POS_COL), "NUM",
41                              CommandLine.Type.INTEGER, true, false, false)
42                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
43  
44              File input = cmd.getOptionValue("input", File.class);
45              File output = cmd.getOptionValue("output", File.class);
46              Integer column = cmd.getOptionValue("column", Integer.class, DEFAULT_POS_COL);
47  
48              BufferedWriter writer = new BufferedWriter(new FileWriter(output));
49  
50              List<String> lines = Files.readAllLines(input.toPath());
51  
52              ArrayList<HashMap<String, Object>> sentence = new ArrayList<>();
53              HashMap<Integer, Integer> sentenceOffsets = new HashMap<>();
54  
55              String multiToken = null;
56              StringBuffer multiPos = new StringBuffer();
57              String multiParseLabel = null;
58              Integer multiParseParent = null;
59  
60              Pattern fromPattern = Pattern.compile("^([0-9]+)");
61              Pattern endPattern = Pattern.compile("([0-9]+)$");
62              Integer from = null;
63              Integer end = null;
64              HashSet<Integer> internals = new HashSet<>();
65              Integer offset = 0;
66  
67              for (String line : lines) {
68                  line = line.trim();
69  
70                  if (line.startsWith("#")) {
71                      continue;
72                  }
73  
74                  if (line.length() == 0) {
75                      writeSentence(sentence, sentenceOffsets, writer);
76                      sentence = new ArrayList<>();
77                      sentenceOffsets = new HashMap<>();
78  //                    writer.append("\n");
79                      offset = 0;
80                      continue;
81                  }
82  
83                  String[] parts = line.split("\\s+");
84  
85                  String id = parts[0];
86                  String token = parts[1];
87                  String lemma = parts[2];
88                  String pos = parts[column];
89  
90                  Integer parseParent = null;
91                  try {
92                      parseParent = Integer.parseInt(parts[6]);
93                  } catch (Exception e) {
94                      // ignored
95                  }
96                  String parseLabel = parts[7];
97  
98                  Integer numericId = null;
99  
100                 if (id.contains("-")) {
101                     multiToken = token;
102                     multiPos = new StringBuffer();
103 
104                     Matcher matcher;
105 
106                     matcher = fromPattern.matcher(id);
107                     if (matcher.find()) {
108                         from = Integer.parseInt(matcher.group(1));
109                     }
110                     matcher = endPattern.matcher(id);
111                     if (matcher.find()) {
112                         end = Integer.parseInt(matcher.group(1));
113                     }
114 
115                     for (int i = from; i <= end; i++) {
116                         internals.add(i);
117                     }
118 
119                     continue;
120                 }
121 
122                 numericId = Integer.parseInt(id);
123                 if (end != null && from != null) {
124                     if (numericId <= end || numericId >= from) {
125                         if (multiPos.length() > 0) {
126                             multiPos.append("+");
127                         }
128                         multiPos.append(pos);
129                         if (!internals.contains(parseParent) && !parseLabel.equals("det")) {
130                             multiParseLabel = parseLabel;
131                             multiParseParent = parseParent;
132                         }
133                     }
134 
135                     sentenceOffsets.put(numericId, offset + numericId - from);
136 
137                     if (numericId.equals(end)) {
138                         HashMap<String, Object> thisToken = new HashMap<>();
139                         thisToken.put("id", from);
140                         thisToken.put("form", multiToken);
141                         thisToken.put("lemma", multiToken);
142                         thisToken.put("pos", multiPos);
143                         thisToken.put("parseParent", multiParseParent);
144                         thisToken.put("parseLabel", multiParseLabel);
145                         sentence.add(thisToken);
146                         sentenceOffsets.put(from, offset);
147 
148                         multiPos = new StringBuffer();
149                         multiToken = null;
150                         offset += end - from;
151                         end = null;
152                         from = null;
153                         internals = new HashSet<>();
154 
155                     }
156 
157                     continue;
158                 }
159 
160                 if (token.equals("_")) {
161                     LOGGER.error("Error in token {}", token);
162                     continue;
163                 }
164 
165                 HashMap<String, Object> thisToken = new HashMap<>();
166                 thisToken.put("id", Integer.parseInt(id));
167                 thisToken.put("form", token);
168                 thisToken.put("lemma", lemma);
169                 thisToken.put("pos", pos);
170                 thisToken.put("parseParent", parseParent);
171                 thisToken.put("parseLabel", parseLabel);
172                 sentence.add(thisToken);
173                 sentenceOffsets.put(Integer.parseInt(id), offset);
174 
175             }
176 
177             writeSentence(sentence, sentenceOffsets, writer);
178 //            sentence = new ArrayList<>();
179 //            sentenceOffsets = new HashMap<>();
180 
181             writer.close();
182 
183         } catch (Exception e) {
184             CommandLine.fail(e);
185         }
186     }
187 
188     private static void writeSentence(ArrayList<HashMap<String, Object>> sentence,
189             HashMap<Integer, Integer> sentenceOffsets, BufferedWriter writer) throws IOException {
190         if (sentence.size() == 0) {
191             return;
192         }
193 
194         for (HashMap<String, Object> map : sentence) {
195             int id = (int) map.get("id");
196             id -= sentenceOffsets.get(id);
197 
198             int parseParent = (int) map.get("parseParent");
199             if (parseParent != 0) {
200                 try {
201                     parseParent -= sentenceOffsets.get(parseParent);
202                 } catch (Exception e) {
203                     // Fix for UD, bad!
204                     LOGGER.warn("Fix for token _");
205 
206 //                    System.out.println(parseParent);
207 //                    System.out.println(sentence);
208 //                    System.out.println(sentenceOffsets);
209 
210                     parseParent = 11;
211                     parseParent += sentenceOffsets.get(parseParent);
212                 }
213             }
214 
215             writer.append(Integer.toString(id)).append("\t");
216             writer.append(map.get("form").toString()).append("\t");
217             writer.append(map.get("lemma").toString()).append("\t");
218             writer.append(map.get("pos").toString()).append("\t");
219             writer.append(map.get("pos").toString()).append("\t");
220             writer.append("_").append("\t");
221             writer.append(Integer.toString(parseParent)).append("\t");
222             writer.append(map.get("parseLabel").toString()).append("\t");
223             writer.append(Integer.toString(parseParent)).append("\t");
224             writer.append(map.get("parseLabel").toString()).append("\n");
225         }
226 
227         writer.append("\n");
228     }
229 }