1   package eu.fbk.dh.tint.eval.ner;
2   
import edu.stanford.nlp.stats.MultiClassChunkEvalStats;
import eu.fbk.utils.core.CommandLine;
import eu.fbk.utils.core.diff_match_patch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
13  
14  /**
15   * Created by alessio on 20/07/16.
16   */
17  
18  public class TanlEvaluation {
19  
20      private static final Logger LOGGER = LoggerFactory.getLogger(TanlEvaluation.class);
21  
22      public static void main(String[] args) {
23          try {
24              final CommandLine cmd = CommandLine
25                      .parser()
26                      .withName("./evaluate-tanl")
27                      .withHeader("Calculate NER evaluation for Tanl")
28                      .withOption("t", "tanl", "Input file from Tanl", "FILE",
29                              CommandLine.Type.FILE_EXISTING, true, false, true)
30                      .withOption("g", "gold-standard", "Input gold standard file", "FILE",
31                              CommandLine.Type.FILE, true, false, true)
32                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
33  
34              File tanl = cmd.getOptionValue("tanl", File.class);
35              File gold = cmd.getOptionValue("gold-standard", File.class);
36  
37              LinkedHashMap<Integer, String> goldLabels = new LinkedHashMap<>();
38              LinkedHashMap<Integer, String> tanlLabels = new LinkedHashMap<>();
39              HashMap<Integer, Integer> indexMap = new HashMap<>();
40  
41              BufferedReader tReader = new BufferedReader(new FileReader(tanl));
42              BufferedReader gReader = new BufferedReader(new FileReader(gold));
43  
44              String t, g;
45  
46              String line;
47              StringBuilder builder;
48  
49              builder = new StringBuilder();
50              while ((line = tReader.readLine()) != null) {
51                  line = line.trim();
52                  String[] parts = line.split("\t");
53                  if (parts.length < 3) {
54                      continue;
55                  }
56                  String token = parts[0];
57                  String ner = parts[2];
58                  ner = ner.replaceAll("^[A-Za-z]-", "");
59                  if (ner.equals("GPE")) {
60                      ner = "LOC";
61                  }
62                  tanlLabels.put(builder.length(), ner);
63                  builder.append(token);
64              }
65              t = builder.toString();
66  
67              builder = new StringBuilder();
68              while ((line = gReader.readLine()) != null) {
69                  line = line.trim();
70                  String[] parts = line.split("\t");
71                  if (parts.length < 2) {
72                      continue;
73                  }
74                  String token = parts[0];
75                  String ner = parts[1];
76                  goldLabels.put(builder.length(), ner);
77                  builder.append(token);
78              }
79              g = builder.toString();
80  
81              diff_match_patch diffMatchPatch = new diff_match_patch();
82              LinkedList<diff_match_patch.Diff> diffs = diffMatchPatch.diff_main(t, g);
83              diffMatchPatch.diff_cleanupSemanticLossless(diffs);
84  
85              int goldIndex = 0;
86              int tanlIndex = 0;
87              for (diff_match_patch.Diff diff : diffs) {
88                  String text = diff.text;
89                  switch (diff.operation) {
90                  case INSERT:
91                      for (int i = 0; i < text.length(); i++) {
92                          indexMap.put(goldIndex, tanlIndex);
93                          goldIndex++;
94                      }
95                      break;
96                  case DELETE:
97                      tanlIndex += text.length();
98                      break;
99                  case EQUAL:
100                     for (int i = 0; i < text.length(); i++) {
101                         indexMap.put(goldIndex, tanlIndex);
102                         goldIndex++;
103                         tanlIndex++;
104                     }
105                     break;
106                 }
107             }
108 
109             List<String> guesses = new ArrayList<>();
110             List<String> trueLabels = new ArrayList<>();
111 
112             for (Integer key : goldLabels.keySet()) {
113                 String label = goldLabels.get(key);
114                 String ner = "O";
115                 Integer mappedIndex = indexMap.get(key);
116                 if (mappedIndex != null) {
117                     ner = tanlLabels.get(mappedIndex);
118                 }
119                 if (ner == null) {
120                     ner = "O";
121                 }
122 
123                 trueLabels.add(label);
124                 guesses.add(ner);
125             }
126 
127             MultiClassChunkEvalStats stats = new MultiClassChunkEvalStats("O");
128             stats.score(guesses, trueLabels);
129             System.out.println(stats.getConllEvalString());
130 
131             tReader.close();
132             gReader.close();
133         } catch (Exception e) {
134             CommandLine.fail(e);
135         }
136     }
137 }