1   package eu.fbk.dh.tint.simplifier;
2   
3   import edu.stanford.nlp.ling.CoreAnnotations;
4   import edu.stanford.nlp.ling.CoreLabel;
5   import edu.stanford.nlp.ling.IndexedWord;
6   import edu.stanford.nlp.pipeline.Annotation;
7   import edu.stanford.nlp.semgraph.SemanticGraph;
8   import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
9   import edu.stanford.nlp.semgraph.SemanticGraphEdge;
10  import edu.stanford.nlp.util.CoreMap;
11  import eu.fbk.dh.tint.runner.TintPipeline;
12  import eu.fbk.utils.core.PropertiesUtils;
13  import org.slf4j.Logger;
14  import org.slf4j.LoggerFactory;
15  import org.w3c.dom.Document;
16  import org.w3c.dom.Element;
17  import org.w3c.dom.Node;
18  import org.w3c.dom.NodeList;
19  
20  import javax.xml.parsers.DocumentBuilder;
21  import javax.xml.parsers.DocumentBuilderFactory;
22  import javax.xml.xpath.XPath;
23  import javax.xml.xpath.XPathConstants;
24  import javax.xml.xpath.XPathExpression;
25  import javax.xml.xpath.XPathFactory;
26  import java.io.BufferedWriter;
27  import java.io.File;
28  import java.io.FileWriter;
29  import java.util.*;
30  
31  import static eu.fbk.dh.tint.simplifier.Simplifier.*;
32  
33  public class Test {
34  
35      static String PREPOSITION_PREFIX = "E";
36      static String ADVERB_PREFIX = "B";
37      static String SUBJ_RELATION = "nsubj";
38      private static final Logger LOGGER = LoggerFactory.getLogger(Test.class);
39  
40      static Set<String> prepositions = new HashSet<>();
41      static Set<String> agnosticBegins = new HashSet<>();
42  
43      static {
44          prepositions.add("dopo");
45          agnosticBegins.add("sulla base");
46          agnosticBegins.add("solo in caso di");
47      }
48  
49      abstract static class Action {
50  
51          Annotation annotation;
52  
53          abstract String apply(String text, int[] conversionTable);
54  
55  //        protected Integer getValue(Integer id, Map<Integer, Integer> conversionTable) {
56  //            return conversionTable.getOrDefault(id, id);
57  //        }
58  
59          public Action(Annotation annotation) {
60              this.annotation = annotation;
61          }
62      }
63  
64      static class Remove extends Action {
65  
66          Integer originalStart;
67          Integer originalEnd;
68          boolean checkSpaceInside = true;
69  
70          public Remove(Annotation annotation, Integer originalStart, Integer originalEnd) {
71              super(annotation);
72              this.originalStart = originalStart;
73              this.originalEnd = originalEnd;
74          }
75  
76          @Override public String toString() {
77              return "Remove{" +
78                      "originalStart=" + originalStart +
79                      ", originalEnd=" + originalEnd +
80                      '}';
81          }
82  
83          @Override String apply(String text, int[] conversionTable) {
84  //            System.out.println("DELETE");
85  //            System.out.println(text);
86  //            System.out.println(conversionTable);
87  //            System.out.println(originalStart);
88  //            System.out.println(originalEnd);
89              Integer start = conversionTable[originalStart];
90              Integer end = conversionTable[originalEnd];
91              for (int i = originalStart; i < Math.min(originalEnd, conversionTable.length); i++) {
92                  conversionTable[i] = start;
93              }
94              for (int i = originalEnd; i < conversionTable.length; i++) {
95                  conversionTable[i] -= end - start;
96              }
97              StringBuffer buffer = new StringBuffer();
98              buffer.append(text.substring(0, start));
99              buffer.append(text.substring(end));
100             return buffer.toString();
101         }
102     }
103 
104     static class Insert extends Action {
105 
106         Integer originalStart;
107         String textToInsert;
108         boolean checkSpaceBefore = false;
109         boolean checkSpaceAfter = true;
110 
111         public Insert(Annotation annotation, Integer originalStart, String textToInsert) {
112             super(annotation);
113             this.originalStart = originalStart;
114             this.textToInsert = textToInsert;
115         }
116 
117         @Override public String toString() {
118             return "Insert{" +
119                     "originalStart=" + originalStart +
120                     ", textToInsert='" + textToInsert + '\'' +
121                     '}';
122         }
123 
124         @Override String apply(String text, int[] conversionTable) {
125 //            System.out.println("INSERT");
126 //            System.out.println(text);
127 //            System.out.println(conversionTable);
128 //            System.out.println(originalStart);
129 //            System.out.println(textToInsert);
130             StringBuffer buffer = new StringBuffer();
131             Integer start = conversionTable[originalStart];
132             for (int i = start; i < conversionTable.length; i++) {
133                 conversionTable[i] += textToInsert.length();
134             }
135             buffer.append(text.substring(0, start));
136             buffer.append(textToInsert);
137             buffer.append(text.substring(start));
138             return buffer.toString();
139         }
140     }
141 
142     public static String complexString(Annotation annotation) throws Exception {
143         String originalText = annotation.get(CoreAnnotations.TextAnnotation.class);
144         int offset = 0;
145 //        StringBuffer stringBuffer = new StringBuffer();
146 
147         for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
148 
149             // Looking for commas
150             List<Integer> commas = new ArrayList<>();
151             List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
152             for (CoreLabel token : tokens) {
153                 if (token.originalText().equals(",")) {
154                     commas.add(token.index());
155                 }
156             }
157 
158             SemanticGraph semanticGraph = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
159 //            System.out.println(semanticGraph);
160 
161             // Collecting nsubj relations
162             // Warning: the key MUST be lower than the value
163             Map<Integer, Integer> subjects = new HashMap<>();
164             for (SemanticGraphEdge edge : semanticGraph.edgeListSorted()) {
165                 String relation = edge.getRelation().getShortName();
166                 if (!relation.equals(SUBJ_RELATION)) {
167                     continue;
168                 }
169 
170                 int index1 = edge.getDependent().index();
171                 int index2 = edge.getGovernor().index();
172                 subjects.put(Math.min(index1, index2), Math.max(index1, index2));
173             }
174 
175 //                Map<Integer, Integer> removes = new HashMap<>();
176 //                Map<Integer, String> inserts = new HashMap<>();
177 
178             for (int i1 = 0, commasSize1 = commas.size(); i1 < commasSize1 - 1; i1++) {
179                 List<Action> actions = new ArrayList<>();
180 
181                 Integer comma1 = commas.get(i1);
182                 Integer comma2 = commas.get(i1 + 1);
183 
184                 StringBuffer buffer = new StringBuffer();
185                 Integer okStart = 0; // for adverbs
186 
187                 Set<IndexedWord> parents = new HashSet<>();
188                 Set<IndexedWord> children = new HashSet<>();
189                 for (int i = comma1 + 1; i < comma2; i++) {
190                     IndexedWord node = semanticGraph.getNodeByIndex(i);
191                     parents.addAll(getParents(semanticGraph, node));
192                     children.addAll(getChildren(semanticGraph, node));
193                     CoreLabel token = sentence.get(CoreAnnotations.TokensAnnotation.class).get(i - 1);
194                     buffer.append(token.originalText()).append(" ");
195                 }
196                 String sentenceText = buffer.toString().trim();
197 
198                 for (int i = comma1; i <= comma2; i++) {
199                     IndexedWord node = semanticGraph.getNodeByIndex(i);
200                     children.remove(node);
201                     parents.remove(node);
202                 }
203 
204                 // Rule: children inside the set, only one parent outside the set
205                 // Warning: if this rule changes, check the code below
206                 if (parents.size() != 1 || children.size() != 0) {
207                     continue;
208                 }
209 
210                 // Rule: the whole set is included in a subject-verb pair
211                 Integer included = null;
212                 for (Integer key : subjects.keySet()) {
213                     Integer value = subjects.get(key);
214                     if (comma1 > key && comma2 < value) {
215                         included = key;
216                         break;
217                     }
218                 }
219                 if (included == null) {
220                     continue;
221                 }
222 
223                 // Rule: check how the part begins
224                 boolean keepForWord = false;
225                 String okText = sentenceText.substring(okStart).trim();
226                 for (String preposition : prepositions) {
227                     // todo: check end of token
228                     if (okText.toLowerCase().startsWith(preposition.toLowerCase() + " ")) {
229                         keepForWord = true;
230                     }
231                 }
232                 for (String agnosticBegin : agnosticBegins) {
233                     // todo: check end of token
234                     if (okText.toLowerCase().startsWith(agnosticBegin.toLowerCase() + " ")) {
235                         keepForWord = true;
236                     }
237                 }
238 
239                 if (!keepForWord) {
240                     continue;
241                 }
242 
243                 // There should be only one word in parents
244                 IndexedWord parent = parents.iterator().next();
245 
246                 List<IndexedWord> allChildren = getChildrenRecursive(semanticGraph, parent);
247                 if (allChildren.size() == 0) {
248                     continue;
249                 }
250 
251                 Integer firstIndex = -1;
252                 for (IndexedWord child : allChildren) {
253                     int index = child.index();
254                     if (firstIndex == -1 || firstIndex > index) {
255                         firstIndex = index;
256                     }
257                 }
258 
259                 // Need to remove this part
260                 int removeBegin = sentence.get(CoreAnnotations.TokensAnnotation.class).get(comma1 - 1).beginPosition();
261                 int removeEnd = sentence.get(CoreAnnotations.TokensAnnotation.class).get(comma2 - 1).endPosition();
262                 actions.add(new Remove(annotation, removeBegin, removeEnd));
263 
264                 // Need to add this part
265                 int insertBegin = sentence.get(CoreAnnotations.TokensAnnotation.class).get(firstIndex - 1).beginPosition();
266                 actions.add(new Insert(annotation, insertBegin, sentenceText + ", "));
267 
268 //                    System.out.println(buffer.toString());
269 //                    System.out.println(firstIndex);
270 //
271 //                    System.out.println(allChildren);
272 //                    System.out.println(comma1);
273 //                    System.out.println(comma2);
274 //                    System.out.println(parents);
275 //                    System.out.println(children);
276 //                    System.out.println(included);
277 
278                 String text = sentence.get(CoreAnnotations.TextAnnotation.class);
279                 int[] conversionTable = new int[text.length()];
280                 for (int i = 0; i < text.length(); i++) {
281                     conversionTable[i] = i;
282                 }
283                 for (Action action : actions) {
284                     text = action.apply(text, conversionTable);
285                 }
286 
287                 Integer begin = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
288                 Integer end = sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
289 
290                 StringBuffer stringBuffer = new StringBuffer();
291                 stringBuffer.append(originalText.substring(0, begin + offset));
292                 stringBuffer.append(text);
293                 stringBuffer.append(originalText.substring(end + offset));
294                 originalText = stringBuffer.toString().trim();
295 
296                 offset += text.length() - (end - begin);
297 
298 //                System.out.println(sentenceText);
299 //                System.out.println(text);
300 //                System.out.println(Arrays.toString(conversionTable));
301 
302             }
303 
304         }
305 
306         return originalText;
307     }
308 
309     public static void main(String[] args) {
310 
311 //        String inputFile = "/Users/alessio/Documents/SIMPATICO/sintattico/simpitiki-syntax.xml";
312 //        String outFile = "/Users/alessio/Documents/SIMPATICO/sintattico/simpitiki-sentences.txt";
313 
314         String inputFile = args[0];
315         String outFile = args[1];
316 
317         /*
318 
319         Before running:
320         - create XML adding <simplification> and <before> tags
321         - sed -ie 's~&~\&amp;~g' [file]
322         - perl -CSDA -pe's/[^\x9\xA\xD\x20-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]+//g;' [source] > [dest]
323 
324         Next commands:
325         - transform for Lex -> cut -f4 [origin] > [destination]
326         - run the Python stuff
327         - paste the files -> paste [destination] [destination-parsed] > [final-file]
328          */
329 
330         try {
331 
332             BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
333 
334             DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
335             DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
336             XPathFactory xPathfactory = XPathFactory.newInstance();
337             XPath xpath = xPathfactory.newXPath();
338             XPathExpression expr;
339             NodeList nl;
340             Document doc = dBuilder.parse(new File(inputFile));
341             int totalSimplified = 0;
342             int totalSentences = 0;
343 
344             TintPipeline pipeline = new TintPipeline();
345             pipeline.loadDefaultProperties();
346             pipeline.setProperty("annotators", "ita_toksent, pos, ita_morpho, ita_lemma, depparse, fake_dep");
347 
348             pipeline.load();
349 
350             expr = xpath.compile("/root/simplification");
351             nl = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
352             for (int i = 0; i < nl.getLength(); i++) {
353                 totalSentences++;
354                 Node item = nl.item(i);
355                 Element element = (Element) item;
356 
357                 String tbsTemp = element.getAttribute("toBeSimplified");
358                 if (tbsTemp == null || tbsTemp.length() == 0) {
359                     tbsTemp = "1";
360                 }
361                 boolean toBeSimplified = PropertiesUtils.getBoolean(tbsTemp, true);
362 
363                 expr = xpath.compile("before");
364                 NodeList beforeList = (NodeList) expr.evaluate(item, XPathConstants.NODESET);
365 //                expr = xpath.compile("after");
366 //                NodeList afterList = (NodeList) expr.evaluate(item, XPathConstants.NODESET);
367 
368                 Node before = beforeList.item(0);
369 //                Node after = afterList.item(0);
370 
371                 String text1 = before.getTextContent();
372 //                String text2 = after.getTextContent();
373 
374                 Annotation annotation = pipeline.runRaw(text1);
375 
376 //                Map<Integer, HashMultimap<Integer, Integer>> children = new HashMap<>();
377 //                List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
378 //                for (int sentIndex = 0; sentIndex < sentences.size(); sentIndex++) {
379 //                    CoreMap sentence = sentences.get(sentIndex);
380 //
381 //                    children.put(sentIndex, HashMultimap.create());
382 //
383 //                    SemanticGraph semanticGraph = sentence
384 //                            .get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
385 //                    Collection<IndexedWord> rootNodes = semanticGraph.getRoots();
386 //                    if (rootNodes.isEmpty()) {
387 //                        continue;
388 //                    }
389 //
390 //                    for (IndexedWord root : rootNodes) {
391 //                        Set<Integer> stack = new HashSet<>();
392 //                        Set<IndexedWord> used = new HashSet<>();
393 //                        addChildren(children.get(sentIndex), stack, root, semanticGraph, used);
394 //                    }
395 //                }
396 
397                 String originalText = annotation.get(CoreAnnotations.TextAnnotation.class);
398                 writer.append(toBeSimplified ? "1" : "0");
399                 writer.append("\t");
400                 writer.append(originalText.trim());
401                 writer.append("\t");
402 
403 //                SimplificationRule rule;
404 //                String output;
405 
406                 String simplifiedText = originalText;
407 
408                 simplifiedText = complexString(annotation);
409                 simplifiedText = simplifiedText.replaceAll(", eppure", ". Eppure");
410                 simplifiedText = simplifiedText.replaceAll(", tuttavia", ". Tuttavia");
411 
412                 int offset = 0;
413                 for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
414                     Integer sentenceOffset = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
415                     SemanticGraph semanticGraph = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
416                     Collection<IndexedWord> rootNodes = semanticGraph.getRoots();
417                     if (rootNodes.size() != 1) {
418                         continue;
419                     }
420 
421                     List<Action> actions = new ArrayList<>();
422 
423                     IndexedWord rootNode = rootNodes.iterator().next();
424                     List<SemanticGraphEdge> outEdgesSorted = semanticGraph.getOutEdgesSorted(rootNode);
425                     List<IndexedWord> underRoot = new ArrayList<>();
426                     List<Action> tmpActions = null;
427                     for (SemanticGraphEdge semanticGraphEdge : outEdgesSorted) {
428 
429                         IndexedWord dependent = semanticGraphEdge.getDependent();
430                         int depIndex = dependent.index();
431                         String depText = dependent.originalText().toLowerCase();
432 
433                         Integer begin = dependent.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) - sentenceOffset;
434                         Integer end = dependent.get(CoreAnnotations.CharacterOffsetEndAnnotation.class) - sentenceOffset;
435                         if (semanticGraphEdge.getRelation().getShortName().equals("cc")) {
436                             if (depText.equals("e") || depText.equals("ed")) {
437                                 tmpActions = new ArrayList<>();
438                                 tmpActions.add(new Remove(annotation, begin, end));
439                                 tmpActions.add(new Insert(annotation, begin, ". "));
440                                 if (depIndex > 1) {
441                                     CoreLabel previousToken = sentence.get(CoreAnnotations.TokensAnnotation.class).get(depIndex - 2);
442                                     if (previousToken.originalText().equals(",")) {
443                                         Integer pbegin = previousToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) - sentenceOffset;
444                                         Integer pend = previousToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class) - sentenceOffset;
445                                         tmpActions.add(new Remove(annotation, pbegin, pend));
446                                     }
447                                 }
448                             }
449                             if (depText.equals("ma")) {
450                                 tmpActions = new ArrayList<>();
451                                 tmpActions.add(new Remove(annotation, begin, end));
452                                 tmpActions.add(new Insert(annotation, begin, ". PerĂ²"));
453                                 if (depIndex > 1) {
454                                     CoreLabel previousToken = sentence.get(CoreAnnotations.TokensAnnotation.class).get(depIndex - 2);
455                                     if (previousToken.originalText().equals(",")) {
456                                         Integer pbegin = previousToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) - sentenceOffset;
457                                         Integer pend = previousToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class) - sentenceOffset;
458                                         tmpActions.add(new Remove(annotation, pbegin, pend));
459                                     }
460                                 }
461                             }
462                         } else {
463                             if (tmpActions != null && dependent.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("V")) {
464                                 actions.addAll(tmpActions);
465                             }
466                             tmpActions = null;
467                         }
468 //                        System.out.println(semanticGraphEdge);
469                         underRoot.add(dependent);
470                     }
471 
472                     if (actions.size() == 0) {
473                         continue;
474                     }
475 
476                     String text = sentence.get(CoreAnnotations.TextAnnotation.class);
477 //                    String text = sText;
478                     int[] conversionTable = new int[text.length()];
479                     for (int j = 0; j < text.length(); j++) {
480                         conversionTable[j] = j;
481                     }
482                     for (Action action : actions) {
483                         text = action.apply(text, conversionTable);
484                     }
485 
486 //                    if (!text.equals(sText)) {
487 //                        System.out.println(text);
488 //                        System.out.println(sText);
489 //                    }
490 //                    System.out.println();
491 
492                     Integer begin = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
493                     Integer end = sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
494 
495                     StringBuffer stringBuffer = new StringBuffer();
496                     stringBuffer.append(simplifiedText.substring(0, begin));
497                     stringBuffer.append(text);
498                     stringBuffer.append(simplifiedText.substring(end));
499                     simplifiedText = stringBuffer.toString().trim();
500 
501                     offset += text.length() - (end - begin);
502                 }
503 
504                 boolean hasBeenSimplified = false;
505                 if (!simplifiedText.equals(originalText)) {
506                     hasBeenSimplified = true;
507                     totalSimplified++;
508                 }
509 
510                 writer.append(hasBeenSimplified ? "1" : "0");
511                 writer.append("\t");
512                 writer.append(simplifiedText.trim());
513                 writer.append("\n");
514 
515 //                System.out.println(originalText);
516 //                System.out.println(simplifiedText);
517 //                System.out.println();
518 
519 //                rule = new DenominatiSplittingRule();
520 //                output = rule.apply(annotation, children);
521 //
522 //                System.out.println(output);
523 //
524 //                rule = new GarantendoSplittingRule();
525 //                output = rule.apply(annotation, children);
526 //
527 //                System.out.println(output);
528 //
529 //                rule = new GarantendoSplittingRule();
530 //                output = rule.apply(annotation, children);
531 //
532 //                System.out.println(text1);
533 //                System.out.println(simplifiedText);
534 //                System.out.println(text2);
535 //                System.out.println();
536 //
537 //                System.exit(1);
538             }
539 
540             System.out.println(totalSimplified);
541             System.out.println(totalSentences);
542 //            pipeline.run(sentence, System.out, TintRunner.OutputFormat.JSON);
543 
544             writer.close();
545         } catch (Exception e) {
546             e.printStackTrace();
547         }
548     }
549 }