1   package eu.fbk.dh.tint.simplifier.rules;
2   
3   import com.google.common.collect.HashMultimap;
4   import edu.stanford.nlp.ling.CoreAnnotations;
5   import edu.stanford.nlp.ling.CoreLabel;
6   import edu.stanford.nlp.pipeline.Annotation;
7   import edu.stanford.nlp.util.CoreMap;
8   
9   import java.util.*;
10  import java.util.regex.Matcher;
11  import java.util.regex.Pattern;
12  
13  /**
14   * Created by alessio on 15/02/17.
15   */
16  
17  public abstract class SimpleSplittingRule implements SimplificationRule {
18  
19      private List<String> words = new ArrayList<>();
20      private List<Pattern> patterns = new ArrayList<>();
21      private Map<Integer, String> replacements = new HashMap<>();
22      private int head = 1;
23      private boolean useRegex = false;
24  
25      private static Pattern replacementPattern = Pattern.compile("\\$([0-9]+)");
26  
27  //    public SimpleSplittingRule(List<String> words, Map<Integer, String> replacements, int head) {
28  //        this.words = words;
29  //        this.replacements = replacements;
30  //        this.head = head;
31  //    }
32  
33      public void setWords(List<String> words) {
34          this.words = words;
35      }
36  
37      public void setReplacements(Map<Integer, String> replacements) {
38          this.replacements = replacements;
39      }
40  
41      public void setHead(int head) {
42          this.head = head;
43      }
44  
45      public void setUseRegex(boolean useRegex) {
46          this.useRegex = useRegex;
47      }
48      //    public static void main(String[] args) {
49  //        words.add(",");
50  //        words.add("precisando");
51  //        words.add("che");
52  //        replacements.put(0, "");
53  //        replacements.put(1, "Si precisa");
54  //
55  //    }
56  
57      @Override public String apply(Annotation annotation, Map<Integer, HashMultimap<Integer, Integer>> children) {
58  
59          if (useRegex) {
60              for (String word : words) {
61                  patterns.add(Pattern.compile(word));
62              }
63          }
64  
65          StringBuffer ret = new StringBuffer();
66  
67          List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
68          for (int sentIndex = 0; sentIndex < sentences.size(); sentIndex++) {
69              CoreMap sentence = sentences.get(sentIndex);
70  
71              List<Matcher> matchers = new ArrayList<>();
72              List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
73              Integer foundHead = null;
74              for (int i = 0; i < tokens.size() - (words.size() - 1); i++) {
75  
76                  matchers = new ArrayList<>();
77  
78                  boolean equals = true;
79                  for (int j = 0; j < words.size(); j++) {
80                      CoreLabel token = tokens.get(i + j);
81                      if (useRegex) {
82                          matchers.add(patterns.get(j).matcher(token.originalText().toLowerCase()));
83                      } else if (!token.originalText().toLowerCase().equals(words.get(j))) {
84                          equals = false;
85                      }
86                  }
87  
88                  if (useRegex) {
89                      for (Matcher matcher : matchers) {
90                          if (!matcher.find()) {
91                              equals = false;
92                          }
93                      }
94  
95                  }
96  
97                  if (equals) {
98                      foundHead = i + head;
99  //                    System.out.println("Beccato! " + tokens.get(foundHead));
100                     foundHead++; // indexes start from 1
101                     break;
102                 }
103             }
104 
105             if (foundHead != null) {
106                 List<String> groups = new ArrayList<>();
107                 if (useRegex) {
108                     for (Matcher matcher : matchers) {
109                         for (int i = 0; i < matcher.groupCount(); i++) {
110                             groups.add(matcher.group(i + 1));
111                         }
112                     }
113                 }
114 
115                 StringBuffer oldSentence = new StringBuffer();
116                 StringBuffer newSentence = new StringBuffer();
117 
118 //                    Set<Integer> tokensToTheOldSentence = new HashSet<>();
119                 Set<Integer> tokensToTheNewSentence = new HashSet<>();
120 //                    Set<Integer> partsToTheOldSentence = new HashSet<>();
121 //                    Set<Integer> partsToTheNewSentence = new HashSet<>();
122 
123                 tokensToTheNewSentence.add(foundHead);
124                 tokensToTheNewSentence.addAll(children.get(sentIndex).get(foundHead));
125 
126 //                System.out.println(foundHead);
127 //                System.out.println(tokensToTheNewSentence);
128 
129 //                    for (int i = 0; i < replacements.size(); i++) {
130 //                        int thisID = foundHead - head + i;
131 //                        if (tokensToTheNewSentence.contains(thisID)) {
132 //                            partsToTheNewSentence.add()
133 //                        }
134 //                    }
135 
136                 for (int i = 0; i < tokens.size(); i++) {
137                     CoreLabel token = tokens.get(i);
138 
139 //                    int spaces = 0;
140 //                    if (i != tokens.size() - 1) {
141 //                        CoreLabel nextToken = tokens.get(i + 1);
142 //                        Integer begin = nextToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
143 //                        Integer end = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
144 //                        spaces = begin - end;
145 //                        System.out.println("N: " + token + " -- " + nextToken + " -- " + (begin - end));
146 //                    }
147 //                    if (i != 0) {
148 //                        CoreLabel prevToken = tokens.get(i - 1);
149 //                        Integer begin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
150 //                        Integer end = prevToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
151 //                        spaces = Math.max(spaces, begin - end);
152 //                        System.out.println("P: " + prevToken + " -- " + token + " -- " + (begin - end));
153 //                    }
154                     String toAppend = token.originalText();
155                     int relativeID = i - foundHead + head + 1;
156                     String replacement = replacements.get(relativeID);
157                     if (useRegex && replacement != null) {
158                         Matcher matcher = replacementPattern.matcher(replacement);
159                         StringBuffer sb = new StringBuffer(replacement.length());
160                         while (matcher.find()) {
161 //                            String text = matcher.group(1);
162                             String text = groups.get(Integer.parseInt(matcher.group(1)) - 1);
163                             matcher.appendReplacement(sb, Matcher.quoteReplacement(text));
164                         }
165                         matcher.appendTail(sb);
166                         replacement = sb.toString();
167                     }
168                     if (replacement != null) {
169                         toAppend = replacement;
170                     }
171                     if (toAppend.length() > 0) {
172                         if (tokensToTheNewSentence.contains(i + 1)) {
173                             newSentence.append(toAppend);
174 //                            for (int j = 0; j < spaces; j++) {
175                             newSentence.append(" ");
176 //                            }
177                         } else {
178                             oldSentence.append(toAppend);
179 //                            for (int j = 0; j < spaces; j++) {
180                             oldSentence.append(" ");
181 //                            }
182                         }
183                     }
184                 }
185 
186 //                System.out.println(oldSentence);
187 //                System.out.println(newSentence);
188 
189                 ret.append(oldSentence).append("\n");
190                 ret.append(newSentence).append("\n");
191 
192             } else {
193                 ret.append(sentence.get(CoreAnnotations.TextAnnotation.class).trim()).append("\n");
194             }
195 
196         }
197 
198         return ret.toString();
199     }
200 }