1   package eu.fbk.dh.tint.readability;
2   
3   import com.itextpdf.layout.hyphenation.Hyphenation;
4   import com.itextpdf.layout.hyphenation.Hyphenator;
5   import edu.stanford.nlp.ling.CoreAnnotations;
6   import edu.stanford.nlp.ling.CoreLabel;
7   import edu.stanford.nlp.ling.IndexedWord;
8   import edu.stanford.nlp.pipeline.Annotation;
9   import edu.stanford.nlp.semgraph.SemanticGraph;
10  import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
11  import edu.stanford.nlp.util.CoreMap;
12  import eu.fbk.dh.tint.readability.es.SpanishReadabilityModel;
13  import eu.fbk.dh.tint.verb.VerbAnnotations;
14  import eu.fbk.dh.tint.verb.VerbMultiToken;
15  import eu.fbk.utils.core.FrequencyHashSet;
16  import eu.fbk.utils.core.PropertiesUtils;
17  import eu.fbk.utils.gson.JSONExclude;
18  
19  import javax.annotation.Nullable;
20  import java.io.File;
21  import java.io.FileInputStream;
22  import java.io.FileNotFoundException;
23  import java.io.InputStream;
24  import java.text.Normalizer;
25  import java.util.*;
26  
27  /**
28   * Created by alessio on 21/09/16.
29   */
30  public abstract class Readability {
31  
32      public static Integer DEFAULT_TTR_LIMIT = 1000;
33      @JSONExclude private int ttrLimit;
34  
35      private String language = null;
36      private int contentWordSize = 0, contentEasyWordSize = 0, wordCount = 0;
37      private int docLenWithSpaces = 0, docLenWithoutSpaces = 0, docLenLettersOnly = 0;
38      private int sentenceCount = 0, tokenCount = 0;
39      private int hyphenCount = 0;
40      private int hyphenWordCount = 0;
41  
42      private Double ttrValue;
43      private Double density;
44      private Double deepAvg;
45      private Double deepMax;
46      private Double propositionsAvg;
47      private Double wordsAvg;
48      //    private Double coordinateRatio;
49      private Double subordinateRatio;
50  
51      protected TreeMap<Integer, DescriptionForm> forms = new TreeMap<>();
52  
53      protected Map<String, Double> measures = new HashMap<>();
54      protected Map<String, String> labels = new HashMap<>();
55  
56      protected Map<String, Double> minYellowValues = new HashMap<>();
57      protected Map<String, Double> maxYellowValues = new HashMap<>();
58      protected Map<String, Double> minValues = new HashMap<>();
59      protected Map<String, Double> maxValues = new HashMap<>();
60  
61      public int getHyphenWordCount() {
62          return hyphenWordCount;
63      }
64  
65      @JSONExclude protected HashSet<String> contentPosList = new HashSet<>();
66      @JSONExclude protected HashSet<String> simplePosList = new HashSet<>();
67      @JSONExclude protected HashSet<String> nonWordPosList = new HashSet<>();
68  
69      protected HashMap<String, String> genericPosDescription = new HashMap<>();
70      protected HashMap<String, String> posDescription = new HashMap<>();
71  
72      @JSONExclude boolean useGenericForContent = true;
73      @JSONExclude boolean useGenericForSimple = true;
74      @JSONExclude boolean useGenericForWord = true;
75  
76      Set<Integer> tooLongSentences = new HashSet<>();
77      FrequencyHashSet<String> posStats = new FrequencyHashSet<>();
78      FrequencyHashSet<String> genericPosStats = new FrequencyHashSet<>();
79  
80      @JSONExclude protected Hyphenator hyphenator;
81      @JSONExclude protected Annotation annotation;
82  
83      public Readability(String language, Annotation annotation, Properties localProperties) {
84          this.language = language;
85          this.annotation = annotation;
86  
87          String text = annotation.get(CoreAnnotations.TextAnnotation.class);
88          docLenWithSpaces = text.length();
89          docLenWithoutSpaces = text.replaceAll("\\s+", "").length();
90          ttrLimit = PropertiesUtils.getInteger(localProperties.getProperty("ttrLimit"), DEFAULT_TTR_LIMIT);
91      }
92  
93      public void finalizeReadability() {
94          Set<String> ttr = new HashSet<>();
95  
96          int i = 0;
97          for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
98              Boolean isWord = token.get(ReadabilityAnnotations.LiteralWord.class);
99              if (!isWord) {
100                 continue;
101             }
102 
103             if (i >= ttrLimit) {
104                 break;
105             }
106             String tokenText = token.originalText().toLowerCase();
107             ttr.add(tokenText);
108             i++;
109         }
110         List<Integer> deeps = new ArrayList<>();
111         List<Integer> propositions = new ArrayList<>();
112 
113         Integer coordinates = 0;
114         Integer subordinates = 0;
115 
116         List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
117         for (int sentIndex = 0; sentIndex < sentences.size(); sentIndex++) {
118             CoreMap sentence = sentences.get(sentIndex);
119 
120             SemanticGraph semanticGraph = sentence
121                     .get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
122             int deep = 0;
123             if (semanticGraph == null) {
124                 continue;
125             }
126 
127             for (IndexedWord indexedWord : semanticGraph.getLeafVertices()) {
128                 try {
129                     deep = Math.max(deep, semanticGraph.getPathToRoot(indexedWord).size());
130                 } catch (NullPointerException e) {
131                     // ignored
132                 }
133             }
134             deeps.add(deep);
135             
136             if (!sentence.containsKey(VerbAnnotations.VerbsAnnotation.class)) {
137                 continue;
138             }
139 
140             List<VerbMultiToken> verbs = sentence.get(VerbAnnotations.VerbsAnnotation.class);
141             propositions.add(verbs.size());
142 
143             Set<Integer> heads = new HashSet<>();
144             for (VerbMultiToken verb : verbs) {
145                 Map<Integer, String> parentIDs = SemanticGraphUtils.getParent(verb, semanticGraph);
146                 Integer head = SemanticGraphUtils.getHead(verb, semanticGraph);
147                 heads.add(head);
148 //                indexedVerbs.put(head, verb);
149 
150                 if (parentIDs.size() == 0) {
151                     continue;
152                 }
153 
154                 if (parentIDs.values().contains("conj")) {
155                     coordinates++;
156                     continue;
157                 }
158 
159                 subordinates++;
160             }
161         }
162 
163         ttrValue = 1.0 * ttr.size() / (1.0 * i);
164         if (deeps.size() > 0) {
165             deepAvg = deeps.stream().mapToInt(val -> val).average().getAsDouble();
166             deepMax = deeps.stream().mapToInt(val -> val).max().getAsInt() * 1.0;
167         }
168         if (propositions.size() > 0) {
169             propositionsAvg = propositions.stream().mapToInt(val -> val).average().getAsDouble();
170             wordsAvg = (1.0 * getWordCount()) / propositions.stream().mapToInt(val -> val).sum();
171             if (wordsAvg == Double.POSITIVE_INFINITY) {
172                 wordsAvg = 0.0;
173             }
174         }
175 
176         int total = coordinates + subordinates;
177         if (total == 0) {
178 //            coordinateRatio = 0.0;
179             subordinateRatio = 0.0;
180         } else {
181 //            coordinateRatio = (1.0 * coordinates) / (coordinates + subordinates);
182             subordinateRatio = (1.0 * subordinates) / (coordinates + subordinates);
183         }
184         density = (1.0 * getContentWordSize()) / getWordCount();
185 
186 //        System.out.println("Average deep: " + deepAvg);
187 //        System.out.println("Average propositions: " + propositionsAvg);
188 //        System.out.println("Average words per proposition: " + wordsAvg);
189 //        System.out.println(String.format("Coordinates: %d (%.2f%%)", coordinates, coordinateRatio));
190 //        System.out.println("Coordinates: " + coordinates);
191 //        System.out.println("Subordinates: " + subordinates);
192 //        System.out.println(String.format("Subordinates: %d (%.2f%%)", subordinates, subordinateRatio));
193 //        System.out.println("TTR: " + ttrValue);
194     }
195 
196     public Map<String, Double> getMeasures() {
197         return measures;
198     }
199 
200     public void addingContentWord(CoreLabel token) {
201         token.set(ReadabilityAnnotations.ContentWord.class, true);
202     }
203 
204     public abstract void addingEasyWord(CoreLabel token);
205 
206     public void addingWord(CoreLabel token) {
207         token.set(ReadabilityAnnotations.LiteralWord.class, true);
208     }
209 
210     public abstract void addingToken(CoreLabel token);
211 
212     public abstract void addingSentence(CoreMap sentence);
213 
214     public void addTooLongSentence(Integer sentenceID) {
215         tooLongSentences.add(sentenceID);
216     }
217 
218     public Set<Integer> getTooLongSentences() {
219         return tooLongSentences;
220     }
221 
222     public String getLanguage() {
223         return language;
224     }
225 
226     public int getContentWordSize() {
227         return contentWordSize;
228     }
229 
230     public void setContentWordSize(int contentWordSize) {
231         this.contentWordSize = contentWordSize;
232     }
233 
234     public int getContentEasyWordSize() {
235         return contentEasyWordSize;
236     }
237 
238     public void setContentEasyWordSize(int contentEasyWordSize) {
239         this.contentEasyWordSize = contentEasyWordSize;
240     }
241 
242     public int getWordCount() {
243         return wordCount;
244     }
245 
246     public void setWordCount(int wordCount) {
247         this.wordCount = wordCount;
248     }
249 
250     public int getDocLenWithSpaces() {
251         return docLenWithSpaces;
252     }
253 
254     public void setDocLenWithSpaces(int docLenWithSpaces) {
255         this.docLenWithSpaces = docLenWithSpaces;
256     }
257 
258     public int getDocLenWithoutSpaces() {
259         return docLenWithoutSpaces;
260     }
261 
262     public void setDocLenWithoutSpaces(int docLenWithoutSpaces) {
263         this.docLenWithoutSpaces = docLenWithoutSpaces;
264     }
265 
266     public int getDocLenLettersOnly() {
267         return docLenLettersOnly;
268     }
269 
270     public void setDocLenLettersOnly(int docLenLettersOnly) {
271         this.docLenLettersOnly = docLenLettersOnly;
272     }
273 
274     public int getSentenceCount() {
275         return sentenceCount;
276     }
277 
278     public void setSentenceCount(int sentenceCount) {
279         this.sentenceCount = sentenceCount;
280     }
281 
282     public int getTokenCount() {
283         return tokenCount;
284     }
285 
286     public void setTokenCount(int tokenCount) {
287         this.tokenCount = tokenCount;
288     }
289 
290     public FrequencyHashSet<String> getPosStats() {
291         return posStats;
292     }
293 
294     public FrequencyHashSet<String> getGenericPosStats() {
295         return genericPosStats;
296     }
297 
298     public String getTransformedPos(String pos) {
299         return pos;
300     }
301 
302     public int getHyphenCount() {
303         return hyphenCount;
304     }
305 
306     public void setHyphenCount(int hyphenCount) {
307         this.hyphenCount = hyphenCount;
308     }
309 
310     public void incrementHyphenCount(int increment) {
311         this.hyphenCount += increment;
312     }
313 
314     // thanks! http://stackoverflow.com/questions/3322152/is-there-a-way-to-get-rid-of-accents-and-convert-a-whole-string-to-regular-lette
315     public static String flattenToAscii(String string) {
316         StringBuilder sb = new StringBuilder(string.length());
317         string = Normalizer.normalize(string, Normalizer.Form.NFD);
318         for (char c : string.toCharArray()) {
319             if (c <= '\u007F') {
320                 sb.append(c);
321             }
322         }
323         return sb.toString();
324     }
325 
326     public void addWord(CoreLabel token) {
327         token.set(ReadabilityAnnotations.ContentWord.class, false);
328         token.set(ReadabilityAnnotations.LiteralWord.class, false);
329 
330         String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
331 //        String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
332         String word = token.word();
333 
334         addingToken(token);
335 
336         if (isWordPos(pos)) {
337             addingWord(token);
338             wordCount++;
339             docLenLettersOnly += token.endPosition() - token.beginPosition();
340 
341             word = flattenToAscii(word);
342             Hyphenation hyphenation = hyphenator.hyphenate(word);
343 
344             boolean done = false;
345             if (hyphenation != null) {
346                 try {
347                     String h = hyphenation.toString();
348                     incrementHyphenCount(hyphenation.length() + 1);
349                     token.set(ReadabilityAnnotations.HyphenationAnnotation.class, h);
350                     done = true;
351                     hyphenWordCount++;
352                 } catch (Exception e) {
353                     // ignored
354                 }
355             }
356 
357             if (!done && word.length() < 5) {
358                 incrementHyphenCount(1);
359                 hyphenWordCount++;
360             }
361 
362             if (isContentPos(pos)) {
363                 contentWordSize++;
364                 addingContentWord(token);
365             }
366             if (isEasyPos(pos)) {
367                 contentEasyWordSize++;
368                 addingEasyWord(token);
369             }
370         }
371         if (token.get(ReadabilityAnnotations.HyphenationAnnotation.class) == null) {
372             token.set(ReadabilityAnnotations.HyphenationAnnotation.class, token.originalText());
373         }
374 
375         String genericPos = getGenericPos(pos);
376         posStats.add(pos);
377         genericPosStats.add(genericPos);
378     }
379 
380     protected String getGenericPos(String pos) {
381         return pos.substring(0, 1);
382     }
383 
384     protected boolean getGenericPosInfo(boolean constraint, Set<String> setToCheck, String pos, boolean reverse) {
385         if (constraint) {
386             pos = getGenericPos(pos);
387         }
388         boolean ret = setToCheck.contains(pos);
389         if (reverse) {
390             return !ret;
391         } else {
392             return ret;
393         }
394     }
395 
396     public static InputStream getStream(String fileName, @Nullable String defaultFileName)
397             throws FileNotFoundException {
398         if (fileName != null) {
399             File streamFile = new File(fileName);
400             if (streamFile.exists()) {
401                 return new FileInputStream(streamFile);
402             }
403         }
404         InputStream input = SpanishReadabilityModel.class.getResourceAsStream(defaultFileName);
405         if (input != null) {
406             return input;
407         }
408 
409         if (defaultFileName != null) {
410             return getStream(defaultFileName, null);
411         }
412         return null;
413     }
414 
415     protected boolean isWordPos(String pos) {
416         return getGenericPosInfo(useGenericForWord, nonWordPosList, pos, true);
417     }
418 
419     protected boolean isContentPos(String pos) {
420         return getGenericPosInfo(useGenericForContent, contentPosList, pos, false);
421     }
422 
423     protected boolean isEasyPos(String pos) {
424         return getGenericPosInfo(useGenericForSimple, simplePosList, pos, false);
425     }
426 
427     public Double getTtrValue() {
428         return ttrValue;
429     }
430 
431     public Double getDeepAvg() {
432         return deepAvg;
433     }
434 
435     public Double getDeepMax() {
436         return deepMax;
437     }
438 
439     public Double getPropositionsAvg() {
440         return propositionsAvg;
441     }
442 
443     public Double getWordsAvg() {
444         return wordsAvg;
445     }
446 
447 //    public Double getCoordinateRatio() {
448 //        return coordinateRatio;
449 //    }
450 
451     public Double getSubordinateRatio() {
452         return subordinateRatio;
453     }
454 
455     public Double getDensity() {
456         return density;
457     }
458 
459     @Override public String toString() {
460         return "Readability{" +
461                 "language='" + language + '\'' +
462                 ", contentWordSize=" + contentWordSize +
463                 ", contentEasyWordSize=" + contentEasyWordSize +
464                 ", wordCount=" + wordCount +
465                 ", docLenWithSpaces=" + docLenWithSpaces +
466                 ", docLenWithoutSpaces=" + docLenWithoutSpaces +
467                 ", docLenLettersOnly=" + docLenLettersOnly +
468                 ", sentenceCount=" + sentenceCount +
469                 ", tokenCount=" + tokenCount +
470                 ", hyphenCount=" + hyphenCount +
471                 ", hyphenWordCount=" + hyphenWordCount +
472                 ", ttrValue=" + ttrValue +
473                 ", deepAvg=" + deepAvg +
474                 ", deepMax=" + deepMax +
475                 ", propositionsAvg=" + propositionsAvg +
476                 ", wordsAvg=" + wordsAvg +
477                 ", subordinateRatio=" + subordinateRatio +
478                 ", measures=" + measures +
479                 ", contentPosList=" + contentPosList +
480                 ", simplePosList=" + simplePosList +
481                 ", nonWordPosList=" + nonWordPosList +
482                 ", tooLongSentences=" + tooLongSentences +
483                 ", posStats=" + posStats +
484                 ", genericPosStats=" + genericPosStats +
485                 '}';
486     }
487 }