1   package eu.fbk.dh.tint.readability.it;
2   
3   import com.google.common.collect.HashMultimap;
4   import com.itextpdf.layout.hyphenation.Hyphenator;
5   import edu.stanford.nlp.ling.CoreAnnotations;
6   import edu.stanford.nlp.ling.CoreLabel;
7   import edu.stanford.nlp.pipeline.Annotation;
8   import edu.stanford.nlp.util.CoreMap;
9   import eu.fbk.dh.tint.readability.DescriptionForm;
10  import eu.fbk.dh.tint.readability.GlossarioEntry;
11  import eu.fbk.dh.tint.readability.Readability;
12  import eu.fbk.dh.tint.readability.ReadabilityAnnotations;
13  import eu.fbk.utils.gson.JSONExclude;
14  
15  import java.util.*;
16  
17  /**
18   * Created by alessio on 21/09/16.
19   */
20  
21  public abstract class ItalianReadability extends Readability {
22  
23      @JSONExclude ItalianReadabilityModel model;
24      @JSONExclude int level1WordSize = 0, level2WordSize = 0, level3WordSize = 0;
25  
26      @JSONExclude StringBuilder lemmaBuffer = new StringBuilder();
27      @JSONExclude StringBuilder tokenBuffer = new StringBuilder();
28      @JSONExclude int lemmaIndex = 0;
29      @JSONExclude HashMap<Integer, Integer> lemmaIndexes = new HashMap<>();
30      @JSONExclude HashMap<Integer, Integer> tokenIndexes = new HashMap<>();
31  
32      @Override public void finalizeReadability() {
33          super.finalizeReadability();
34  
35          double gulpease = 89 + (300 * getSentenceCount() - 10 * getDocLenLettersOnly()) / (getWordCount() * 1.0);
36          labels.put("main", "Gulpease");
37          measures.put("main", gulpease);
38          measures.put("level1", 100.0 * level1WordSize / getContentEasyWordSize());
39          measures.put("level2", 100.0 * level2WordSize / getContentWordSize());
40          measures.put("level3", 100.0 * level3WordSize / getContentWordSize());
41  
42          String lemmaText = lemmaBuffer.toString().trim().toLowerCase();
43          String tokenText = tokenBuffer.toString().trim().toLowerCase();
44  //        String text = annotation.get(CoreAnnotations.TextAnnotation.class).toLowerCase();
45  
46          HashMap<String, GlossarioEntry> glossario = model.getGlossario();
47  
48          List<String> glossarioKeys = new ArrayList<>(glossario.keySet());
49          Collections.sort(glossarioKeys, new StringLenComparator());
50  
51          for (String form : glossarioKeys) {
52  
53              int numberOfTokens = form.split("\\s+").length;
54              List<Integer> allOccurrences = findAllOccurrences(tokenText, form);
55              List<Integer> allLemmaOccurrences = findAllOccurrences(lemmaText, form);
56  
57              for (Integer occurrence : allOccurrences) {
58                  addDescriptionForm(form, tokenIndexes, occurrence, numberOfTokens, forms, annotation, glossario);
59              }
60              for (Integer occurrence : allLemmaOccurrences) {
61                  addDescriptionForm(form, lemmaIndexes, occurrence, numberOfTokens, forms, annotation, glossario);
62              }
63          }
64  
65      }
66  
67      public ItalianReadability(Properties globalProperties, Properties localProperties, Annotation annotation) {
68          super("it", annotation, localProperties);
69          hyphenator = new Hyphenator("it", "it", 1, 1);
70          model = ItalianReadabilityModel.getInstance(globalProperties, localProperties);
71  
72          minYellowValues.put("propositionsAvg", 2.038);
73          maxYellowValues.put("propositionsAvg", 2.699);
74          minValues.put("propositionsAvg", 0.0);
75          maxValues.put("propositionsAvg", 5.0);
76  
77          minYellowValues.put("wordsAvg", 9.845);
78          maxYellowValues.put("wordsAvg", 10.153);
79          minValues.put("wordsAvg", 0.0);
80          maxValues.put("wordsAvg", 12.0);
81  
82  //        minYellowValues.put("coordinateRatio", 0.737);
83  //        maxYellowValues.put("coordinateRatio", 0.675);
84  //        minValues.put("coordinateRatio", 0.0);
85  //        maxValues.put("coordinateRatio", 1.0);
86  
87          minYellowValues.put("subordinateRatio", 0.263);
88          maxYellowValues.put("subordinateRatio", 0.325);
89          minValues.put("subordinateRatio", 0.0);
90          maxValues.put("subordinateRatio", 1.0);
91  
92          minYellowValues.put("deepAvg", 5.292);
93          maxYellowValues.put("deepAvg", 6.532);
94          minValues.put("deepAvg", 0.0);
95          maxValues.put("deepAvg", 10.0);
96  
97          minYellowValues.put("ttrValue", 0.549);
98          maxYellowValues.put("ttrValue", 0.719);
99          minValues.put("ttrValue", 0.0);
100         maxValues.put("ttrValue", 1.0);
101 
102         minYellowValues.put("density", 0.566);
103         maxYellowValues.put("density", 0.566);
104         minValues.put("density", 0.0);
105         maxValues.put("density", 1.0);
106     }
107 
108     public static class StringLenComparator implements Comparator<String> {
109 
110         public int compare(String s1, String s2) {
111             return s1.length() - s2.length();
112         }
113     }
114 
115     public static List<Integer> findAllOccurrences(String haystack, String needle) {
116 
117         List<Integer> ret = new ArrayList<>();
118 
119         int index = haystack.indexOf(needle);
120         while (index >= 0) {
121             try {
122                 String afterChar = haystack.substring(index + needle.length(), index + needle.length());
123                 if (!afterChar.matches("\\w+")) {
124                     ret.add(index);
125                 }
126             } catch (Exception e) {
127                 // ignore
128             }
129             index = haystack.indexOf(needle, index + 1);
130         }
131 
132         return ret;
133     }
134 
135     static public void addDescriptionForm(String form, HashMap<Integer, Integer> indexes, int start,
136             int numberOfTokens, TreeMap<Integer, DescriptionForm> forms, Annotation annotation,
137             HashMap<String, GlossarioEntry> glossario) {
138         Integer lemmaIndex = indexes.get(start);
139         if (lemmaIndex == null) {
140             return;
141         }
142 
143         CoreLabel firstToken = annotation.get(CoreAnnotations.TokensAnnotation.class).get(lemmaIndex);
144         CoreLabel endToken = annotation.get(CoreAnnotations.TokensAnnotation.class)
145                 .get(lemmaIndex + numberOfTokens - 1);
146         Integer beginOffset = firstToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
147         Integer endOffset = endToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
148 
149         GlossarioEntry glossarioEntry = glossario.get(form);
150         if (glossarioEntry == null) {
151             return;
152         }
153 
154         DescriptionForm descriptionForm = new DescriptionForm(
155                 beginOffset, endOffset, glossarioEntry);
156 
157         forms.put(beginOffset, descriptionForm);
158     }
159 
160     @Override public void addingContentWord(CoreLabel token) {
161         super.addingContentWord(token);
162         HashMap<Integer, HashMultimap<String, String>> easyWords = model.getEasyWords();
163         String simplePos = getGenericPos(token.get(CoreAnnotations.PartOfSpeechAnnotation.class));
164         String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
165 
166         token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 4);
167 
168         if (easyWords.get(3).get(simplePos).contains(lemma)) {
169             level3WordSize++;
170             token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 3);
171         }
172         if (easyWords.get(2).get(simplePos).contains(lemma)) {
173             level2WordSize++;
174             token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 2);
175         }
176         if (easyWords.get(1).get(simplePos).contains(lemma)) {
177             level1WordSize++;
178             token.set(ReadabilityAnnotations.DifficultyLevelAnnotation.class, 1);
179         }
180     }
181 
    /**
     * Hook invoked for an easy word; this implementation does nothing.
     *
     * <p>NOTE(review): the override does not call {@code super.addingEasyWord(token)}; whether the
     * base class has work to do here is not visible from this file - confirm this is intended.
     *
     * @param token the easy word being added (unused)
     */
    @Override public void addingEasyWord(CoreLabel token) {

    }
185 
    /**
     * Hook invoked for a word; simply delegates to the superclass implementation.
     *
     * @param token the word being added
     */
    @Override public void addingWord(CoreLabel token) {
        super.addingWord(token);
    }
189 
190     @Override public void addingToken(CoreLabel token) {
191         lemmaIndexes.put(lemmaBuffer.length(), lemmaIndex);
192         tokenIndexes.put(token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), lemmaIndex);
193         lemmaIndex++;
194         lemmaBuffer.append(token.get(CoreAnnotations.LemmaAnnotation.class)).append(" ");
195         tokenBuffer.append(token.get(CoreAnnotations.TextAnnotation.class)).append(" ");
196     }
197 
    /**
     * Hook invoked for a sentence; this implementation does nothing.
     *
     * @param sentence the sentence being added (unused)
     */
    @Override public void addingSentence(CoreMap sentence) {

    }
201 }