package eu.fbk.dh.tint.readability.en;

import com.itextpdf.layout.hyphenation.Hyphenator;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.CoreMap;
import eu.fbk.dh.tint.readability.Readability;
import eu.fbk.utils.gson.JSONExclude;

import java.util.Properties;

/**
 * English implementation of {@link Readability}: computes the Flesch reading-ease
 * and Flesch-Kincaid grade-level indexes, plus the share of content words found in
 * the three difficulty lists of {@link EnglishReadabilityModel}.
 *
 * Created by alessio on 21/09/16.
 */

abstract class EnglishReadability extends Readability {

    @JSONExclude EnglishReadabilityModel model;
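    // Counters for content words whose lemma occurs in the model's level-1/2/3
    // lists (presumably ordered from most basic to most advanced vocabulary).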
    @JSONExclude int level1WordSize = 0, level2WordSize = 0, level3WordSize = 0;
//
//    @JSONExclude StringBuilder buffer = new StringBuilder();
//    @JSONExclude int lemmaIndex = 0;
//    @JSONExclude HashMap<Integer, Integer> lemmaIndexes = new HashMap<>();
//    @JSONExclude HashMap<Integer, Integer> tokenIndexes = new HashMap<>();
//    TreeMap<Integer, DescriptionForm> forms = new TreeMap<>();

    @Override public void finalizeReadability() {
        super.finalizeReadability();

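        // Flesch reading ease: 206.835 - 84.6 * (syllables/word) - 1.015 * (words/sentence),
        // where syllables are approximated by hyphenation points; higher scores mean
        // easier text. E.g. 100 words, 5 sentences, 150 syllables:
        // 206.835 - 84.6 * 1.5 - 1.015 * 20 = 59.635.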
        double flesch = 206.835 - (84.6 * getHyphenCount() / getHyphenWordCount()) - (1.015 * getWordCount()
                / getSentenceCount());
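        // Flesch-Kincaid grade level: 0.39 * (words/sentence) + 11.8 * (syllables/word)
        // - 15.59, an estimate of the US school grade needed to read the text.
        // Same example: 0.39 * 20 + 11.8 * 1.5 - 15.59 = 9.91.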
        double fleschKincaid =
                (0.39 * getWordCount() / getSentenceCount()) + (11.8 * getHyphenCount() / getHyphenWordCount()) - 15.59;
        labels.put("main", "Flesch");
        measures.put("main", flesch);
        measures.put("flesch-kincaid", fleschKincaid);
38          measures.put("level1", 100.0 * level1WordSize / getContentWordSize());
39          measures.put("level2", 100.0 * level2WordSize / getContentWordSize());
40          measures.put("level3", 100.0 * level3WordSize / getContentWordSize());
//
//        String lemmaText = buffer.toString().trim();
//        String text = annotation.get(CoreAnnotations.TextAnnotation.class);
//
//        HashMap<String, GlossarioEntry> glossario = model.getGlossario();
//
//        List<String> glossarioKeys = new ArrayList<>(glossario.keySet());
//        Collections.sort(glossarioKeys, new StringLenComparator());
//
//        for (String form : glossarioKeys) {
//
//            int numberOfTokens = form.split("\\s+").length;
//            List<Integer> allOccurrences = findAllOccurrences(text, form);
//            List<Integer> allLemmaOccurrences = findAllOccurrences(lemmaText, form);
//
//            for (Integer occurrence : allOccurrences) {
//                addDescriptionForm(form, tokenIndexes, occurrence, numberOfTokens, forms, annotation, glossario);
//            }
//            for (Integer occurrence : allLemmaOccurrences) {
//                addDescriptionForm(form, lemmaIndexes, occurrence, numberOfTokens, forms, annotation, glossario);
//            }
//        }
    }

    public EnglishReadability(Properties globalProperties, Properties localProperties, Annotation annotation) {
        super("en", annotation, localProperties);
        hyphenator = new Hyphenator("en", "en", 1, 1);
        model = EnglishReadabilityModel.getInstance(globalProperties, localProperties);

//        System.out.println(model.getLevel1Lemmas().size());
//        System.out.println(model.getLevel2Lemmas().size());
//        System.out.println(model.getLevel3Lemmas().size());

//        minYellowValues.put("propositionsAvg", 2.038);
//        maxYellowValues.put("propositionsAvg", 2.699);
//        minValues.put("propositionsAvg", 0.0);
//        maxValues.put("propositionsAvg", 5.0);
//
//        minYellowValues.put("wordsAvg", 9.845);
//        maxYellowValues.put("wordsAvg", 10.153);
//        minValues.put("wordsAvg", 0.0);
//        maxValues.put("wordsAvg", 12.0);

//        minYellowValues.put("coordinateRatio", 0.737);
//        maxYellowValues.put("coordinateRatio", 0.675);
//        minValues.put("coordinateRatio", 0.0);
//        maxValues.put("coordinateRatio", 1.0);

//        minYellowValues.put("subordinateRatio", 0.263);
//        maxYellowValues.put("subordinateRatio", 0.325);
//        minValues.put("subordinateRatio", 0.0);
//        maxValues.put("subordinateRatio", 1.0);

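        // Display thresholds for each measure: [minValues, maxValues] is the full
        // scale, and [minYellowValues, maxYellowValues] presumably marks the
        // borderline ("yellow") band between easy and hard readings.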
95          minYellowValues.put("deepAvg", 5.292);
96          maxYellowValues.put("deepAvg", 6.532);
97          minValues.put("deepAvg", 0.0);
98          maxValues.put("deepAvg", 10.0);
99  
100         minYellowValues.put("deepMax", 9.0);
101         maxYellowValues.put("deepMax", 12.0);
102         minValues.put("deepMax", 0.0);
103         maxValues.put("deepMax", 20.0);
104 
105         minYellowValues.put("ttrValue", 0.549);
106         maxYellowValues.put("ttrValue", 0.719);
107         minValues.put("ttrValue", 0.0);
108         maxValues.put("ttrValue", 1.0);
109 
110         minYellowValues.put("density", 0.566);
111         maxYellowValues.put("density", 0.566);
112         minValues.put("density", 0.0);
113         maxValues.put("density", 1.0);
114     }
115 
    @Override public void addingContentWord(CoreLabel token) {
        super.addingContentWord(token);

        // Check the token's lemma against each difficulty list; the checks are not
        // exclusive, so a lemma present in several lists increments several counters.
        String lemma = token.lemma();
        if (model.getLevel1Lemmas().contains(lemma)) {
            level1WordSize++;
        }
        if (model.getLevel2Lemmas().contains(lemma)) {
            level2WordSize++;
        }
        if (model.getLevel3Lemmas().contains(lemma)) {
            level3WordSize++;
        }
//        System.out.println("Adding content word (lemma): " + lemma);
//        System.out.println(model.getLevel1Lemmas().contains(lemma));
//        System.out.println(model.getLevel2Lemmas().contains(lemma));
//        System.out.println(model.getLevel3Lemmas().contains(lemma));
//        System.out.println();

//        HashMap<Integer, HashMultimap<String, String>> easyWords = model.getEasyWords();
//        String simplePos = getGenericPos(token.get(CoreAnnotations.PartOfSpeechAnnotation.class));
//        String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
//
//        if (easyWords.get(1).get(simplePos).contains(lemma)) {
//            level1WordSize++;
//        }
//        if (easyWords.get(2).get(simplePos).contains(lemma)) {
//            level2WordSize++;
//        }
//        if (easyWords.get(3).get(simplePos).contains(lemma)) {
//            level3WordSize++;
//        }
    }

    @Override public void addingEasyWord(CoreLabel token) {
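        // No-op: the level counters are maintained in addingContentWord() instead.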
    }

    @Override public void addingWord(CoreLabel token) {
        super.addingWord(token);
    }

    @Override public void addingToken(CoreLabel token) {
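        // The commented-out code below tracked lemma offsets for the (disabled)
        // glossary matching in finalizeReadability().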
//        lemmaIndexes.put(buffer.length(), lemmaIndex);
//        tokenIndexes.put(token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), lemmaIndex);
//        lemmaIndex++;
//        buffer.append(token.get(CoreAnnotations.LemmaAnnotation.class)).append(" ");
    }

    @Override public void addingSentence(CoreMap sentence) {
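        // No-op: no per-sentence bookkeeping is needed for the English measures.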
    }
}