1   package eu.fbk.dh.tint.readability.en;
2   
import eu.fbk.utils.core.FrequencyHashSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import static eu.fbk.dh.tint.readability.Readability.getStream;
16  
17  /**
18   * Created by alessio on 26/09/16.
19   */
20  public class EnglishReadabilityModel {
21  
22      private static final Logger LOGGER = LoggerFactory.getLogger(EnglishReadabilityModel.class);
23      private static EnglishReadabilityModel ourInstance = null;
24  
25      private static final int LIMIT_EASY = 1000;
26      private static final int LIMIT_MEDIUM = 5000;
27      private static final int LIMIT_HARD = 10000;
28  
29      private Set<String> level1Lemmas = new HashSet<>();
30      private Set<String> level2Lemmas = new HashSet<>();
31      private Set<String> level3Lemmas = new HashSet<>();
32  
33  //    private HashMap<String, GlossarioEntry> glossario = new HashMap<>();
34  //    private HashMap<Integer, HashMultimap<String, String>> easyWords = new HashMap<>();
35  
36      public static EnglishReadabilityModel getInstance(Properties globalProperties, Properties localProperties) {
37          if (ourInstance == null) {
38              String freqLemmaFile = localProperties.getProperty("lemmasFile");
39  
40              LOGGER.info("Loading lemmas");
41              Set<String> level1Lemmas = new HashSet<>();
42              Set<String> level2Lemmas = new HashSet<>();
43              Set<String> level3Lemmas = new HashSet<>();
44  
45              try {
46  
47                  FrequencyHashSet<String> frequecies = new FrequencyHashSet<>();
48  
49                  InputStream stream = getStream(freqLemmaFile, "/models/0_words.txt");
50                  BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
51                  String line;
52                  while ((line = reader.readLine()) != null) {
53                      String[] parts = line.split("\t");
54                      if (parts.length < 2) {
55                          continue;
56                      }
57  
58                      String form = parts[0];
59                      Integer frequency = Integer.parseInt(parts[1]);
60                      frequecies.add(form, frequency);
61                  }
62  
63                  int i = 0;
64                  for (Map.Entry<String, Integer> entry : frequecies.getSorted()) {
65                      String form = entry.getKey();
66  //                    Integer frequency = entry.getValue();
67  
68                      boolean done = false;
69                      if (i < LIMIT_EASY) {
70                          level1Lemmas.add(form);
71                          done = true;
72                      }
73                      if (i < LIMIT_MEDIUM) {
74                          level2Lemmas.add(form);
75                          done = true;
76                      }
77                      if (i < LIMIT_HARD) {
78                          level3Lemmas.add(form);
79                          done = true;
80                      }
81  
82                      if (!done) {
83                          break;
84                      }
85  
86                      i++;
87                  }
88                  reader.close();
89              } catch (Exception e) {
90                  LOGGER.warn("Unable to load easyWords file: {}", e.getMessage());
91              }
92  
93              ourInstance = new EnglishReadabilityModel(level1Lemmas, level2Lemmas, level3Lemmas);
94          } else {
95              LOGGER.info("Readability model already loaded");
96          }
97          return ourInstance;
98      }
99  
100     private EnglishReadabilityModel(Set<String> level1Lemmas, Set<String> level2Lemmas,
101             Set<String> level3Lemmas) {
102         this.level1Lemmas = level1Lemmas;
103         this.level2Lemmas = level2Lemmas;
104         this.level3Lemmas = level3Lemmas;
105     }
106 
107     public Set<String> getLevel1Lemmas() {
108         return level1Lemmas;
109     }
110 
111     public Set<String> getLevel2Lemmas() {
112         return level2Lemmas;
113     }
114 
115     public Set<String> getLevel3Lemmas() {
116         return level3Lemmas;
117     }
118 }