1   package eu.fbk.dh.tint.readability.gl;
2   
3   import org.slf4j.Logger;
4   import org.slf4j.LoggerFactory;
5   
6   import java.io.BufferedReader;
7   import java.io.InputStream;
8   import java.io.InputStreamReader;
9   import java.util.HashSet;
10  import java.util.Properties;
11  import java.util.Set;
12  import java.util.regex.Matcher;
13  import java.util.regex.Pattern;
14  
15  import static eu.fbk.dh.tint.readability.Readability.getStream;
16  
17  /**
18   * Created by alessio on 26/09/16.
19   */
20  public class GalicianReadabilityModel {
21  
22      private static final Logger LOGGER = LoggerFactory.getLogger(GalicianReadabilityModel.class);
23      private static GalicianReadabilityModel ourInstance = null;
24      private static Pattern POS_PATTERN = Pattern.compile("(.*)_([A-Z+]+)");
25      private static Set<String> allowedPos = new HashSet<>();
26      static {
27          allowedPos.add("ADV");
28          allowedPos.add("VERB");
29          allowedPos.add("NOUN");
30          allowedPos.add("ADJ");
31      }
32  
33      private static final int LIMIT_EASY = 500;
34      private static final int LIMIT_MEDIUM = 2500;
35      private static final int LIMIT_HARD = 5000;
36  
37      private Set<String> level1Lemmas = new HashSet<>();
38      private Set<String> level2Lemmas = new HashSet<>();
39      private Set<String> level3Lemmas = new HashSet<>();
40  
41      public static void main(String[] args) {
42          GalicianReadabilityModel instance = GalicianReadabilityModel.getInstance(new Properties(), new Properties());
43          System.out.println(instance.level1Lemmas);
44      }
45  
46      public static GalicianReadabilityModel getInstance(Properties globalProperties, Properties localProperties) {
47          if (ourInstance == null) {
48              String freqLemmaFile = localProperties.getProperty("lemmasFile");
49  
50              LOGGER.info("Loading lemmas");
51              Set<String> level1Lemmas = new HashSet<>();
52              Set<String> level2Lemmas = new HashSet<>();
53              Set<String> level3Lemmas = new HashSet<>();
54  
55              try {
56                  InputStream stream = getStream(freqLemmaFile, "/models/stats_treegal.txt");
57                  BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
58                  String line;
59                  int i = 0;
60                  while ((line = reader.readLine()) != null) {
61  
62                      line = line.trim();
63  
64                      String[] parts = line.split("\\s+");
65                      if (parts.length < 2) {
66                          continue;
67                      }
68  
69                      String lemmaPos = parts[1];
70                      Matcher matcher = POS_PATTERN.matcher(lemmaPos);
71                      if (!matcher.find()) {
72                          continue;
73                      }
74                      String lemma = matcher.group(1);
75                      String pos = matcher.group(2);
76  
77                      if (!allowedPos.contains(pos)) {
78                          continue;
79                      }
80  
81                      boolean done = false;
82                      if (i < LIMIT_EASY) {
83                          level1Lemmas.add(lemma);
84                          done = true;
85                      }
86                      if (i < LIMIT_MEDIUM) {
87                          level2Lemmas.add(lemma);
88                          done = true;
89                      }
90                      if (i < LIMIT_HARD) {
91                          level3Lemmas.add(lemma);
92                          done = true;
93                      }
94  
95                      if (!done) {
96                          break;
97                      }
98  
99                      i++;
100                 }
101                 reader.close();
102             } catch (Exception e) {
103                 LOGGER.warn("Unable to load easyWords file: {}", e.getMessage());
104             }
105 
106             ourInstance = new GalicianReadabilityModel(level1Lemmas, level2Lemmas, level3Lemmas);
107         } else {
108             LOGGER.info("Readability model already loaded");
109         }
110         return ourInstance;
111     }
112 
113     private GalicianReadabilityModel(Set<String> level1Lemmas, Set<String> level2Lemmas,
114                                      Set<String> level3Lemmas) {
115         this.level1Lemmas = level1Lemmas;
116         this.level2Lemmas = level2Lemmas;
117         this.level3Lemmas = level3Lemmas;
118     }
119 
120     public Set<String> getLevel1Lemmas() {
121         return level1Lemmas;
122     }
123 
124     public Set<String> getLevel2Lemmas() {
125         return level2Lemmas;
126     }
127 
128     public Set<String> getLevel3Lemmas() {
129         return level3Lemmas;
130     }
131 }