1   package eu.fbk.dh.tint.readability.it;
2   
import com.google.common.collect.HashMultimap;
import com.google.gson.Gson;
import com.google.gson.stream.JsonReader;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import eu.fbk.dh.tint.readability.GlossarioEntry;
import eu.fbk.dh.tint.readability.Readability;
import eu.fbk.utils.core.PropertiesUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Properties;
22  
23  /**
24   * Created by alessio on 26/09/16.
25   */
26  public class ItalianReadabilityModel {
27  
28      private static final Logger LOGGER = LoggerFactory.getLogger(ItalianReadabilityModel.class);
29      private static ItalianReadabilityModel ourInstance = null;
30      private HashMap<String, GlossarioEntry> glossario = new HashMap<>();
31      private HashMap<Integer, HashMultimap<String, String>> easyWords = new HashMap<>();
32  
33      public static ItalianReadabilityModel getInstance(Properties globalProperties, Properties localProperties) {
34          if (ourInstance == null) {
35              boolean useGlossario = PropertiesUtils.getBoolean(localProperties.getProperty("glossario.use"), false);
36              String easyWordsFileName = localProperties.getProperty("easyWords");
37  
38              Gson gson = new Gson();
39  
40              // Loading simple words
41  
42              EasyLanguage easyLanguage = new EasyLanguage();
43              LOGGER.info("Loading easy lemmas");
44              try {
45                  InputStream stream = Readability.getStream(easyWordsFileName, "/models/easy-output.json");
46                  JsonReader reader = new JsonReader(new InputStreamReader(stream));
47                  easyLanguage = gson.fromJson(reader, EasyLanguage.class);
48              } catch (Exception e) {
49                  LOGGER.warn("Unable to load easyWords file: {}", e.getMessage());
50              }
51  
52              HashMap<Integer, HashMultimap<String, String>> easyWords = new HashMap<>();
53  
54              easyWords.put(1, HashMultimap.create());
55              easyWords.get(1).putAll("S", Arrays.asList(easyLanguage.level1.n));
56  //        easyWords.get(1).putAll("A", Arrays.asList(easyLanguage.level1.a));
57  //        easyWords.get(1).putAll("B", Arrays.asList(easyLanguage.level1.r));
58              easyWords.get(1).putAll("V", Arrays.asList(easyLanguage.level1.v));
59              easyWords.put(2, HashMultimap.create());
60              easyWords.get(2).putAll("S", Arrays.asList(easyLanguage.level2.n));
61              easyWords.get(2).putAll("A", Arrays.asList(easyLanguage.level2.a));
62              easyWords.get(2).putAll("B", Arrays.asList(easyLanguage.level2.r));
63              easyWords.get(2).putAll("V", Arrays.asList(easyLanguage.level2.v));
64              easyWords.put(3, HashMultimap.create());
65              easyWords.get(3).putAll("S", Arrays.asList(easyLanguage.level3.n));
66              easyWords.get(3).putAll("A", Arrays.asList(easyLanguage.level3.a));
67              easyWords.get(3).putAll("B", Arrays.asList(easyLanguage.level3.r));
68              easyWords.get(3).putAll("V", Arrays.asList(easyLanguage.level3.v));
69  
70              // Loading glossario
71  
72              HashMap<String, GlossarioEntry> glossario = new HashMap<>();
73  
74              if (useGlossario) {
75                  Properties stanfordProperties = PropertiesUtils
76                          .dotConvertedProperties(localProperties, "glossario.stanford");
77                  for (String key : globalProperties.stringPropertyNames()) {
78                      if (stanfordProperties.getProperty(key) == null) {
79                          stanfordProperties.setProperty(key, globalProperties.getProperty(key));
80                      }
81                  }
82  
83                  String glossarioFileName = localProperties.getProperty("glossario");
84  
85                  Boolean parseGlossario = PropertiesUtils
86                          .getBoolean(localProperties.getProperty("glossario.parse", "true"), true);
87  
88                  StanfordCoreNLP pipeline = new StanfordCoreNLP(stanfordProperties);
89                  LOGGER.info("Loading glossario");
90                  try {
91                      InputStream stream = Readability
92                              .getStream(glossarioFileName, "/models/glossario-parsed-edited.json");
93                      JsonReader reader = new JsonReader(new InputStreamReader(stream));
94                      GlossarioEntry[] entries = gson.fromJson(reader, GlossarioEntry[].class);
95                      for (GlossarioEntry entry : entries) {
96                          for (String form : entry.getForms()) {
97  //                            if (!form.equals("nido d'infanzia")) {
98  //                                continue;
99  //                            }
100 
101                             if (parseGlossario) {
102                                 Annotation annotation = new Annotation(form);
103                                 pipeline.annotate(annotation);
104 
105                                 StringBuffer lemmaBuffer = new StringBuffer();
106                                 StringBuffer tokenBuffer = new StringBuffer();
107 
108                                 List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
109                                 for (CoreLabel token : tokens) {
110                                     lemmaBuffer.append(token.get(CoreAnnotations.LemmaAnnotation.class)).append(" ");
111                                     tokenBuffer.append(token.get(CoreAnnotations.TextAnnotation.class)).append(" ");
112                                 }
113 
114                                 String pos = entry.getPos();
115                                 String annotatedPos = tokens.get(0).get(CoreAnnotations.PartOfSpeechAnnotation.class);
116                                 if (pos == null || annotatedPos.substring(0, 1).equals("S")) {
117                                     glossario.put(lemmaBuffer.toString().trim(), entry);
118                                 }
119                                 glossario.put(tokenBuffer.toString().trim(), entry);
120                             } else {
121                                 // todo: check this, it was outside the "if"
122                                 glossario.put(form, entry);
123                             }
124                         }
125                     }
126 
127                 } catch (Exception e) {
128                     LOGGER.warn("Unable to load glossario file: {}", e.getMessage());
129                 }
130             }
131 
132             ourInstance = new ItalianReadabilityModel(glossario, easyWords);
133         } else {
134             LOGGER.info("Readability model already loaded");
135         }
136         return ourInstance;
137     }
138 
139     private ItalianReadabilityModel(
140             HashMap<String, GlossarioEntry> glossario,
141             HashMap<Integer, HashMultimap<String, String>> easyWords) {
142         this.glossario = glossario;
143         this.easyWords = easyWords;
144     }
145 
146     public HashMap<String, GlossarioEntry> getGlossario() {
147         return glossario;
148     }
149 
150     public HashMap<Integer, HashMultimap<String, String>> getEasyWords() {
151         return easyWords;
152     }
153 }