1   package eu.fbk.dh.tint.digimorph.annotator;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.io.Resources;
5   import eu.fbk.dh.tint.digimorph.DigiMorph;
6   import eu.fbk.utils.core.FrequencyHashSet;
7   import org.mapdb.SortedTableMap;
8   import org.slf4j.Logger;
9   import org.slf4j.LoggerFactory;
10  
11  import java.io.BufferedReader;
12  import java.io.File;
13  import java.io.FileReader;
14  import java.io.IOException;
15  import java.net.URL;
16  import java.util.*;
17  
18  
19  public class ModelHelper {
20  
21      private static final Logger LOGGER = LoggerFactory.getLogger(ModelHelper.class);
22  
23      public static void main(String[] args) {
24  
25          File conllFile = new File("/Volumes/Dati/Resources/ud-treebanks-v2.1/UD_Italian/it-ud-train.conllu");
26  
27          Set<String> detAdj = new HashSet<>();
28          URL adjResource = Resources.getResource("det-adj.txt");
29          try {
30              for (String line : Resources.readLines(adjResource, Charsets.UTF_8)) {
31                  line = line.trim();
32                  if (line.length() == 0) {
33                      continue;
34                  }
35                  detAdj.add(line);
36              }
37  
38          } catch (IOException e) {
39              e.printStackTrace();
40          }
41  
42          DigiMorph digiMorph = new DigiMorph();
43          Map<String, FrequencyHashSet<String>> formToFeats = new HashMap<>();
44          Map<String, Map<String, FrequencyHashSet<String>>> formToForms = new HashMap<>();
45  //        HashMultimap<String, String> formToForms = HashMultimap.create();
46  
47          try {
48              BufferedReader reader = new BufferedReader(new FileReader(conllFile));
49  
50              List<String> forms = new ArrayList<>();
51              List<String> feats = new ArrayList<>();
52  
53              String line;
54              while ((line = reader.readLine()) != null) {
55                  line = line.trim();
56                  if (line.startsWith("#")) {
57                      continue;
58                  }
59  
60                  String[] parts = line.split("\t");
61                  if (parts.length < 10) {
62                      continue;
63                  }
64  
65                  forms.add(parts[1]);
66                  feats.add(parts[5]);
67              }
68  
69              List<String> morpho = digiMorph.getMorphology(forms);
70  
71              for (int i = 0; i < morpho.size(); i++) {
72                  String mor = morpho.get(i);
73                  String fea = feats.get(i);
74                  String form = forms.get(i);
75  
76                  String[] words = mor.split("[\\s/]+");
77                  if (words.length > 2) {
78                      continue;
79                  }
80                  for (String word : words) {
81                      String[] parts = word.split("\\+");
82  
83                      if (parts.length < 2) {
84                          continue;
85                      }
86  
87                      if (detAdj.contains(parts[0])) {
88                          continue;
89                      }
90  
91                      String subToken = word.replaceAll("^[^~]*~", "");
92                      subToken = subToken.replaceAll("^[^+]*\\+", "");
93  
94                      formToFeats.putIfAbsent(subToken, new FrequencyHashSet<>());
95                      formToFeats.get(subToken).add(fea);
96                      formToForms.putIfAbsent(subToken, new HashMap<>());
97                      formToForms.get(subToken).putIfAbsent(form, new FrequencyHashSet<>());
98                      formToForms.get(subToken).get(form).add(fea);
99                  }
100             }
101 
102             reader.close();
103         } catch (Exception e) {
104             e.printStackTrace();
105             System.exit(1);
106         }
107 
108         HashMap<String, String> uMap = new HashMap<>();
109         uMap.put("v", "VERB");
110         uMap.put("adv", "ADV");
111         uMap.put("adj", "ADJ");
112         uMap.put("n", "NOUN");
113 
114         Set<String> featsList = new HashSet<>();
115 //                Map<String, String> featMap = new HashMap<>();
116 
117         SortedTableMap<String, String> map = digiMorph.getMap();
118         Iterator<String> gmIterator = map.keyIterator();
119         while (gmIterator.hasNext()) {
120             String key = gmIterator.next();
121             String value = map.get(key).trim();
122             String[] words = value.split("[\\s/]+");
123             for (String word : words) {
124                 String[] parts = word.split("\\+");
125 
126                 if (parts.length < 2) {
127                     continue;
128                 }
129                 String ePos = parts[1];
130                 if (ePos.length() == 0) {
131                     continue;
132                 }
133                 if (!uMap.keySet().contains(ePos)) {
134                     continue;
135                 }
136 
137                 // Feats
138                 String subToken = word.replaceAll("^[^~]*~", "");
139                 subToken = subToken.replaceAll("^[^+]*\\+", "");
140                 featsList.add(subToken);
141 
142 //                String token = key.toLowerCase();
143 //                String lemma = parts[0].toLowerCase();
144 //                String reverse_token = new StringBuilder(token).reverse().toString();
145 //                LinkedList<String> features = new LinkedList<>();
146 //                features.add(lemma);
147 //                features.add(token);
148             }
149         }
150 
151         for (String feats : featsList) {
152             System.out.println(feats);
153             if (formToFeats.get(feats) != null) {
154                 String mostFrequent = formToFeats.get(feats).mostFrequent();
155                 System.out.println("Most frequent: " + mostFrequent);
156                 System.out.println("Frequency: " + formToFeats.get(feats).get(mostFrequent) + "/" + formToFeats.get(feats).sum());
157                 System.out.println(formToFeats.get(feats));
158                 System.out.println(formToForms.get(feats));
159             }
160             System.out.println();
161         }
162 
163 //        System.out.println(featsList);
164 //        System.out.println(featsList.size());
165     }
166 }