1   package eu.fbk.dh.tint.digimorph;
2   
3   import com.google.common.collect.Lists;
4   import com.google.common.io.Resources;
5   import org.apache.commons.csv.CSVFormat;
6   import org.apache.commons.csv.CSVRecord;
7   import org.mapdb.Serializer;
8   import org.mapdb.SortedTableMap;
9   import org.mapdb.volume.MappedFileVol;
10  import org.mapdb.volume.Volume;
11  import org.slf4j.Logger;
12  import org.slf4j.LoggerFactory;
13  
14  import java.io.File;
15  import java.io.FileReader;
16  import java.io.IOException;
17  import java.io.Reader;
18  import java.nio.file.Files;
19  import java.util.*;
20  import java.util.concurrent.*;
21  
22  /**
23   * @author Giovanni Moretti at Digital Humanities group at FBK.
24   * @version 0.4a
25   */
26  public class DigiMorph {
27  
28      String model_path = "";
29      ExecutorService executor = null;
30      List<Future<List<String>>> futures = null;
31  
32      Set<Callable<List<String>>> callables = new HashSet<Callable<List<String>>>();
33      Volume volume = null;
34      SortedTableMap<String, String> map = null;
35  
36      private static final Logger LOGGER = LoggerFactory.getLogger(DigiMorph.class);
37  
38  
39      public static String getVersion() {
40          return DigiMorph.class.getPackage().getImplementationTitle() + "\n"
41                  + DigiMorph.class.getPackage().getSpecificationVendor() + " - "
42                  + DigiMorph.class.getPackage().getImplementationVendor() + "\n"
43                  + "Version: " + DigiMorph.class.getPackage().getSpecificationVersion();
44      }
45  
46      public DigiMorph() {
47          this(null);
48      }
49  
50      public DigiMorph(String model_path) {
51          if (model_path == null) {
52              try {
53                  File file = File.createTempFile("mapdb", "mapdb");
54                  file.deleteOnExit();
55                  byte[] bytes = Resources.toByteArray(Resources.getResource("italian.db"));
56                  Files.write(file.toPath(), bytes);
57                  model_path = file.getAbsolutePath();
58              } catch (IOException e) {
59                  e.printStackTrace();
60              }
61          }
62  
63          this.model_path = model_path;
64          volume = MappedFileVol.FACTORY.makeVolume(model_path, true);
65          this.map = SortedTableMap.open(volume, Serializer.STRING, Serializer.STRING);
66  
67      }
68  
69      public SortedTableMap<String, String> getMap() {
70          return map;
71      }
72  
73      /**
74       * @param token_list list of string containing words.
75       * @return list of string containing the results of the Morphological analyzer.
76       * @author Giovanni Moretti
77       * @version 0.42a
78       */
79  
80      synchronized public List<String> getMorphology(List<String> token_list) {
81          List<String> results = new LinkedList<>();
82          int threadsNumber = Runtime.getRuntime().availableProcessors();
83          List<List<String>> parts = Lists.partition(token_list, (token_list.size() / threadsNumber) + 1);
84  
85          if (token_list.size() > 0) {
86              executor = Executors.newFixedThreadPool(parts.size());
87          } else {
88              LOGGER.warn("No tokens to the morphological analyzer");
89              return results;
90          }
91  
92          callables = new LinkedHashSet<>();
93  
94          for (int pts = 0; pts < parts.size(); pts++) {
95              callables.add(new DigiMorph_Analizer(parts.get(pts), map));
96          }
97  
98          try {
99  
100             futures = executor.invokeAll(callables);
101 
102             executor.shutdown();
103             executor.awaitTermination(Integer.MAX_VALUE, TimeUnit.SECONDS);
104 
105             executor.shutdownNow();
106         } catch (Exception e) {
107             e.printStackTrace();
108         }
109 
110         try {
111             for (Future<List<String>> future : futures) {
112                 List<String> stringList = future.get();
113                 results.addAll(stringList);
114             }
115         } catch (Exception e) {
116             e.printStackTrace();
117         }
118 //        for (int pts = 0; pts < parts.size(); pts++) {
119 //            parts.get(pts).clear();
120 //        }
121 
122         return results;
123     }
124 
125 //    Map<String, String> mapcodgram = new HashMap<String, String>();
126 //    Map<String, String> mapcodfless = new HashMap<String, String>();
127 
128     /**
129      * This method creates or re-creates the db file with the morphology forms used by the analyzer
130      *
131      * @param csv_path - String contains the tsv file path
132      */
133 
134     public static void re_train(File csv_path, File output, boolean include_lemma) {
135         if (output.exists()) {
136             output.delete();
137         }
138         Volume volume = MappedFileVol.FACTORY.makeVolume(output.getAbsolutePath(), false);
139         SortedTableMap.Sink<String, String> sink =
140                 SortedTableMap.create(
141                         volume,
142                         Serializer.STRING, // key serializer
143                         Serializer.STRING   // value serializer
144                 )
145                         .pageSize(64 * 1024)
146                         .nodeSize(8)
147                         .createFromSink();
148 
149         SortedMap<String, String> map = new TreeMap<String, String>();
150         try {
151             Reader in = new FileReader(csv_path);
152             Iterable<CSVRecord> records = CSVFormat.TDF.withIgnoreEmptyLines().withQuote('≥').parse(in);
153             for (CSVRecord record : records) {
154 
155                 String feature = record.get(2);
156                 String lemma = record.get(1);
157                 String forma = record.get(0).toLowerCase();
158                 if (!map.containsKey(forma)) {
159                     map.put(forma, "");
160                 }
161                 if (lemma == null) {
162                     lemma = "";
163                 }
164 
165                 if (include_lemma) {
166                     map.put(forma, map.get(forma) + " " + lemma + "+" + feature);
167                 } else {
168                     map.put(forma, map.get(forma) + " " + feature);
169                 }
170             }
171 
172             for (Map.Entry<String, String> e : map.entrySet()) {
173                 sink.put(e.getKey(), e.getValue());
174             }
175 
176         } catch (Exception e) {
177             e.printStackTrace();
178         }
179 
180         SortedTableMap<String, String> stmap = sink.create();
181         //volume.close();
182 
183         System.out.println("done");
184 
185     }
186 }