1   /*
2    * Copyright (2013) Fondazione Bruno Kessler (http://www.fbk.eu/)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *   http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package eu.fbk.dh.tint.tokenizer;
18  
19  import edu.stanford.nlp.ling.CoreLabel;
20  import edu.stanford.nlp.process.CoreLabelTokenFactory;
21  import eu.fbk.dh.tint.tokenizer.token.CharacterTable;
22  import eu.fbk.dh.tint.tokenizer.token.Token;
23  import eu.fbk.dh.tint.tokenizer.token.TokenGroup;
24  import eu.fbk.utils.core.PropertiesUtils;
25  import org.ahocorasick.trie.Emit;
26  import org.ahocorasick.trie.Trie;
27  import org.apache.commons.lang.mutable.MutableBoolean;
28  import org.slf4j.Logger;
29  import org.slf4j.LoggerFactory;
30  import org.w3c.dom.Document;
31  import org.w3c.dom.Element;
32  import org.w3c.dom.Node;
33  import org.w3c.dom.NodeList;
34  
35  import javax.annotation.Nullable;
36  import javax.xml.parsers.DocumentBuilder;
37  import javax.xml.parsers.DocumentBuilderFactory;
38  import javax.xml.xpath.XPath;
39  import javax.xml.xpath.XPathConstants;
40  import javax.xml.xpath.XPathExpression;
41  import javax.xml.xpath.XPathFactory;
42  import java.io.*;
43  import java.util.*;
44  import java.util.regex.Matcher;
45  import java.util.regex.Pattern;
46  
47  /**
48   * Written by Alessio Palmero Aprosio
49   * <p>
50   * partially based on HardTokenizer, part of the twm-lib package,
51   * written by Claudio Giuliano and Alessio Palmero Aprosio.
52   */
53  public class ItalianTokenizer {
54  
55      /**
56       * Define a static logger variable so that it references the
57       * Logger instance named <code>HardTokenizer</code>.
58       */
59      static Logger logger = LoggerFactory.getLogger(ItalianTokenizer.class);
60      static Pattern spaceTokenizer = Pattern.compile("\\s+");
61  
62      private Trie trie;
63      private Set<Integer> splittingChars = new HashSet<>();
64      private Set<Integer> sentenceChars = new HashSet<>();
65      private Map<Integer, String> normalizedChars = new HashMap<>();
66      private Map<String, String> normalizedStrings = new HashMap<>();
67      private Map<Pattern, Integer> expressions = new HashMap<>();
68  
69      CoreLabelTokenFactory factory = new CoreLabelTokenFactory();
70  
71      public ItalianTokenizer() {
72          this(null);
73      }
74  
75      public ItalianTokenizer(@Nullable File settingFile) {
76          Trie.TrieBuilder builder = Trie.builder().removeOverlaps();
77  
78          InputStream stream = null;
79          if (settingFile != null) {
80              try {
81                  stream = new FileInputStream(settingFile);
82              } catch (FileNotFoundException e) {
83                  // continue
84              }
85          }
86          if (stream == null) {
87              stream = this.getClass().getResourceAsStream("/token-settings.xml");
88          }
89  
90          logger.trace("Loading model");
91          try {
92              DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
93              DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
94              XPathFactory xPathfactory = XPathFactory.newInstance();
95              XPath xpath = xPathfactory.newXPath();
96  
97              XPathExpression expr;
98              NodeList nl;
99              int count;
100 
101             Document doc = dBuilder.parse(stream);
102             doc.getDocumentElement().normalize();
103 
104             // Normalization rules
105             expr = xpath.compile("/settings/normalization/char");
106             nl = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
107             for (int i = 0; i < nl.getLength(); i++) {
108                 Node item = nl.item(i);
109                 Element element = (Element) item;
110                 String hexCode = element.getAttribute("hexcode");
111                 String content = element.getTextContent();
112 
113                 // Bad: need fix
114                 if (content.equals("`")) {
115                     content = "'";
116                 }
117 
118                 int num = Integer.parseInt(hexCode, 16);
119                 if (content.length() == 0) {
120                     continue;
121                 }
122                 normalizedChars.put(num, content);
123             }
124             logger.info("Loaded {} normalization rules", normalizedChars.size());
125 
126             // end sentence chars
127             expr = xpath.compile("/settings/sentenceSplitting/char");
128             nl = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
129             for (int i = 0; i < nl.getLength(); i++) {
130                 Node item = nl.item(i);
131                 Element element = (Element) item;
132                 String charID = element.getAttribute("id");
133                 sentenceChars.add(Integer.parseInt(charID));
134             }
135             logger.info("Loaded {} sentence splitting rules", sentenceChars.size());
136 
137             // splitting rules
138             expr = xpath.compile("/settings/tokenSplitting/char");
139             nl = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
140             for (int i = 0; i < nl.getLength(); i++) {
141                 Node item = nl.item(i);
142                 Element element = (Element) item;
143                 String charID = element.getAttribute("id");
144                 splittingChars.add(Integer.parseInt(charID));
145             }
146             logger.info("Loaded {} token splitting rules", splittingChars.size());
147 
148             // expressions
149             expr = xpath.compile("/settings/expressions/expression");
150             nl = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
151             StringBuilder b = new StringBuilder();
152             b.append("(");
153             boolean first = true;
154             count = 0;
155             for (int i = 0; i < nl.getLength(); i++) {
156                 Node item = nl.item(i);
157                 Element element = (Element) item;
158                 String regExp = element.getAttribute("find");
159                 boolean merge = PropertiesUtils.getBoolean(element.getAttribute("merge"), true);
160                 Integer group = PropertiesUtils.getInteger(element.getAttribute("get"), 1);
161                 if (merge) {
162                     if (!first) {
163                         b.append("|");
164                     }
165                     b.append(regExp);
166                     count++;
167                     first = false;
168                 } else {
169                     expressions.put(Pattern.compile(regExp), group);
170                     count++;
171                 }
172             }
173             b.append(")");
174             expressions.put(Pattern.compile(b.toString()), 1);
175             logger.info("Loaded {} regular expressions", count);
176 
177             // abbreviations
178             expr = xpath.compile("/settings/abbreviations/abbreviation");
179             nl = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
180             count = 0;
181             for (int i = 0; i < nl.getLength(); i++) {
182                 Node item = nl.item(i);
183                 String abbr = item.getTextContent();
184                 abbr = getString(tokenArray(abbr));
185                 builder.addKeyword(" " + abbr + " ");
186                 count++;
187             }
188             logger.info("Loaded {} abbreviations", count);
189 
190         } catch (Exception e) {
191             e.printStackTrace();
192         }
193 
194         trie = builder.build();
195     }
196 
197     public TokenGroup tokenArray(String text) {
198 
199         if (text.length() == 0) {
200             return new TokenGroup();
201         }
202 
203 //        List<Token> list = new ArrayList<Token>();
204         TokenGroup tokenGroup = new TokenGroup();
205 
206         Character currentChar;
207         Character previousChar = null;
208         int start = 0;
209         Boolean isCurrentCharLetterOrDigit;
210         Boolean isPreviousCharLetterOrDigit;
211 
212         MutableBoolean isNewLine = new MutableBoolean(false);
213         Token lastToken = new Token(0, 0, "");
214 
215         //logger.debug("0\t" + (int) previousChar + "\t<" + previousChar + ">");
216         for (int i = 0; i < text.length(); i++) {
217 
218             currentChar = text.charAt(i);
219             isCurrentCharLetterOrDigit = Character.isLetterOrDigit(currentChar);
220             isPreviousCharLetterOrDigit = previousChar != null && Character.isLetterOrDigit(previousChar);
221 
222             if (isCurrentCharLetterOrDigit) {
223                 if (!isPreviousCharLetterOrDigit) {
224                     start = i;
225                 }
226             } else {
227                 if (isPreviousCharLetterOrDigit) {
228                     String substring = text.substring(start, i);
229                     addToken(tokenGroup, start, i, substring, isNewLine, lastToken);
230 
231                     if (!splittingChars.contains(currentChar.hashCode())) {
232                         String charString = new String(new char[]{currentChar});
233                         addToken(tokenGroup, i, i + 1, charString, isNewLine, lastToken);
234                     }
235                 } else {
236                     if (!splittingChars.contains(currentChar.hashCode())) {
237                         String charString = new String(new char[]{currentChar});
238                         addToken(tokenGroup, i, i + 1, charString, isNewLine, lastToken);
239                     }
240                 }
241             }
242 
243             if (sentenceChars.contains(currentChar.hashCode())) {
244                 isNewLine.setValue(true);
245             }
246 
247             previousChar = currentChar;
248         }
249         if (Character.isLetterOrDigit(previousChar)) {
250             String substring = text.substring(start, text.length());
251             addToken(tokenGroup, start, text.length(), substring, isNewLine, lastToken);
252         }
253 
254         return tokenGroup;
255     }
256 
257     private void addToken(TokenGroup group, int start, int end, String charString, MutableBoolean isNewLine,
258                           Token lastToken) {
259         Token token = new Token(start, end, charString);
260         if (isNewLine.booleanValue()) {
261             group.addNewLine(start);
262             isNewLine.setValue(false);
263         }
264         token.setPreceedBySpace(start - lastToken.getEnd() > 0);
265 
266         int spaces = 0;
267         if (lastToken != null && lastToken.getEnd() != 0) {
268             int endLast = lastToken.getEnd();
269             spaces = lastToken.getSpaceOffset();
270             if (start == endLast) {
271                 spaces++;
272             } else {
273                 spaces -= Math.max(0, start - endLast - 1);
274             }
275         }
276         token.setSpaceOffset(spaces);
277 
278         // Normalization
279         String n;
280         if (charString.length() == 1) {
281             int c = charString.charAt(0);
282             n = normalizedChars.get(c);
283         } else {
284             n = normalizedStrings.get(charString);
285         }
286         if (n != null) {
287             token.setNormForm(n);
288         }
289 
290         lastToken.updateByToken(token);
291         group.addToken(token);
292     }
293 
294     public static String getString(TokenGroup tokenGroup) {
295         StringBuilder buffer = new StringBuilder();
296         ArrayList<Token> tokens = tokenGroup.getSupport();
297 
298         // todo: check this
299         if (tokens.size() > 0) {
300             for (int i = 0; i < tokens.size() - 1; i++) {
301                 Token token = tokens.get(i);
302                 buffer.append(token.getForm()).append(CharacterTable.SPACE);
303             }
304             buffer.append(tokens.get(tokens.size() - 1).getForm());
305         }
306         return buffer.toString();
307     }
308 
309 //    public boolean isSeparatorChar(Character ch) {
310 //        if (splittingChars.size() > 0) {
311 //            return splittingChars.contains(ch.hashCode());
312 //        } else if (ch == CharacterTable.SPACE) {
313 //            return true;
314 //        } else if (ch == CharacterTable.CARRIADGE_RETURN) {
315 //            return true;
316 //        } else if (ch == CharacterTable.LINE_FEED) {
317 //            return true;
318 //        } else if (ch == CharacterTable.HORIZONTAL_TABULATION) {
319 //            return true;
320 //        } else if (ch == CharacterTable.FORM_FEED) {
321 //            return true;
322 //        }
323 //
324 //        return false;
325 //    }
326 
327     public List<List<CoreLabel>> parse(String text) {
328         return parse(text, true, false, false);
329     }
330 
331     public List<List<CoreLabel>> parse(String text, boolean newlineIsSentenceBreak, boolean tokenizeOnlyOnSpace,
332                                        boolean ssplitOnlyOnNewLine) {
333 
334         List<List<CoreLabel>> ret = new ArrayList<>();
335         List<CoreLabel> temp = new ArrayList<>();
336 
337         int index = 0;
338 
339         if (tokenizeOnlyOnSpace) {
340 
341             int nextStart = 0;
342             boolean lastIsWhitespace = true;
343             int newLineCount = 0;
344 
345             for (int i = 0; i < text.length(); i++) {
346                 char currentChar = text.charAt(i);
347                 boolean isLast = i == text.length() - 1;
348                 if (Character.isWhitespace(currentChar) || isLast) {
349                     if (!lastIsWhitespace) {
350 //                        System.out.println("---" + text.substring(nextStart, i) + "---" + newLineCount);
351 
352                         int j = i;
353                         if (isLast) {
354                             j++;
355                         }
356                         String word = text.substring(nextStart, j);
357                         CoreLabel clToken = factory.makeToken(word, word, nextStart, j - nextStart);
358                         clToken.setIndex(++index);
359 
360                         if (newlineIsSentenceBreak && newLineCount > 0) {
361                             if (temp.size() > 0) {
362                                 ret.add(temp);
363                                 index = 0; // index must be zeroed to meet Stanford policy
364                                 temp = new ArrayList<>();
365                             }
366                         }
367 
368                         temp.add(clToken);
369 
370                         if (!ssplitOnlyOnNewLine) {
371                             if (word.length() == 1 && sentenceChars.contains((int) word.charAt(0))) {
372                                 ret.add(temp);
373                                 index = 0; // index must be zeroed to meet Stanford policy
374                                 temp = new ArrayList<>();
375                             }
376                         }
377 
378                         newLineCount = 0;
379                     }
380                     if (currentChar == '\n') {
381                         newLineCount++;
382                     }
383                     lastIsWhitespace = true;
384                 } else {
385                     if (lastIsWhitespace) {
386                         nextStart = i;
387                     }
388                     lastIsWhitespace = false;
389                 }
390             }
391 
392             if (temp.size() > 0) {
393                 ret.add(temp);
394             }
395         } else {
396             HashMap<Integer, Integer> mergeList = new HashMap<>();
397 
398             for (Pattern expression : expressions.keySet()) {
399                 int get = expressions.get(expression);
400                 Matcher matcher = expression.matcher(text);
401                 while (matcher.find()) {
402                     mergeList.put(matcher.start(get), matcher.end(get));
403                 }
404             }
405 
406             TokenGroup tokenGroup = tokenArray(text);
407             ArrayList<Token> tokens = tokenGroup.getSupport();
408 
409             if (tokens.size() == 0) {
410                 return ret;
411             }
412 
413             int offset = tokens.get(0).getStart();
414             String s = " " + getString(tokenGroup) + " ";
415 
416             Collection<Emit> emits = trie.parseText(s);
417             for (Emit emit : emits) {
418                 // Added -1 for compatibility with the "s" string
419 
420                 Token startToken = tokenGroup.getStartOffIndexes().get(emit.getStart() + 1 - 1 + offset);
421 //                Token endToken = tokenGroup.getEndOffIndexes().get(emit.getEnd() - 1 - 1 + offset);
422 
423                 Token endToken = null;
424                 int endOffset = 0;
425                 while (endToken == null) {
426                     endToken = tokenGroup.getEndOffIndexes().get(emit.getEnd() - endOffset - 1 + offset);
427                     endOffset++;
428                 }
429 
430                 if (newlineIsSentenceBreak) {
431                     String substring = text.substring(startToken.getStart(), endToken.getEnd());
432                     if (substring.contains("\n") || substring.contains("\r")) {
433                         continue;
434                     }
435                 }
436                 if (startToken != null && endToken != null) {
437                     mergeList.put(startToken.getStart(), endToken.getEnd());
438                 } else {
439                     logger.warn("Something is null! -- " + emit.toString());
440                 }
441             }
442 
443             Integer end = null;
444             Integer start = null;
445 
446             Set<Integer> newLines = tokenGroup.getNewLines();
447 
448             for (int i = 0; i < tokens.size(); i++) {
449                 Token token = tokens.get(i);
450                 boolean merging = false;
451 
452                 if (mergeList.containsKey(token.getStart()) || end != null) {
453                     merging = true;
454                     if (end == null) {
455                         end = mergeList.get(token.getStart());
456                     }
457                 }
458 
459                 if (merging && (end != null && token.getEnd() >= end)) {
460                     end = null;
461                     merging = false;
462                 }
463 
464                 if (token.getNormForm().equals("'")) {
465 
466                     Token prevToken = null,
467                             nextToken = null;
468                     if (i > 0) {
469                         prevToken = tokens.get(i - 1);
470                     }
471                     if (i < tokens.size() - 1) {
472                         nextToken = tokens.get(i + 1);
473                     }
474 
475                     // Example: l'economia
476                     if (prevToken != null &&
477                             nextToken != null &&
478                             !token.isPreceedBySpace() &&
479                             !nextToken.isPreceedBySpace() &&
480                             Character.isLetter(prevToken.getForm().charAt(prevToken.getForm().length() - 1)) &&
481                             Character.isLetter(nextToken.getForm().charAt(0))) {
482                         CoreLabel lastToken = temp.get(temp.size() - 1);
483                         start = lastToken.beginPosition();
484                         temp.remove(temp.size() - 1);
485                         index--;
486                     }
487 
488                     // Example: sta'
489                     else if (prevToken != null &&
490                             Character.isLetter(prevToken.getForm().charAt(prevToken.getForm().length() - 1)) &&
491                             !token.isPreceedBySpace() &&
492                             (nextToken == null || !nextToken.getNormForm().equals("'"))) {
493                         CoreLabel lastToken = temp.get(temp.size() - 1);
494                         start = lastToken.beginPosition();
495                         temp.remove(temp.size() - 1);
496                         index--;
497                     }
498 
499                     // Example: 'ndrangheta
500                     else if (nextToken != null &&
501                             Character.isLetter(nextToken.getForm().charAt(0)) &&
502                             !nextToken.isPreceedBySpace() &&
503                             (prevToken == null || !prevToken.getNormForm().equals("'"))) {
504                         merging = true;
505                     }
506                 }
507 
508                 if (merging) {
509                     if (start == null) {
510                         start = token.getStart();
511                     }
512                     continue;
513                 }
514 
515                 if (start == null) {
516                     start = token.getStart();
517                 } else {
518                     newLines.remove(token.getEnd() + 1);
519                 }
520 
521                 int finish = token.getEnd();
522                 String word = text.substring(start, finish);
523                 String normWord = word;
524 
525                 // todo: bad
526                 // solves https://github.com/dhfbk/tint/issues/3
527                 if (word.charAt(word.length() - 1) == '’' || word.charAt(word.length() - 1) == '`') {
528                     normWord = word.substring(0, word.length() - 1) + "'";
529                 }
530                 CoreLabel clToken = factory.makeToken(normWord, word, start, finish - start);
531                 clToken.setIndex(++index);
532 
533                 if (newlineIsSentenceBreak && newLines.contains(start)) {
534                     if (temp.size() > 0) {
535                         ret.add(temp);
536                         index = 0; // index must be zeroed to meet Stanford policy
537                         temp = new ArrayList<>();
538                     }
539                 }
540 
541                 temp.add(clToken);
542 
543                 if (!ssplitOnlyOnNewLine) {
544                     if (word.length() == 1 && sentenceChars.contains((int) word.charAt(0))) {
545                         ret.add(temp);
546                         index = 0; // index must be zeroed to meet Stanford policy
547                         temp = new ArrayList<>();
548                     }
549                 }
550 
551                 start = null;
552             }
553 
554             if (temp.size() > 0) {
555                 ret.add(temp);
556                 index = 0; // index must be zeroed to meet Stanford policy
557             }
558         }
559 
560         return ret;
561     }
562 
563     public static void main(String argv[]) throws IOException {
564 
565         ItalianTokenizer tokenizer = new ItalianTokenizer();
566 
567 //        byte[] file = Files.readAllBytes((new File("/Users/alessio/Desktop/milano.txt")).toPath());
568 //        String text = new String(file);
569         String text = "Clinton in testa nei sondaggi dopo l’«assoluzione» dell’Fbi sull’uso di un server di posta privato quando era Segretario di stato.";
570 //        text = "``Determinato, pronto a «fare tutto il necessario per mantenere la stabilità dei prezzi».''"
571 //                + " Ma anche allarmato per come le conseguenze del referendum britannico minacciano l’economia e i mercati europei."
572 //                + " Sono nato nel 200 S.p.A."
573 //                + " Il mio indirizzo e-mail è alessio@apnetwork.it."
574 //                + " Il blog è http://www.ziorufus.it e mi piace molto.";
575 //        text = "Questo è un test per una sigla qualsiasi tipo a.B.C. che non ha senso.";
576 //        text = "Milano (/miˈlano/ ascolta[?·info], in milanese: Milan[4], /miˈlãː/[5]) è una città italiana di 1 346 153 abitanti[2], capoluogo dell'omonima città metropolitana e della regione Lombardia, secondo comune italiano per numero di abitanti, tredicesimo comune dell'Unione europea e diciannovesimo del continente e, con l'agglomerato urbano, quarta area metropolitana più popolata d'Europa dopo Londra, Madrid e Parigi[6].\n"
577 //                + "\n"
578 //                + "Fondata dagli Insubri all'inizio del VI secolo a.C.[7], fu conquistata dai Romani nel 222 a.C.";
579 
580 //        System.out.println(text);
581 
582         long time = System.currentTimeMillis();
583         List<List<CoreLabel>> sentences = tokenizer.parse(text, true, true, true);
584         time = System.currentTimeMillis() - time;
585 
586         for (int i = 0; i < Math.min(10, sentences.size()); i++) {
587             List<CoreLabel> sentence = sentences.get(i);
588             for (CoreLabel token : sentence) {
589                 System.out.println(token.word() + " -- " + token.originalText() + " -- " + token.beginPosition());
590 
591             }
592             System.out.println();
593         }
594 
595         int sentenceSize = sentences.size();
596         int lastTokenIndex = sentences.get(sentenceSize - 1).get(sentences.get(sentenceSize - 1).size() - 1).index();
597         System.out.println("Length: " + text.length());
598         System.out.println("Time: " + time);
599         System.out.println("Sentences: " + sentenceSize);
600         System.out.println("Tokens: " + lastTokenIndex);
601     }
602 }