1   package eu.fbk.dh.tint.tokenizer.annotators;
2   
3   import edu.stanford.nlp.ling.CoreAnnotation;
4   import edu.stanford.nlp.ling.CoreAnnotations;
5   import edu.stanford.nlp.ling.CoreLabel;
6   import edu.stanford.nlp.pipeline.Annotation;
7   import edu.stanford.nlp.pipeline.Annotator;
8   import edu.stanford.nlp.util.ArrayCoreMap;
9   import edu.stanford.nlp.util.CoreMap;
10  import eu.fbk.dh.tint.tokenizer.ItalianTokenizer;
11  import eu.fbk.dh.tint.tokenizer.models.ItalianTokenizerModel;
12  import eu.fbk.utils.core.PropertiesUtils;
13  import eu.fbk.utils.corenlp.Utils;
14  
15  import java.io.File;
16  import java.util.*;
17  
18  /**
19   * Created by alessio on 14/07/16.
20   */
21  
22  public class ItalianTokenizerAnnotator implements Annotator {
23  
24      boolean newlineIsSentenceBreak, tokenizeOnlyOnSpace, ssplitOnlyOnNewLine;
25      ItalianTokenizer tokenizer;
26  
27      public ItalianTokenizerAnnotator(String annotatorName, Properties props) {
28          String modelFile = props.getProperty(annotatorName + ".model", null);
29  
30          newlineIsSentenceBreak = PropertiesUtils
31                  .getBoolean(props.getProperty(annotatorName + ".newlineIsSentenceBreak"), true);
32          tokenizeOnlyOnSpace = PropertiesUtils
33                  .getBoolean(props.getProperty(annotatorName + ".tokenizeOnlyOnSpace"), false);
34          ssplitOnlyOnNewLine = PropertiesUtils
35                  .getBoolean(props.getProperty(annotatorName + ".ssplitOnlyOnNewLine"), false);
36          if (ssplitOnlyOnNewLine) {
37              newlineIsSentenceBreak = true;
38          }
39  
40          File model = null;
41          if (modelFile != null) {
42              model = new File(modelFile);
43          }
44          tokenizer = ItalianTokenizerModel.getInstance(model).getTokenizer();
45      }
46  
47      /**
48       * Given an Annotation, perform a task on this Annotation.
49       *
50       * @param annotation
51       */
52      @Override public void annotate(Annotation annotation) {
53          String text = annotation.get(CoreAnnotations.TextAnnotation.class);
54          List<List<CoreLabel>> sTokens = tokenizer
55                  .parse(text, newlineIsSentenceBreak, tokenizeOnlyOnSpace, ssplitOnlyOnNewLine);
56          Utils.addBasicAnnotations(annotation, sTokens, text);
57      }
58  
59      @Override public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
60          return new HashSet<>(Arrays.asList(
61                  CoreAnnotations.TextAnnotation.class,
62                  CoreAnnotations.TokensAnnotation.class,
63                  CoreAnnotations.CharacterOffsetBeginAnnotation.class,
64                  CoreAnnotations.CharacterOffsetEndAnnotation.class,
65                  CoreAnnotations.BeforeAnnotation.class,
66                  CoreAnnotations.AfterAnnotation.class,
67                  CoreAnnotations.TokenBeginAnnotation.class,
68                  CoreAnnotations.TokenEndAnnotation.class,
69                  CoreAnnotations.PositionAnnotation.class,
70                  CoreAnnotations.IndexAnnotation.class,
71                  CoreAnnotations.OriginalTextAnnotation.class,
72                  CoreAnnotations.ValueAnnotation.class,
73                  CoreAnnotations.SentencesAnnotation.class,
74                  CoreAnnotations.SentenceIndexAnnotation.class
75          ));
76      }
77  
78      @Override public Set<Class<? extends CoreAnnotation>> requires() {
79          return Collections.emptySet();
80      }
81  
82  }