package eu.fbk.dh.tint.eval.morpho;

import com.google.common.base.Charsets;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import eu.fbk.utils.core.CommandLine;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedWriter;
import java.io.File;
import java.nio.file.Files;
import java.util.List;
import java.util.Properties;

/**
 * Annotates a plain-text Italian file (one sentence per line, whitespace-tokenized) with
 * part-of-speech tags, morphological analyses and lemmas, and writes one token per line as
 * tab-separated token/POS/lemma triples, with a blank line between sentences.
 *
 * Created by alessio on 21/07/16.
 */
public class AnnotateLemma {

    private static final Logger LOGGER = LoggerFactory.getLogger(AnnotateLemma.class);

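    // Command-line entry point: reads the input text, runs the Italian pipeline and writes the result.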
    public static void main(String[] args) {
        try {
            final CommandLine cmd = CommandLine
                    .parser()
                    .withName("./annotate-lemmas")
                    .withHeader("Annotate lemmas")
                    .withOption("i", "input", "Input file", "FILE",
                            CommandLine.Type.FILE_EXISTING, true, false, true)
                    .withOption("o", "output", "Output file", "FILE",
                            CommandLine.Type.FILE, true, false, true)
                    .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);

            File input = cmd.getOptionValue("input", File.class);
            File output = cmd.getOptionValue("output", File.class);

            String text = new String(Files.readAllBytes(input.toPath()), Charsets.UTF_8);
            BufferedWriter writer = Files.newBufferedWriter(output.toPath(), Charsets.UTF_8);

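            // The pipeline assumes pre-segmented input: tokens are split on whitespace and
            // each line of the input file is treated as one sentence, so tokenize/ssplit do
            // not re-segment the text before POS tagging, morphology and lemmatization.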
            Properties props = new Properties();
            props.setProperty("annotators", "tokenize, ssplit, pos, ita_morpho, ita_lemma");
            props.setProperty("tokenize.whitespace", "true");
            props.setProperty("ssplit.eolonly", "true");

//            props.setProperty("ita_toksent.newlineIsSentenceBreak", "1");

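            // Model paths are machine-specific; point them to the local tagger and morphology models.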
            props.setProperty("pos.model", "/Users/alessio/Documents/Resources/ita-models/italian5.tagger");

            props.setProperty("customAnnotatorClass.ita_toksent",
                    "eu.fbk.dkm.pikes.tintop.ita.annotators.ItalianTokenizerAnnotator");
            props.setProperty("customAnnotatorClass.ita_lemma", "eu.fbk.dh.digimorph.annotator.DigiLemmaAnnotator");
            props.setProperty("customAnnotatorClass.ita_morpho", "eu.fbk.dh.digimorph.annotator.DigiMorphAnnotator");
            props.setProperty("ita_morpho.model", "/Users/alessio/Documents/Resources/ita-models/italian.db");

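            // Run the whole pipeline once over the full text of the input file.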
            StanfordCoreNLP itaPipeline = new StanfordCoreNLP(props);
            Annotation annotation = new Annotation(text);
            itaPipeline.annotate(annotation);

            System.out.println(itaPipeline.timingInformation());

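            // Emit a CoNLL-like TSV: one "token<TAB>POS<TAB>lemma" line per token
            // (whitespace inside tokens is stripped), with a blank line after each sentence.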
            List<CoreMap> sents = annotation.get(CoreAnnotations.SentencesAnnotation.class);
            for (CoreMap thisSent : sents) {
                List<CoreLabel> tokens = thisSent.get(CoreAnnotations.TokensAnnotation.class);
                for (CoreLabel token : tokens) {
                    writer.append(token.originalText().replaceAll("\\s+", ""))
                            .append("\t")
                            .append(token.get(CoreAnnotations.PartOfSpeechAnnotation.class))
                            .append("\t")
                            .append(token.get(CoreAnnotations.LemmaAnnotation.class))
                            .append("\n");
                }
                writer.append("\n");
            }

            writer.close();

        } catch (Exception e) {
            CommandLine.fail(e);
        }

    }
}