1   package eu.fbk.dh.tint.tokenizer.util;
2   
3   import com.google.common.base.Charsets;
4   import edu.stanford.nlp.ling.CoreAnnotations;
5   import edu.stanford.nlp.pipeline.Annotation;
6   import edu.stanford.nlp.pipeline.StanfordCoreNLP;
7   import edu.stanford.nlp.util.CoreMap;
8   import eu.fbk.utils.core.CommandLine;
9   import org.slf4j.Logger;
10  import org.slf4j.LoggerFactory;
11  
12  import java.io.BufferedWriter;
13  import java.io.File;
14  import java.io.FileWriter;
15  import java.nio.file.Files;
16  import java.util.List;
17  import java.util.Properties;
18  
19  /**
20   * Created by alessio on 22/07/16.
21   */
22  
23  public class SplitSentences {
24  
25      private static final Logger LOGGER = LoggerFactory.getLogger(SplitSentences.class);
26  
27      public static void main(String[] args) {
28          try {
29              final CommandLine cmd = CommandLine
30                      .parser()
31                      .withName("./annotate-sentences")
32                      .withHeader("Annotate sentences")
33                      .withOption("i", "input", "Input file", "FILE",
34                              CommandLine.Type.FILE_EXISTING, true, false, true)
35                      .withOption("o", "output", "Output file", "FILE",
36                              CommandLine.Type.FILE_EXISTING, true, false, true)
37                      .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
38  
39              File input = cmd.getOptionValue("input", File.class);
40              File output = cmd.getOptionValue("output", File.class);
41  
42              String text = new String(Files.readAllBytes(input.toPath()), Charsets.UTF_8);
43              BufferedWriter writer = new BufferedWriter(new FileWriter(output));
44  
45              Properties props = new Properties();
46              props.setProperty("annotators", "ita_toksent");
47              props.setProperty("customAnnotatorClass.ita_toksent",
48                      "eu.fbk.dh.tint.tokenizer.annotators.ItalianTokenizerAnnotator");
49  
50              StanfordCoreNLP ITApipeline = new StanfordCoreNLP(props);
51              Annotation annotation = new Annotation(text);
52              ITApipeline.annotate(annotation);
53  
54              List<CoreMap> sents = annotation.get(CoreAnnotations.SentencesAnnotation.class);
55              for (CoreMap thisSent : sents) {
56                  writer.append(thisSent.get(CoreAnnotations.TextAnnotation.class)).append("\n");
57              }
58  
59              writer.close();
60  
61          } catch (Exception e) {
62              CommandLine.fail(e);
63          }
64      }
65  }