1   package eu.fbk.dh.tint.resources.ner;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.io.Files;
5   import eu.fbk.twm.index.FormPageSearcher;
6   import eu.fbk.twm.index.PageFormSearcher;
7   import eu.fbk.twm.index.util.FreqSetSearcher;
8   import eu.fbk.utils.core.CommandLine;
9   import org.slf4j.LoggerFactory;
10  
11  import java.io.BufferedWriter;
12  import java.io.File;
13  import java.io.FileWriter;
14  import java.util.HashSet;
15  import java.util.List;
16  
17  /**
18   * Created by alessio on 12/05/16.
19   */
20  
21  public class LoadWikipedia {
22  
23      private static final double MIN_PF_FREQ = 0.02;
24      private static final double MIN_FREQ = 0.8;
25  
26      public static void main(String[] args) {
27          final CommandLine cmd = CommandLine
28                  .parser()
29                  .withName("ner-extractor")
30                  .withHeader("Extractor for PER/ORG/LOC")
31                  .withOption("f", "form-page-path", "Form-page path from Airpedia", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true,
32                          false, true)
33                  .withOption("p", "page-form-path", "Page-form path from Airpedia", "DIR", CommandLine.Type.DIRECTORY_EXISTING, true,
34                          false, true)
35                  .withOption("l", "page-list", "Page-list from Airpedia", "FILE", CommandLine.Type.FILE_EXISTING, true, false, true)
36                  .withOption("o", "output", "Output file", "FILE", CommandLine.Type.FILE, true, false, true)
37                  .withOption(null, "label", "Label (PER, ORG, LOC, ...)", "LABEL", CommandLine.Type.STRING, true, false, true)
38                  .withLogger(LoggerFactory.getLogger("eu.fbk.fssa")).parse(args);
39  
40          final File formPagePath = cmd.getOptionValue("f", File.class);
41          final File pageFormPath = cmd.getOptionValue("p", File.class);
42          final File listPath = cmd.getOptionValue("l", File.class);
43          final File outputPath = cmd.getOptionValue("o", File.class);
44          final String label = cmd.getOptionValue("label", String.class);
45  
46          try {
47              FormPageSearcher formPageSearcher = new FormPageSearcher(formPagePath.getAbsolutePath());
48              PageFormSearcher pageFormSearcher = new PageFormSearcher(pageFormPath.getAbsolutePath());
49  
50              BufferedWriter writer = new BufferedWriter(new FileWriter(outputPath));
51  
52              List<String> pages = Files.readLines(listPath, Charsets.UTF_8);
53              HashSet<String> pageSet = new HashSet<>();
54              pageSet.addAll(pages);
55  
56              for (String page : pages) {
57                  page = page.trim();
58                  if (page.length() == 0) {
59                      continue;
60                  }
61  
62                  FreqSetSearcher.Entry[] entries = pageFormSearcher.search(page);
63  
64                  for (FreqSetSearcher.Entry entry : entries) {
65                      if (entry.getFreq() < MIN_PF_FREQ) {
66                          continue;
67                      }
68  
69                      String form = entry.getValue();
70                      FreqSetSearcher.Entry[] pEntries = formPageSearcher.search(form);
71  
72                      double isThis = 0;
73  
74                      for (FreqSetSearcher.Entry pEntry : pEntries) {
75                          if (pageSet.contains(pEntry.getValue())) {
76                              isThis += pEntry.getFreq();
77                          }
78                      }
79  
80                      if (isThis < MIN_FREQ) {
81                          continue;
82                      }
83  
84                      writer.append(label).append(" ").append(form).append("\n");
85                  }
86              }
87  
88              formPageSearcher.close();
89              pageFormSearcher.close();
90              writer.close();
91  
92          } catch (Exception e) {
93              e.printStackTrace();
94          }
95      }
96  }