00001 package de.aitools.aq.invertedindex.usage;
00002
00003 import java.io.BufferedReader;
00004 import java.io.File;
00005 import java.io.FilenameFilter;
00006 import java.io.IOException;
00007 import java.io.InputStreamReader;
00008 import java.util.Calendar;
00009 import java.util.zip.ZipEntry;
00010 import java.util.zip.ZipException;
00011 import java.util.zip.ZipFile;
00012
00013 import de.aitools.aq.check.A;
00014 import de.aitools.aq.invertedindex.core.Configuration;
00015 import de.aitools.aq.invertedindex.core.Configuration.KeySorting;
00016 import de.aitools.aq.invertedindex.core.Configuration.ValueSorting;
00017 import de.aitools.aq.invertedindex.core.Indexer;
00018 import de.aitools.aq.value.pair.ShortInt;
00019
00044 public class GoogleBooks {
00045
00046 public static final void index(File ngramDir, File indexDir)
00047 throws ZipException, IOException {
00048 A.CHECK(!ngramDir.equals(indexDir));
00049
00050 Configuration config = new Configuration();
00051 config.setIndexDirectory(indexDir);
00052 config.setKeySorting(KeySorting.SORTED);
00053 config.setValueSorting(ValueSorting.DISABLED);
00054 Indexer<ShortInt> indexer = Indexer.open(ShortInt.class, config);
00055
00056 final int minNgramLength = 1;
00057 final int maxNgramLength = 5;
00058 for (int i = minNgramLength; i <= maxNgramLength; ++i) {
00059 final int curNgramLength = i;
00060 for (File file : ngramDir.listFiles(new FilenameFilter() {
00061 @Override
00062 public boolean accept(File dir, String name) {
00063 return name.contains(curNgramLength + "gram");
00064 }})) {
00065 String currentLine;
00066 ShortInt curValue = new ShortInt();
00067 ZipFile zf = new ZipFile(file);
00068 ZipEntry ze = zf.getEntry(file.getName().replace(".zip", ""));
00069 BufferedReader reader = new BufferedReader(
00070 new InputStreamReader(zf.getInputStream(ze)));
00071 System.out.print(Calendar.getInstance().getTime());
00072 System.out.println(" : Process " + ze.getName());
00073 while ((currentLine = reader.readLine()) != null) {
00074 String[] curLineToken = currentLine.split("\t");
00075 if (curLineToken.length < 3 ||
00076 curLineToken[0].split(" ").length != curNgramLength)
00077 continue;
00078 curValue.set(
00079 Short.parseShort(curLineToken[1]),
00080 Integer.parseInt(curLineToken[2]));
00081 indexer.put(curLineToken[0], curValue);
00082 }
00083 zf.close();
00084 }
00085 }
00086 indexer.index();
00087 }
00088
00089 public static void main(String[] args) {
00090
00091 final File ngramDir = new File("/mnt/nfs/webis16/storage1/corpora/" +
00092 "corpora-n-grams/corpus-google-books-ngram-viewer/" +
00093 "googlebooks-eng-1M-20090715");
00094
00095 final File indexDir = new File("/media/1TB/googlebooks-index");
00096 try {
00097 GoogleBooks.index(ngramDir, indexDir);
00098 } catch (ZipException e) {
00099 e.printStackTrace();
00100 } catch (IOException e) {
00101 e.printStackTrace();
00102 }
00103 }
00104
00105 }