WikipediaのデータをLuceneのindexに入れるコード

以前書いたコードだけど、いつもJavaのXMLライブラリの使い方とかを忘れるので、備忘録用に上げておく。

import java.io.File;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;


/**
 * Command-line tool that indexes a Wikipedia XML dump with Lucene.
 *
 * <p>The dump is streamed through a SAX parser; the title/text pairs reported by
 * {@link WikiXMLHandler} are forwarded to a {@link WikiDataHandler}, which adds them
 * to the index stored in the {@code enwiki-index} directory (relative to the current
 * working directory).
 */
public class CreateIndex {
  /**
   * Parses the given XML file and adds its pages to the {@code enwiki-index} index.
   *
   * <p>If the file does not exist, a message is printed to standard error and the
   * method returns without opening the index.
   *
   * @param file path of the XML dump to index
   * @throws Exception if the parser cannot be created, parsing fails, or the index
   *     cannot be opened or closed
   */
  static void addXMLFile(String file) throws Exception{
    SAXParserFactory spf = SAXParserFactory.newInstance();
    SAXParser parser = spf.newSAXParser();
    File wikiFile = new File(file);
    if(!wikiFile.exists()){
      System.err.println(wikiFile +" not exist");
      return ;
    }
    Directory dir = FSDirectory.open(new File("enwiki-index"));
    try{
      Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
      IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, analyzer);
      IndexWriter writer = new IndexWriter(dir , conf);
      try{
        WikiDataHandler wph = new WikiDataHandler(writer);
        parser.parse(wikiFile, new WikiXMLHandler(wph));
        System.out.println("add " + wph.adddoc +" document");
      }finally{
        // Close the writer even when parsing fails, so the index write lock
        // is not left behind by an aborted run.
        closeWriter(writer);
      }
    }finally{
      dir.close();
    }
  }

  /**
   * Closes the writer, retrying once if the first attempt runs out of memory.
   *
   * @param writer the writer to close
   * @throws Exception if closing fails
   */
  private static void closeWriter(IndexWriter writer) throws Exception{
    try{
      writer.close();
    }catch (OutOfMemoryError e) {
      // The Lucene 3.x IndexWriter.close() documentation asks callers to call
      // close() again immediately when it hits an OutOfMemoryError.
      writer.close();
    }
  }

  /**
   * Entry point.
   *
   * @param args {@code args[0]} is the path of the XML dump to index
   * @throws Exception if indexing fails
   */
  public static void main(String[] args) throws Exception{
    if(args.length < 1){
      // Report the missing argument instead of failing with an
      // ArrayIndexOutOfBoundsException.
      System.err.println("usage: CreateIndex <wikipedia-dump.xml>");
      System.exit(1);
    }
    addXMLFile(args[0]);
  }
}
import org.xml.sax.Attributes;

import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;



/**
 * SAX handler that collects the character content of {@code title} and {@code text}
 * elements and passes them to a {@link WikiDataHandler}.
 *
 * <p>Elements are matched on their qualified name. Every time a {@code text} element
 * ends, the most recently collected title and the text just collected are handed to
 * the data handler.
 */
public class WikiXMLHandler extends DefaultHandler{
  /** True while the parser is inside a {@code title} element. */
  boolean isTitle;
  /** True while the parser is inside a {@code text} element. */
  boolean isText;
  /** Characters of the most recent {@code title} element. */
  StringBuilder titleBuffer;
  /** Characters of the most recent {@code text} element. */
  StringBuilder textBuffer;
  /** Receiver of the completed title/text pairs. */
  WikiDataHandler wphandler;

  /**
   * Creates a handler that reports to the given data handler.
   *
   * @param h receiver of each title/text pair
   */
  public WikiXMLHandler(WikiDataHandler h) {
    isTitle = false;
    isText = false;
    titleBuffer = new StringBuilder();
    textBuffer = new StringBuilder();
    wphandler = h;
  }

  /**
   * Appends the reported characters to the buffer of the element currently being
   * read; characters outside {@code title} and {@code text} are ignored.
   */
  @Override
  public void characters(char[] ch, int start, int length) throws SAXException {
    // Append the char range directly: this callback fires for every text chunk in
    // the document, so building an intermediate String each time is pure overhead.
    if(isTitle){
      titleBuffer.append(ch, start, length);
    }else if(isText){
      textBuffer.append(ch, start, length);
    }
  }

  /** Starts a fresh buffer when a {@code title} or {@code text} element opens. */
  @Override
  public void startElement(String uri, String localName, String qName,
      Attributes attributes) throws SAXException {
    if("title".equals(qName)){
      isTitle = true;
      titleBuffer = new StringBuilder();
    }else if("text".equals(qName)){
      isText = true;
      textBuffer = new StringBuilder();
    }
  }

  /**
   * Stops collecting when a {@code title} or {@code text} element closes; the end of
   * a {@code text} element also delivers the title/text pair to the data handler.
   */
  @Override
  public void endElement(String uri, String localName, String qName)
      throws SAXException {
    if("title".equals(qName)){
      isTitle = false;
    }else if("text".equals(qName)){
      isText = false;
      wphandler.handle(titleBuffer.toString(), textBuffer.toString());
    }
  }
}
import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;


/**
 * Adds title/text pairs to a Lucene index, skipping pages whose title starts with one
 * of a fixed list of namespace prefixes and pages whose text starts with a redirect
 * marker.
 */
public class WikiDataHandler {
  /** Title prefixes of pages that are not indexed. */
  private static final String[] NON_MAIN_PREFIXES = {
      "Category:",
      "File:",
      "Template:",
      "Talk:",
      "Wikipedia:",
      "User:",
      "User talk:",
      "Help:",
      "Special:",
      "MediaWiki:"
  };
  /** Marker (compared case-insensitively) that starts the text of a redirect page. */
  private static final String REDIRECT_MARKER = "#redirect";

  /** Number of documents successfully added to the index so far. */
  int adddoc = 0;
  /** Index writer that receives the documents. */
  IndexWriter writer;

  /**
   * Creates a handler that writes to the given index writer.
   *
   * @param writer destination of the documents
   */
  public WikiDataHandler(IndexWriter writer) {
    this.writer = writer;
  }

  /** Returns true if the title starts with one of the excluded prefixes. */
  private boolean isNotMainSpaces(String title){
    for(String prefix : NON_MAIN_PREFIXES){
      if(title.startsWith(prefix))return true;
    }
    return false;
  }

  /** Returns true if the text starts with the redirect marker, ignoring case. */
  private boolean isRedirectPage(String text){
    // regionMatches only inspects the prefix and does not use the default locale,
    // unlike lower-casing the entire article (which also breaks the match for
    // "#REDIRECT" under a Turkish default locale).
    return text.regionMatches(true, 0, REDIRECT_MARKER, 0, REDIRECT_MARKER.length());
  }

  /**
   * Indexes one page unless it is filtered out.
   *
   * <p>The title is stored but not indexed; the text is analyzed and indexed but not
   * stored. A failure to add the document is reported on standard error and does not
   * stop processing; {@code adddoc} counts only documents that were added.
   *
   * @param title page title
   * @param text page text
   */
  public void handle(String title , String text){
    if(isNotMainSpaces(title))return ;
    if(isRedirectPage(text))return ;
    Document doc = new Document();
    doc.add(new Field("title", title, Store.YES, Index.NO));
    doc.add(new Field("content", text, Store.NO, Index.ANALYZED));
    try{
      writer.addDocument(doc);
      // Count only after the add succeeded so the reported total is accurate.
      adddoc++;
    }catch (Exception e) {
      // Best effort: report the failing page and keep going with the next one.
      System.err.println("failed to index \"" + title + "\"");
      e.printStackTrace();
    }
  }
}