WikipediaのデータをLuceneのindexに入れるコード
以前書いたけど、いつもJavaのXMLライブラリの使い方とか忘れるので、備忘録用に上げておく。
import java.io.File; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; public class CreateIndex { static void addXMLFile(String file) throws Exception{ SAXParserFactory spf = SAXParserFactory.newInstance(); SAXParser parser = spf.newSAXParser(); File wikiFile = new File(file); if(!wikiFile.exists()){ System.err.println(wikiFile +" not exist"); return ; } Directory dir = FSDirectory.open(new File("enwiki-index")); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, analyzer); IndexWriter writer = new IndexWriter(dir , conf); WikiDataHandler wph = new WikiDataHandler(writer); parser.parse(wikiFile, new WikiXMLHandler(wph)); System.out.println("add " + wph.adddoc +" document"); try{ writer.close(); }catch (OutOfMemoryError e) { writer.close(); } } public static void main(String[] args) throws Exception{ addXMLFile(args[0]); } }
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that collects the character data of "title" and "text"
 * elements and hands each (title, text) pair to a {@link WikiDataHandler}
 * when a "text" element ends.
 *
 * <p>The title buffer is only reset when the next "title" element starts, so
 * a "text" element is paired with the most recently seen title.
 */
public class WikiXMLHandler extends DefaultHandler {
    /** True while the parser is inside a "title" element. */
    boolean isTitle;
    /** True while the parser is inside a "text" element. */
    boolean isText;
    StringBuilder titleBuffer;
    StringBuilder textBuffer;
    /** Receiver of every completed (title, text) pair. */
    WikiDataHandler wphandler;

    /**
     * @param h handler invoked once per closed "text" element
     */
    public WikiXMLHandler(WikiDataHandler h) {
        isTitle = false;
        isText = false;
        titleBuffer = new StringBuilder();
        textBuffer = new StringBuilder();
        wphandler = h;
    }

    /**
     * Appends character data to the buffer of the element currently being
     * read; data outside "title"/"text" elements is ignored.
     *
     * <p>The parser may deliver one element's content in several calls, so
     * the data is accumulated rather than assigned. The char range is
     * appended directly to avoid allocating a temporary String per callback.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (isTitle) {
            titleBuffer.append(ch, start, length);
        } else if (isText) {
            textBuffer.append(ch, start, length);
        }
    }

    /** Starts a fresh buffer when a "title" or "text" element opens. */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attributes) throws SAXException {
        if (qName.equals("title")) {
            isTitle = true;
            titleBuffer = new StringBuilder();
        } else if (qName.equals("text")) {
            isText = true;
            textBuffer = new StringBuilder();
        }
    }

    /**
     * Stops collecting when a "title" or "text" element closes; on "text" the
     * accumulated title and text are forwarded to the data handler.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        if (qName.equals("title")) {
            isTitle = false;
        } else if (qName.equals("text")) {
            isText = false;
            wphandler.handle(titleBuffer.toString(), textBuffer.toString());
        }
    }
}
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;

/**
 * Filters (title, text) pairs and adds the accepted ones to a Lucene index
 * as documents with a stored "title" field and an analyzed "content" field.
 */
public class WikiDataHandler {
    /** Title prefixes of pages that are skipped. */
    private static final String[] NON_MAIN_PREFIXES = {
        "Category:", "File:", "Template:", "Talk:", "Wikipedia:",
        "User:", "User talk:", "Help:", "Special:", "MediaWiki:"
    };

    /** Marker that identifies a redirect page when it starts the text. */
    private static final String REDIRECT_MARKER = "#redirect";

    /** Number of documents successfully added to the index so far. */
    int adddoc = 0;
    IndexWriter writer;

    /**
     * @param writer index writer that receives the accepted documents
     */
    public WikiDataHandler(IndexWriter writer) {
        this.writer = writer;
    }

    /** Returns true if the title starts with one of the skipped prefixes. */
    private boolean isNotMainSpaces(String title) {
        for (String prefix : NON_MAIN_PREFIXES) {
            if (title.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns true if the text starts with "#redirect", ignoring case.
     *
     * <p>Only the leading characters are compared, so the whole text is not
     * copied, and the comparison does not depend on the default locale
     * (lower-casing "#REDIRECT" under a Turkish locale would not yield
     * "#redirect").
     */
    private boolean isRedirectPage(String text) {
        return text.regionMatches(true, 0, REDIRECT_MARKER, 0, REDIRECT_MARKER.length());
    }

    /**
     * Indexes one page unless its title has a skipped prefix or its text is
     * a redirect.
     *
     * <p>The title is stored but not indexed; the text is analyzed but not
     * stored. A failure to add the document is reported on stderr and does
     * not propagate, so one bad page does not abort the whole run;
     * {@code adddoc} counts only documents that were actually added.
     *
     * @param title page title
     * @param text  page body
     */
    public void handle(String title, String text) {
        if (isNotMainSpaces(title) || isRedirectPage(text)) {
            return;
        }

        Document doc = new Document();
        doc.add(new Field("title", title, Store.YES, Index.NO));
        doc.add(new Field("content", text, Store.NO, Index.ANALYZED));
        try {
            writer.addDocument(doc);
            // Count after the add so failed documents are not reported as added.
            adddoc++;
        } catch (Exception e) {
            System.err.println("failed to index \"" + title + "\"");
            e.printStackTrace();
        }
    }
}