[プログラミング]教師なし形態素解析
なんとなくid:nokunoさんが紹介してたNLTK Bookに乗ってる教師なし形態素解析を実装してみた。
http://d.hatena.ne.jp/nokuno/20100124/1264319152
切れ目の部分をflipした時にスコアをデータ全部見て再計算する代わりに、必要な部分だけ更新するようにしてみた。これでAlice's Adventures in Wonderlandぐらいのデータに対しては適応できるようになったけど、正直精度は微妙でした。
実行例
doyou see thekitty see thedoggy doyou like thekitty like thedoggy
プログラム
package segmentation; import java.util.*; public class Main { public static void main(String[] args) throws Exception{ String text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"; long ts = System.currentTimeMillis(); Segmentator seg = new Segmentator(text , 777); seg.run(1000000); long te = System.currentTimeMillis(); System.err.println("time = " + (te - ts) + " msec"); List<String> list = seg.getResult(); for(String l : list){ System.out.println(l); } } }
package segmentation; import java.util.*; public class Segmentator { Map<String, Integer> dict; int wordNum; int dictSize; String text; boolean[] z; Random rand; public Segmentator(String text, long seed) { this.text = text; int N = text.length(); z = new boolean[N - 1]; rand = new Random(seed); for (int i = 0; i < z.length; ++i) { z[i] = rand.nextBoolean(); } dict = new HashMap<String, Integer>(); init(); } int count(String word) { if (dict.containsKey(word)) { return dict.get(word); } else { return 0; } } void add(String word) { if (dict.containsKey(word)) { dict.put(word, dict.get(word) + 1); } else { dictSize += word.length(); dict.put(word, 1); } } void delete(String word) { int val = dict.get(word); if (val == 1) { dictSize -= word.length(); dict.remove(word); } else { dict.put(word, val - 1); } } void init() { wordNum = 1; for (int i = 0; i < z.length; ++i) { if (z[i]) { ++wordNum; } } int prev = 0; dictSize = 0; for (int i = 0; i < z.length; ++i) { if (z[i]) { String word = text.substring(prev, i + 1); add(word); prev = i + 1; } } String word = text.substring(prev); add(word); } void flip(int index) { int lstart = -1; for (int i = index - 1; i >= 0; --i) { if (z[i]) { lstart = i; break; } } String left = text.substring(lstart + 1, index + 1); int rend = z.length; for (int i = index + 1; i < z.length; ++i) { if (z[i]) { rend = i; break; } } String right = text.substring(index + 1, rend + 1); if (z[index]) { delete(left); delete(right); add(left + right); wordNum--; } else { delete(left + right); add(left); add(right); wordNum++; } z[index] = !z[index]; } boolean accept(double energy, double eneryNeighbor, double temperature) { if (eneryNeighbor < energy) { return true; } else { double r = Math.exp((energy - eneryNeighbor) * 0.5) * temperature; return rand.nextDouble() < r; } } void run(int iterMax) { double best = score(); boolean zBest[] = null; for (int iter = 0; iter < iterMax; ++iter) { int ind = rand.nextInt(z.length); double temp = (iterMax - iter) * 1.0 / iterMax; double current = score(); flip(ind); double next = score(); if (!accept(current, next, temp + 1e-8)) { flip(ind); } double score = score(); if (score < best) { best = score; zBest = z.clone(); } } if (zBest != null) { z = zBest.clone(); } } double score() { return dictSize + wordNum; } List<String> getResult() { List<String> list = new ArrayList<String>(); int prev = 0; for (int i = 0; i < z.length; ++i) { if (z[i]) { String word = text.substring(prev, i + 1); list.add(word); prev = i + 1; } } String word = text.substring(prev); list.add(word); return list; } }