[プログラミング]教師なし形態素解析

なんとなくid:nokunoさんが紹介してたNLTK Bookに乗ってる教師なし形態素解析を実装してみた。
http://d.hatena.ne.jp/nokuno/20100124/1264319152

切れ目の部分をflipした時にスコアをデータ全部見て再計算する代わりに、必要な部分だけ更新するようにしてみた。これでAlice's Adventures in Wonderlandぐらいのデータに対しては適応できるようになったけど、正直精度は微妙でした。

実行例

doyou
see
thekitty
see
thedoggy
doyou
like
thekitty
like
thedoggy

プログラム

package segmentation;

import java.util.*;

public class Main {
  public static void main(String[] args) throws Exception{
    String text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy";
    long ts = System.currentTimeMillis();
    Segmentator seg = new Segmentator(text , 777);
    seg.run(1000000);
    long te = System.currentTimeMillis();
    System.err.println("time = " + (te - ts) + " msec");
    List<String> list = seg.getResult();
    for(String l : list){
      System.out.println(l);
    }
  }
}
package segmentation;

import java.util.*;

public class Segmentator {
  Map<String, Integer> dict;
  int wordNum;
  int dictSize;
  String text;
  boolean[] z;
  Random rand;

  public Segmentator(String text, long seed) {
    this.text = text;
    int N = text.length();
    z = new boolean[N - 1];
    rand = new Random(seed);
    for (int i = 0; i < z.length; ++i) {
      z[i] = rand.nextBoolean();
    }
    dict = new HashMap<String, Integer>();
    init();
  }

  int count(String word) {
    if (dict.containsKey(word)) {
      return dict.get(word);
    } else {
      return 0;
    }
  }

  void add(String word) {
    if (dict.containsKey(word)) {
      dict.put(word, dict.get(word) + 1);
    } else {
      dictSize += word.length();
      dict.put(word, 1);
    }
  }

  void delete(String word) {
    int val = dict.get(word);
    if (val == 1) {
      dictSize -= word.length();
      dict.remove(word);
    } else {
      dict.put(word, val - 1);
    }
  }

  void init() {
    wordNum = 1;
    for (int i = 0; i < z.length; ++i) {
      if (z[i]) {
        ++wordNum;
      }
    }
    int prev = 0;
    dictSize = 0;
    for (int i = 0; i < z.length; ++i) {
      if (z[i]) {
        String word = text.substring(prev, i + 1);
        add(word);
        prev = i + 1;
      }
    }
    String word = text.substring(prev);
    add(word);
  }

  void flip(int index) {
    int lstart = -1;
    for (int i = index - 1; i >= 0; --i) {
      if (z[i]) {
        lstart = i;
        break;
      }
    }
    String left = text.substring(lstart + 1, index + 1);
    int rend = z.length;
    for (int i = index + 1; i < z.length; ++i) {
      if (z[i]) {
        rend = i;
        break;
      }
    }
    String right = text.substring(index + 1, rend + 1);
    if (z[index]) {
      delete(left);
      delete(right);
      add(left + right);
      wordNum--;
    } else {
      delete(left + right);
      add(left);
      add(right);
      wordNum++;
    }
    z[index] = !z[index];
  }

  boolean accept(double energy, double eneryNeighbor, double temperature) {
    if (eneryNeighbor < energy) {
      return true;
    } else {
      double r = Math.exp((energy - eneryNeighbor) * 0.5) * temperature;
      return rand.nextDouble() < r;
    }
  }

  void run(int iterMax) {
    double best = score();
    boolean zBest[] = null;
    for (int iter = 0; iter < iterMax; ++iter) {
      int ind = rand.nextInt(z.length);
      double temp = (iterMax - iter) * 1.0 / iterMax;
      double current = score();
      flip(ind);
      double next = score();
      if (!accept(current, next, temp + 1e-8)) {
        flip(ind);
      }
      double score = score();
      if (score < best) {
        best = score;
        zBest = z.clone();
      }
    }

    if (zBest != null) {
      z = zBest.clone();
    }
  }

  double score() {
    return dictSize + wordNum;
  }

  List<String> getResult() {
    List<String> list = new ArrayList<String>();
    int prev = 0;
    for (int i = 0; i < z.length; ++i) {
      if (z[i]) {
        String word = text.substring(prev, i + 1);
        list.add(word);
        prev = i + 1;
      }
    }
    String word = text.substring(prev);
    list.add(word);
    return list;
  }
}