分類器での学習と分類,そして推測

はじめに

単語ごとに区切ることができるようになったので,分類器で学習と分類をさせます.
学習させるデータは本に載っているのと同じものなので,結果も全く同じになるはず.

実行例

require 'pp'

# Trains +classifier+ on a fixed set of five labelled sample sentences.
#
# classifier - any object responding to train(sentence, category).
#
# Returns whatever the final train call returns.
def sample_train(classifier)
  samples = [
    ['Nobody owns the water.',                'good'],
    ['the quick rabbit jumps fences',         'good'],
    ['buy pharmaceuticals now',               'bad'],
    ['make quick money at the online casino', 'bad'],
    ['the quick brown fox jumps',             'good']
  ]

  # map + last keeps the return value equal to the last train call's result.
  samples.map { |text, label| classifier.train(text, label) }.last
end

# Build a classifier and feed it the sample sentences.
model = My::Classifier.new
sample_train(model)

# Per-feature counts, broken down by category.
pp(model.feature_count)
# Every category seen during training.
p(model.categories)
# Number of training sentences across all categories.
p(model.total_count)
# How many times 'quick' was counted under 'bad'.
p(model.select_feature('quick', 'bad'))
# Raw ratio: count of 'quick' in 'good' / number of 'good' sentences.
p(model.feature_probability('quick', 'good').to_f)
# Weighted probability that blends in the assumed initial probability.
p(model.probability('money', 'good').to_f)

結果は以下のとおり.

{"Nobody"=>{"good"=>1},
 "owns"=>{"good"=>1},
 "the"=>{"good"=>3, "bad"=>1},
 "water"=>{"good"=>1},
 "."=>{"good"=>1},
 "quick"=>{"good"=>2, "bad"=>1},
 "rabbit"=>{"good"=>1},
 "jumps"=>{"good"=>2},
 "fences"=>{"good"=>1},
 "buy"=>{"bad"=>1},
 "pharmaceuticals"=>{"bad"=>1},
 "now"=>{"bad"=>1},
 "make"=>{"bad"=>1},
 "money"=>{"bad"=>1},
 "at"=>{"bad"=>1},
 "online"=>{"bad"=>1},
 "casino"=>{"bad"=>1},
 "brown"=>{"good"=>1},
 "fox"=>{"good"=>1}}
["good", "bad"]
5
1
0.666666666666667
0.25

よし!
ちゃんとできてるよ.

ソース

require 'word_counter'

module My
  # Keeps per-category occurrence counts for features (words) and derives
  # probabilities from those counts.
  class Classifier
    # Probability assumed for a feature before any evidence has been counted.
    INITIAL_PROBABILITY = 0.5
    # Weight given to INITIAL_PROBABILITY, expressed in occurrences.
    WEIGHT = 1.0

    def initialize
      # feature => { category => occurrences }; inner hashes default to 0.
      @feature_count = Hash.new { |hash, key| hash[key] = Hash.new(0) }
      # category => number of train calls made with that category.
      @category_count = Hash.new(0)
    end

    # The raw feature => { category => occurrences } table.
    attr_reader :feature_count

    # Returns how many times +feature+ was counted under +category+
    # (0 when either is unknown). Never inserts anything into the table.
    def select_feature(feature, category)
      counts_for(feature).fetch(category, 0)
    end

    # Adds +count+ to the tally of +feature+ under +category+.
    def update_feature(feature, category, count)
      @feature_count[feature][category] += count
    end

    # Returns how many items were trained under +category+ (0 when unknown).
    def select_category(category)
      @category_count[category]
    end

    # Increments the number of items trained under +category+ by one.
    def update_category(category)
      @category_count[category] += 1
    end

    # Returns an array of every category name seen so far.
    def categories
      @category_count.keys
    end

    # Returns the total number of trained items across all categories.
    def total_count
      @category_count.values.inject(0) { |sum, i| sum + i }
    end

    # Learns from one sentence: adds its word counts to +category+ and bumps
    # the category's item count.
    #
    # NOTE(review): WordCounter is defined outside this file; its #result is
    # presumably a feature => count mapping, since it is iterated as pairs.
    def train(sentence, category)
      count_result = WordCounter.new.count(sentence).result

      count_result.each do |feature, count|
        update_feature(feature, category, count)
      end
      update_category(category)
    end

    # Returns the count of +feature+ in +category+ divided by the number of
    # items trained in +category+, or 0 when the category has no items.
    def feature_probability(feature, category)
      denominator = select_category(category)
      denominator == 0 ? 0 : select_feature(feature, category).quo(denominator)
    end

    # Returns the probability of +feature+ in +category+ as a weighted average
    # of INITIAL_PROBABILITY (weight WEIGHT) and the observed
    # feature_probability (weight: occurrences of the feature in all
    # categories), so rarely seen features stay close to the assumed value.
    def probability(feature, category)
      observed = feature_probability(feature, category)
      occurrences = counts_for(feature).values.inject(0) { |sum, i| sum + i }
      ((WEIGHT * INITIAL_PROBABILITY) + (occurrences * observed)).quo(WEIGHT + occurrences)
    end

    private

    # Returns the category => occurrences hash for +feature+ without
    # triggering the default block of @feature_count, which would store an
    # empty entry for a feature that was merely queried.
    def counts_for(feature)
      @feature_count.key?(feature) ? @feature_count[feature] : {}
    end
  end
end

最後に

学習と推測までできるようになったので,次は単純ベイズ分類器を作ります.