diff --git a/README.md b/README.md
index e87d5d1..fa44744 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,13 @@ module Main where
 
 See other examples:
 ```
+# Simplest case of a neural network
 stack exec example-xor
-stack exec example-naivebayes-doc-classifier
+
+# Naive Bayes document classifier, using Reuters dataset, achieves ~62% accuracy
+# using Porter stemming, stopword elimination and a few custom techniques.
+# the dataset is imbalanced which causes the classifier to be biased towards some classes (earn, acq, ...)
+# N-Grams don't seem to help us much here (or maybe my implementation is wrong!), using bigrams increases
+# accuracy, while decreasing F-Measure slightly.
+stack exec example-naivebayes-doc-classifier -- --verbose
 ```
diff --git a/src/Sibe/NaiveBayes.hs b/src/Sibe/NaiveBayes.hs
index db10970..4d72968 100644
--- a/src/Sibe/NaiveBayes.hs
+++ b/src/Sibe/NaiveBayes.hs
@@ -25,7 +25,7 @@ module Sibe.NaiveBayes
   import Data.Maybe
   import Control.Arrow ((&&&))
   import Text.Regex.PCRE
-  import Data.Char (isSpace, isNumber)
+  import Data.Char (isSpace, isNumber, toLower)
   import NLP.Stemmer
 
   type Class = Int;
@@ -110,10 +110,10 @@ module Sibe.NaiveBayes
           -- below is the formula according to Multinominal Naive Bayes, but it seems
           -- using a uniform prior probability seems to work better when working with imbalanced
           -- training datasets, instead, we help rare classes get higher scores using
-          -- alpha = (1 - prior * ALPHA)
+          -- alpha = (1 - prior * ALPHA), we use ALPHA = 1 here
           -- in prior * product (map (prob c) (words txt))
 
-          alpha = 1 - (log 1 + prior)
+          alpha = 1 - prior
 
       in alpha * product (map (prob c) (words txt))
 
@@ -145,7 +145,8 @@ module Sibe.NaiveBayes
         spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
         stemmed = unwords $ map (stem Porter) (words spacify)
         nonumber = filter (not . isNumber) stemmed
-    in (unwords . words) nonumber
+        lower = map toLower nonumber
+    in (unwords . words) lower -- remove unnecessary spaces
     where
       trim = f . f
         where
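
Note (illustration, not part of the patch): the scoring rule touched by the second `NaiveBayes.hs` hunk multiplies the per-word likelihoods by `alpha = 1 - prior` instead of by the prior itself, so frequent classes (earn, acq, ...) don't drown out rare ones. The sketch below is a minimal, self-contained Haskell illustration of that adjustment; the `priors` and `prob` values are made-up toy numbers standing in for the statistics the library estimates from the Reuters training set.

```haskell
import Data.List (maximumBy)
import Data.Ord (comparing)

-- Toy per-class priors for an imbalanced two-class corpus (hypothetical numbers).
priors :: [(Int, Double)]
priors = [(0, 0.7), (1, 0.3)]

-- Toy word likelihoods P(word | class); the real classifier estimates these
-- from word counts in the training documents.
prob :: Int -> String -> Double
prob 0 w = if w == "profit" then 0.04 else 0.01
prob 1 w = if w == "match"  then 0.05 else 0.01
prob _ _ = 0.01

-- Score with the uniform-prior adjustment from the patch:
-- alpha = 1 - prior * ALPHA with ALPHA = 1, i.e. rare classes get a boost.
score :: String -> Int -> Double
score txt c =
  let prior = maybe 0 id (lookup c priors)
      alpha = 1 - prior
  in alpha * product (map (prob c) (words txt))

-- Pick the class with the highest adjusted score.
classify :: String -> Int
classify txt = maximumBy (comparing (score txt)) (map fst priors)

main :: IO ()
main = print (classify "profit up on strong profit")  -- prints 0
```

With the textbook multinomial rule (`prior * product (map (prob c) (words txt))`) the dominant class would start with a 0.7 multiplier here instead of 0.3, which is the bias towards classes like earn and acq that the README comment describes.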