fix(cleanText): remove unnecessary spaces

fix(run): use `1 - prior` for alpha, no need for smoothing
feat(cleanText): turn all text to lowercase
2016-08-09 16:04:57 +04:30
parent eebf5e0222
commit b2888417bb
2 changed files with 13 additions and 5 deletions
--- a/README.md
+++ b/README.md
@@ -36,6 +36,13 @@ module Main where

 See other examples:
 ```
+# Simplest case of a neural network
 stack exec example-xor
-stack exec example-naivebayes-doc-classifier
+
+# Naive Bayes document classifier trained on the Reuters dataset; it achieves ~62% accuracy
+# using Porter stemming, stopword elimination and a few custom techniques.
+# The dataset is imbalanced, which biases the classifier towards some classes (earn, acq, ...).
+# N-grams don't seem to help much here (or maybe my implementation is wrong!): using bigrams increases
+# accuracy while slightly decreasing the F-measure.
+stack exec example-naivebayes-doc-classifier -- --verbose
 ```
--- a/src/Sibe/NaiveBayes.hs
+++ b/src/Sibe/NaiveBayes.hs
@@ -25,7 +25,7 @@ module Sibe.NaiveBayes
    import Data.Maybe
    import Control.Arrow ((&&&))
    import Text.Regex.PCRE
-    import Data.Char (isSpace, isNumber)
+    import Data.Char (isSpace, isNumber, toLower)
    import NLP.Stemmer

    type Class = Int;
@@ -110,10 +110,10 @@ module Sibe.NaiveBayes
          -- below is the formula according to Multinomial Naive Bayes, but
          -- using a uniform prior probability seems to work better when working with imbalanced
          -- training datasets; instead, we help rare classes get higher scores using
-          -- alpha = (1 - prior * ALPHA)
+          -- alpha = (1 - prior * ALPHA), we use ALPHA = 1 here
          -- in prior * product (map (prob c) (words txt))

-              alpha = 1 - (log 1 + prior)
+              alpha = 1 - prior

          in alpha * product (map (prob c) (words txt))

@@ -145,7 +145,8 @@ module Sibe.NaiveBayes
          spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
          stemmed = unwords $ map (stem Porter) (words spacify)
          nonumber = filter (not . isNumber) stemmed
-      in (unwords . words) nonumber
+          lower = map toLower nonumber
+      in (unwords . words) lower -- remove unnecessary spaces
      where
        trim = f . f
          where