fix(cleanText): remove unnecessary spaces

fix(run): use `1 - prior` for alpha, no need for smoothing
feat(cleanText): turn all text to lowercase
This commit is contained in:
Mahdi Dibaiee 2016-08-09 16:04:57 +04:30
parent eebf5e0222
commit b2888417bb
2 changed files with 13 additions and 5 deletions

View File

@ -36,6 +36,13 @@ module Main where
See other examples:
```
# Simplest case of a neural network
stack exec example-xor
stack exec example-naivebayes-doc-classifier
# Naive Bayes document classifier, using Reuters dataset, achieves ~62% accuracy
# using Porter stemming, stopword elimination and a few custom techniques.
# The dataset is imbalanced, which causes the classifier to be biased towards some classes (earn, acq, ...).
# N-grams don't seem to help us much here (or maybe my implementation is wrong!): using bigrams increases
# accuracy, while decreasing F-Measure slightly.
stack exec example-naivebayes-doc-classifier -- --verbose
```

View File

@ -25,7 +25,7 @@ module Sibe.NaiveBayes
import Data.Maybe
import Control.Arrow ((&&&))
import Text.Regex.PCRE
import Data.Char (isSpace, isNumber)
import Data.Char (isSpace, isNumber, toLower)
import NLP.Stemmer
type Class = Int;
@ -110,10 +110,10 @@ module Sibe.NaiveBayes
-- below is the formula according to Multinomial Naive Bayes, but
-- using a uniform prior probability seems to work better when working with imbalanced
-- training datasets; instead, we help rare classes get higher scores using
-- alpha = (1 - prior * ALPHA)
-- alpha = (1 - prior * ALPHA), we use ALPHA = 1 here
-- in prior * product (map (prob c) (words txt))
alpha = 1 - (log 1 + prior)
alpha = 1 - prior
in alpha * product (map (prob c) (words txt))
@ -145,7 +145,8 @@ module Sibe.NaiveBayes
spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
stemmed = unwords $ map (stem Porter) (words spacify)
nonumber = filter (not . isNumber) stemmed
in (unwords . words) nonumber
lower = map toLower nonumber
in (unwords . words) lower -- remove unnecessary spaces
where
trim = f . f
where