fix(cleanText): remove unnecessary spaces
fix(run): use `1 - prior` for alpha, no need for smoothing
feat(cleanText): turn all text to lowercase
This commit is contained in:
parent
eebf5e0222
commit
b2888417bb
@ -36,6 +36,13 @@ module Main where
|
||||
|
||||
See other examples:
|
||||
```
|
||||
# Simplest case of a neural network
|
||||
stack exec example-xor
|
||||
stack exec example-naivebayes-doc-classifier
|
||||
|
||||
# Naive Bayes document classifier, using Reuters dataset, achieves ~62% accuracy
|
||||
# using Porter stemming, stopword elimination and a few custom techniques.
|
||||
# the dataset is imbalanced which causes the classifier to be biased towards some classes (earn, acq, ...)
|
||||
# N-Grams don't seem to help us much here (or maybe my implementation is wrong!); using bigrams increases
|
||||
# accuracy, while decreasing F-Measure slightly.
|
||||
stack exec example-naivebayes-doc-classifier -- --verbose
|
||||
```
|
||||
|
@ -25,7 +25,7 @@ module Sibe.NaiveBayes
|
||||
import Data.Maybe
|
||||
import Control.Arrow ((&&&))
|
||||
import Text.Regex.PCRE
|
||||
import Data.Char (isSpace, isNumber)
|
||||
import Data.Char (isSpace, isNumber, toLower)
|
||||
import NLP.Stemmer
|
||||
|
||||
type Class = Int;
|
||||
@ -110,10 +110,10 @@ module Sibe.NaiveBayes
|
||||
-- below is the formula according to Multinomial Naive Bayes, but
|
||||
-- using a uniform prior probability seems to work better when working with imbalanced
|
||||
-- training datasets; instead, we help rare classes get higher scores using
|
||||
-- alpha = (1 - prior * ALPHA)
|
||||
-- alpha = (1 - prior * ALPHA), we use ALPHA = 1 here
|
||||
-- in prior * product (map (prob c) (words txt))
|
||||
|
||||
alpha = 1 - (log 1 + prior)
|
||||
alpha = 1 - prior
|
||||
|
||||
in alpha * product (map (prob c) (words txt))
|
||||
|
||||
@ -145,7 +145,8 @@ module Sibe.NaiveBayes
|
||||
spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
|
||||
stemmed = unwords $ map (stem Porter) (words spacify)
|
||||
nonumber = filter (not . isNumber) stemmed
|
||||
in (unwords . words) nonumber
|
||||
lower = map toLower nonumber
|
||||
in (unwords . words) lower -- remove unnecessary spaces
|
||||
where
|
||||
trim = f . f
|
||||
where
|
||||
|
Loading…
Reference in New Issue
Block a user