fix(cleanText): remove unnecessary spaces
fix(run): use `1 - prior` for alpha, no need for smoothing
feat(cleanText): turn all text to lowercase
parent eebf5e0222
commit b2888417bb
@@ -36,6 +36,13 @@ module Main where
 
 See other examples:
 ```
+# Simplest case of a neural network
 stack exec example-xor
-stack exec example-naivebayes-doc-classifier
+
+# Naive Bayes document classifier, using Reuters dataset, achieves ~62% accuracy
+# using Porter stemming, stopword elimination and a few custom techniques.
+# the dataset is imbalanced which causes the classifier to be biased towards some classes (earn, acq, ...)
+# N-Grams don't seem to help us much here (or maybe my implementation is wrong!), using bigrams increases
+# accuracy, while decreasing F-Measure slightly.
+stack exec example-naivebayes-doc-classifier -- --verbose
 ```
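The README note above mentions bigrams, i.e. pairs of adjacent tokens. As a rough illustration only (this helper is hypothetical and not part of sibe or of this commit), such pairs can be built by zipping a token list with its own tail:

```
-- Hypothetical helper, only to illustrate what "bigrams" means above;
-- not code from this repository.
bigrams :: [String] -> [String]
bigrams ws = zipWith (\a b -> a ++ " " ++ b) ws (drop 1 ws)

-- ghci> bigrams (words "oil prices rise sharply")
-- ["oil prices","prices rise","rise sharply"]
```

Counting these pairs instead of single words is what the remark about bigrams raising accuracy while slightly lowering F-Measure refers to.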
@@ -25,7 +25,7 @@ module Sibe.NaiveBayes
 import Data.Maybe
 import Control.Arrow ((&&&))
 import Text.Regex.PCRE
-import Data.Char (isSpace, isNumber)
+import Data.Char (isSpace, isNumber, toLower)
 import NLP.Stemmer
 
 type Class = Int;
@@ -110,10 +110,10 @@ module Sibe.NaiveBayes
 -- below is the formula according to Multinominal Naive Bayes, but it seems
 -- using a uniform prior probability seems to work better when working with imbalanced
 -- training datasets, instead, we help rare classes get higher scores using
--- alpha = (1 - prior * ALPHA)
+-- alpha = (1 - prior * ALPHA), we use ALPHA = 1 here
 -- in prior * product (map (prob c) (words txt))
 
-alpha = 1 - (log 1 + prior)
+alpha = 1 - prior
 
 in alpha * product (map (prob c) (words txt))
 
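The changed line above weights each class score by `alpha = 1 - prior` rather than the usual prior factor, so frequent classes are damped and rare ones are left almost untouched. A minimal standalone sketch of that weighting (the `score` function and its arguments are illustrative only, not sibe's `run`):

```
-- Minimal sketch of the weighting used above: alpha = 1 - prior, applied to
-- the product of per-word likelihoods. Names are illustrative only; this is
-- not the repository's run function.
score :: Double   -- ^ prior probability of the class in the training set
      -> [Double] -- ^ per-word probabilities P(word | class)
      -> Double
score prior wordProbs =
  let alpha = 1 - prior
  in alpha * product wordProbs

-- A rare class with prior 0.02 keeps 98% of its likelihood product,
-- while a dominant class with prior 0.40 keeps only 60%.
```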
@@ -145,7 +145,8 @@ module Sibe.NaiveBayes
 spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
 stemmed = unwords $ map (stem Porter) (words spacify)
 nonumber = filter (not . isNumber) stemmed
-in (unwords . words) nonumber
+lower = map toLower nonumber
+in (unwords . words) lower -- remove unnecessary spaces
 where
 trim = f . f
 where
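For reference, the shape of the cleaning pipeline after this commit, boiled down to the two steps the diff touches. This is a simplified sketch that leaves out the punctuation replacement and Porter stemming; `cleanTextSketch` is a hypothetical name, not the repository's `cleanText`:

```
import Data.Char (isNumber, toLower)

-- Simplified sketch: drop digits, lowercase, then let (unwords . words)
-- collapse runs of whitespace, which is the "remove unnecessary spaces" step.
cleanTextSketch :: String -> String
cleanTextSketch txt =
  let nonumber = filter (not . isNumber) txt
      lower    = map toLower nonumber
  in (unwords . words) lower

-- ghci> cleanTextSketch "Oil   PRICES rose 12 percent"
-- "oil prices rose percent"
```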