fix(naivebayes): fix the algorithm to make it actually work

feat(cleanDocuments): preprocess documents with stemming and stopword elimination for better accuracy
Mahdi Dibaiee
2016-08-05 23:54:36 +04:30
parent 3cf0625794
commit ea1f05f001
10 changed files with 254 additions and 54 deletions
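
For orientation, here is a minimal sketch of how the reworked NaiveBayes API fits together, using the new train/run entry points from the diff below. The two documents and their class labels are made up for illustration; on a real corpus you would first pass the documents through cleanDocuments to stem them and strip stopwords:

    import Sibe.NaiveBayes

    main :: IO ()
    main = do
      let docs = [ Document "the striker scored a goal" 0
                 , Document "parliament passed the budget" 1
                 ]
          nb = train docs [0, 1]  -- caches priors and per-class word counts
      print (run "another goal was scored" nb)  -- expected: 0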

View File

@@ -17,8 +17,10 @@ module Sibe
train,
session,
shuffle,
logistic,
logistic',
sigmoid,
sigmoid',
relu,
relu',
crossEntropy,
genSeed,
replaceVector
@@ -88,11 +90,17 @@ module Sibe
randomLayer seed (input, h) a :-
randomNetwork (seed + 1) h hs output
logistic :: Vector Double -> Vector Double
logistic x = 1 / (1 + exp (-x))
sigmoid :: Vector Double -> Vector Double
sigmoid x = 1 / max (1 + exp (-x)) 1e-10
logistic' :: Vector Double -> Vector Double
logistic' x = logistic x * (1 - logistic x)
sigmoid' :: Vector Double -> Vector Double
sigmoid' x = sigmoid x * (1 - sigmoid x)
relu :: Vector Double -> Vector Double
relu x = log (max (1 + exp x) 1e-10)
relu' :: Vector Double -> Vector Double
relu' = sigmoid
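Note that relu here is actually the smooth softplus log(1 + e^x) rather than max 0 x, which is why its derivative relu' can simply reuse sigmoid. A scalar sanity check, sketched with plain Doubles instead of the hmatrix vectors used above:

    softplus, softplus' :: Double -> Double
    softplus x = log (max (1 + exp x) 1e-10)  -- mirrors relu
    softplus' x = 1 / (1 + exp (-x))          -- mirrors relu' = sigmoid

    -- softplus 0  == log 2 ~ 0.6931
    -- softplus' 0 == 0.5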
crossEntropy :: Output -> Output -> Double
crossEntropy output target =
@@ -100,7 +108,7 @@ module Sibe
n = fromIntegral (length pairs)
in (-1 / n) * sum (map f pairs)
where
f (a, y) = y * log a + (1 - y) * log (1 - a)
f (a, y) = y * log (max 1e-10 a) + (1 - y) * log (max (1 - a) 1e-10)
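The 1e-10 clamp keeps log away from zero: a perfectly confident wrong output (a exactly 0 or 1) would otherwise contribute -Infinity and turn the whole cost into NaN. The same term, sketched with plain Doubles:

    f :: (Double, Double) -> Double
    f (a, y) = y * log (max 1e-10 a) + (1 - y) * log (max (1 - a) 1e-10)

    -- f (0, 1) == log 1e-10 ~ -23.03, instead of -Infinity without the clamp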
train :: Input
-> Network
@@ -114,8 +122,8 @@ module Sibe
let y = runLayer input l
o = fn y
delta = o - target
-- de = delta * fn' y -- quadratic cost
de = delta -- cross entropy cost
de = delta * fn' y
-- de = delta -- cross entropy cost
biases' = biases - scale alpha de
weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
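The change above switches the error term from the cross-entropy form back to the quadratic one. In comment form, the distinction being toggled:

    -- quadratic cost:     C = 0.5 * (o - t)^2, so dC/dy = (o - t) * fn' y
    -- cross-entropy cost: dC/dy = (o - t)  -- fn' cancels for sigmoid outputs,
    --                                      -- avoiding slow learning when
    --                                      -- fn' y is near zero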
@@ -174,3 +182,6 @@ module Sibe
rrow index (x:xs)
  | index == 0 = value:xs
  | otherwise = x : rrow (index - 1) xs
clip :: Double -> (Double, Double) -> Double
clip x (l, u) = min u (max l x)
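clip just clamps a value into a closed interval, for example:

    -- clip 1.5  (0, 1) == 1.0
    -- clip (-2) (0, 1) == 0.0
    -- clip 0.3  (0, 1) == 0.3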

View File

@@ -1,15 +1,17 @@
module Sibe.NaiveBayes
(Document(..),
NB(..),
createDocuments,
initialize,
calculate,
determine,
train,
run,
ordNub,
accuracy,
precision,
recall,
fmeasure,
mean,
stdev,
cleanText,
cleanDocuments,
)
where
import Data.List
@@ -18,47 +20,126 @@ module Sibe.NaiveBayes
import Data.List.Split
import Data.Maybe
import Control.Arrow ((&&&))
type Class = Int
import Text.Regex.PCRE
import Data.Char (isSpace)
import NLP.Stemmer
type Class = Int
data Document = Document { text :: String
, c :: Class
} deriving (Eq, Show, Read)
data NB = NB { vocabulary :: Double
data NB = NB { documents :: [Document]
, classes :: [(Class, Double)]
, vocabulary :: Int
, megadoc :: String
}
, cd :: [(Class, [Document])]
, cw :: [(Class, [(String, Int)])]
} deriving (Eq, Show, Read)
initialize :: [Document] -> NB
initialize documents =
let megadoc = concatMap (\(Document text _) -> text ++ " ") documents
train :: [Document] -> [Class] -> NB
train documents classes =
let megadoc = concatDocs documents
vocabulary = genericLength ((ordNub . words) megadoc)
in NB vocabulary megadoc
-- (class, prior probability)
cls = zip classes (map classPrior classes)
determine :: String -> NB -> [Class] -> [Document] -> Class
determine text nb classes documents =
let scores = zip [0..] (map (\cls -> calculate text nb cls documents) classes)
m = maximumBy (\(i0, c0) (i1, c1) -> c0 `compare` c1) scores
in fst m
-- (class, [document])
cd = zip classes (map classDocs classes)
calculate :: String -> NB -> Class -> [Document] -> Double
calculate text (NB vocabulary megadoc) cls documents =
let docs = filter (\(Document text c) -> c == cls) documents
texts = map (\(Document text _) -> text ++ " ") docs
classText = concat texts
classWords = words classText
c = genericLength classWords
pc = genericLength docs / genericLength documents
in pc * product (map (cword classWords c) (words text))
-- (class, [(word, count)])
cw = zip classes $ l (map classWordsCounts classes)
in NB { documents = documents
, classes = cls
, vocabulary = vocabulary
, megadoc = megadoc
, cd = cd
, cw = cw
}
where
cword classWords c word =
let wc = genericLength (filter (==word) classWords)
in (wc + 1) / (c + vocabulary)
concatDocs = concatMap (\(Document text _) -> text ++ " ")
createDocuments classes content =
let splitted = splitOn (replicate 10 '-' ++ "\n") content
pairs = map ((head . lines) &&& (concat . tail . lines)) splitted
documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs
in documents
classDocs x = filter ((==x) . c) documents
classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
classWords x = words (classMegadoc x)
classNGram n = ngram n . classMegadoc
classVocabulary x = ordNub (classWords x)
classPrior x = genericLength (classDocs x) / genericLength documents
countWordInDoc d w = genericLength (filter (==w) d)
classWordsCounts x =
let voc = classVocabulary x
in zip voc $ map (countWordInDoc (classWords x)) voc
ngram :: Int -> String -> [String]
ngram n text =
let ws = words text
in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
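For reference, ngram slides a window of n consecutive words over the text; the trailing windows come out shorter because drop runs off the end of the list:

    -- ngram 2 "the quick brown fox"
    --   == ["the quick", "quick brown", "brown fox", "fox"]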
run :: String -> NB -> Class
run text (NB documents classes vocabulary megadoc cd cw) =
let scores = map (score . fst) classes
in argmax scores
where
score c =
let prior = snd (classes !! c)
in prior * product (map (prob c) (words text))
prob c w =
let tctM = find ((==w) . fst) (snd (cw !! c))
tct = (snd . fromJust) tctM
cvoc = (genericLength . snd) (cw !! c)
voc = vocabulary
in
if isJust tctM then
realToFrac (tct + 1) / realToFrac (cvoc + voc)
else
1 / realToFrac (cvoc + voc)
argmax :: (Ord a) => [a] -> Int
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
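So run scores each class as its prior times a product of add-one-smoothed word probabilities: a word seen tct times in the class contributes (tct + 1) / (cvoc + voc), and an unseen word 1 / (cvoc + voc), where cvoc is the class's distinct-word count and voc the corpus vocabulary size. argmax then returns the index of the best score:

    -- argmax [0.1, 0.7, 0.2] == 1

One caveat worth flagging: multiplying many probabilities below 1 can underflow on long inputs; a common refinement (not applied here) is to sum log-probabilities instead.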
mean :: [Double] -> Double
mean x = sum x / genericLength x
stdev :: [Double] -> Double
stdev x =
let avg = mean x
variance = sum (map ((^2) . subtract avg) x) / (genericLength x - 1)
in sqrt variance
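stdev is the sample standard deviation, hence the n - 1 (Bessel-corrected) denominator. A worked check:

    -- mean  [1, 2, 3] == 2.0
    -- stdev [1, 2, 3] == sqrt ((1 + 0 + 1) / 2) == 1.0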
cleanText :: String -> String
cleanText string =
let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
stemmed = unwords $ map (stem Porter) (words spacify)
in stemmed
where
trim = f . f
where
f = reverse . dropWhile isSpace
replace needle replacement =
map (\c -> if c == needle then replacement else c)
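So cleanText trims the string, drops punctuation, turns separators into spaces, and Porter-stems each word. Roughly (the exact stems depend on the stemmer package):

    -- cleanText "cats, running!" ~> "cat run"
    -- the ',' becomes a space, the '!' is dropped, then each word is stemmed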
cleanDocuments :: [Document] -> [Document]
cleanDocuments documents =
let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
wc = wordCounts (concatDocs cleaned)
wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
stopwords = l $ map fst (take 30 wlist)
wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
in wstopwords
where
vocabulary x = ordNub (words x)
countWordInDoc d w = genericLength (filter (==w) d)
wordCounts x =
let voc = vocabulary x
in zip voc $ map (countWordInDoc (words x)) voc
removeWords list text =
unwords $ filter (`notElem` list) (words text)
concatDocs = concatMap (\(Document text _) -> text ++ " ")
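Rather than shipping a fixed stopword list, cleanDocuments treats the 30 most frequent words of the cleaned corpus as stopwords and strips them from every document; on English news-style text those tend to be words like "the", "of" and "and". The cutoff of 30 is an arbitrary knob, and on a very small corpus it can swallow meaningful words:

    -- e.g. a frequency list like [("the", 912), ("of", 540), ...] (counts
    -- hypothetical) yields stopwords = its first 30 words, removed everywhere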
l :: (Show a) => a -> a
l a = trace (show a) a
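The helper l is a debugging aid: trace (assuming Debug.Trace is imported in this file) prints its argument to stderr and returns it unchanged, so the per-class word counts in cw get dumped while train runs. It would normally be removed once the numbers look right:

    -- l [1, 2, 3]  prints [1,2,3] and evaluates to [1,2,3]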
@@ -100,5 +181,8 @@ module Sibe.NaiveBayes
then 0
else t / y
fmeasure :: Double -> Double -> Double
fmeasure r p = (2 * p * r) / (p + r)
fmeasure :: [(Int, Int)] -> Double
fmeasure results =
let r = recall results
p = precision results
in (2 * p * r) / (p + r)
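fmeasure now computes the F1 score directly from the (prediction, truth) pairs by reusing recall and precision: F1 = 2pr / (p + r). A quick numeric check:

    -- with p == 0.5 and r == 1.0:
    -- (2 * 0.5 * 1.0) / (0.5 + 1.0) ~ 0.667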