feat(stopwords): removeWords and removeStopwords functions as pre-processors
feat(confidence, WIP): calculate confidence of each classification
This commit is contained in:
@ -12,6 +12,9 @@ module Sibe.NaiveBayes
|
||||
stdev,
|
||||
cleanText,
|
||||
cleanDocuments,
|
||||
ngram,
|
||||
removeWords,
|
||||
removeStopwords,
|
||||
)
|
||||
where
|
||||
import Data.List
|
||||
@ -21,7 +24,7 @@ module Sibe.NaiveBayes
|
||||
import Data.Maybe
|
||||
import Control.Arrow ((&&&))
|
||||
import Text.Regex.PCRE
|
||||
import Data.Char (isSpace)
|
||||
import Data.Char (isSpace, isNumber)
|
||||
import NLP.Stemmer
|
||||
|
||||
type Class = Int;
|
||||
@ -36,6 +39,7 @@ module Sibe.NaiveBayes
|
||||
, megadoc :: String
|
||||
, cd :: [(Class, [Document])]
|
||||
, cw :: [(Class, [(String, Int)])]
|
||||
, cgram :: [(Class, [(String, Int)])]
|
||||
} deriving (Eq, Show, Read)
|
||||
|
||||
train :: [Document] -> [Class] -> NB
|
||||
@ -49,7 +53,9 @@ module Sibe.NaiveBayes
|
||||
cd = zip classes (map classDocs classes)
|
||||
|
||||
-- (class, [(word, count)])
|
||||
cw = zip classes $ l (map classWordsCounts classes)
|
||||
cw = zip classes $ map classWordsCounts classes
|
||||
|
||||
cgram = zip classes $ map classNGramCounts classes
|
||||
|
||||
in NB { documents = documents
|
||||
, classes = cls
|
||||
@ -57,44 +63,60 @@ module Sibe.NaiveBayes
|
||||
, megadoc = megadoc
|
||||
, cd = cd
|
||||
, cw = cw
|
||||
, cgram = cgram
|
||||
}
|
||||
where
|
||||
concatDocs = concatMap (\(Document text _) -> text ++ " ")
|
||||
|
||||
classDocs x = filter ((==x) . c) documents
|
||||
classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
|
||||
classWords x = words (classMegadoc x)
|
||||
classNGram n = ngram n . classMegadoc
|
||||
classVocabulary x = ordNub (classWords x)
|
||||
classMegadoc = concatMap (\(Document text _) -> text ++ " ") . classDocs
|
||||
classWords = words . classMegadoc
|
||||
classNGram = concatMap (\(Document text _) -> text ++ " ") . ngram 2 . classDocs
|
||||
classNGramWords = words . classNGram
|
||||
classVocabulary = ordNub . classWords
|
||||
classPrior x = genericLength (classDocs x) / genericLength documents
|
||||
countWordInDoc d w = genericLength (filter (==w) d)
|
||||
classWordsCounts x =
|
||||
let voc = classVocabulary x
|
||||
in zip voc $ map (countWordInDoc (classWords x)) voc
|
||||
wordsCount ws voc =
|
||||
zip voc $ map (countWordInDoc ws) voc
|
||||
classWordsCounts x = wordsCount (classWords x) (classVocabulary x)
|
||||
classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x)
|
||||
|
||||
ngram :: Int -> String -> [String]
|
||||
ngram n text =
|
||||
let ws = words text
|
||||
in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
|
||||
ngram :: Int -> [Document] -> [Document]
|
||||
ngram n documents =
|
||||
map (\(Document text c) -> Document (helper text) c) documents
|
||||
where
|
||||
helper text =
|
||||
let ws = words text
|
||||
pairs = zip [0..] ws
|
||||
grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
|
||||
in unwords ("<b>":grams)
|
||||
|
||||
run :: String -> NB -> Class
|
||||
run text (NB documents classes vocabulary megadoc cd cw) =
|
||||
run :: String -> NB -> (Class, Double)
|
||||
run txt (NB documents classes vocabulary megadoc cd cw cgram) =
|
||||
let scores = map (score . fst) classes
|
||||
in argmax scores
|
||||
index = argmax scores
|
||||
m = maximum scores
|
||||
confidence = m / sum scores
|
||||
in (index, 0)
|
||||
where
|
||||
score c =
|
||||
let prior = snd (classes !! c)
|
||||
in prior * product (map (prob c) (words text))
|
||||
|
||||
-- below is the formula according to Multinominal Naive Bayes, but it seems
|
||||
-- using a uniform prior probability seems to work better
|
||||
-- in prior * product (map (prob c) (words txt))
|
||||
|
||||
in product (map (prob c) (words txt))
|
||||
|
||||
prob c w =
|
||||
let tctM = find ((==w) . fst) (snd (cw !! c))
|
||||
tct = (snd . fromJust) tctM
|
||||
cvoc = (genericLength . snd) (cw !! c)
|
||||
let tctM = find ((== w) . fst) (snd (cw !! c))
|
||||
tct = if isJust tctM then (snd . fromJust) tctM else 0
|
||||
cvoc = sum $ map snd (snd (cw !! c))
|
||||
voc = vocabulary
|
||||
in
|
||||
if isJust tctM then
|
||||
realToFrac (tct + 1) / realToFrac (cvoc + voc)
|
||||
else
|
||||
1 / realToFrac (cvoc + voc)
|
||||
gram = find ((==w) . last . splitOn "_" . fst) (snd (cgram !! c))
|
||||
pg = if isJust gram then (snd . fromJust) gram else 0
|
||||
-- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
|
||||
in realToFrac (tct + 1) / realToFrac (cvoc + voc)
|
||||
|
||||
argmax :: (Ord a) => [a] -> Int
|
||||
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
|
||||
@ -113,7 +135,8 @@ module Sibe.NaiveBayes
|
||||
let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
|
||||
spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
|
||||
stemmed = unwords $ map (stem Porter) (words spacify)
|
||||
in stemmed
|
||||
nonumber = filter (not . isNumber) stemmed
|
||||
in (unwords . words) nonumber
|
||||
where
|
||||
trim = f . f
|
||||
where
|
||||
@ -124,11 +147,21 @@ module Sibe.NaiveBayes
|
||||
cleanDocuments :: [Document] -> [Document]
|
||||
cleanDocuments documents =
|
||||
let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
|
||||
wc = wordCounts (concatDocs cleaned)
|
||||
in cleaned
|
||||
|
||||
removeWords :: [String] -> [Document] -> [Document]
|
||||
removeWords ws documents =
|
||||
map (\(Document text c) -> Document (rm ws text) c) documents
|
||||
where
|
||||
rm list text =
|
||||
unwords $ filter (`notElem` list) (words text)
|
||||
|
||||
removeStopwords :: Int -> [Document] -> [Document]
|
||||
removeStopwords i documents =
|
||||
let wc = wordCounts (concatDocs documents)
|
||||
wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
|
||||
stopwords = l $ map fst (take 30 wlist)
|
||||
wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
|
||||
in wstopwords
|
||||
stopwords = map fst (take i wlist)
|
||||
in removeWords stopwords documents
|
||||
where
|
||||
vocabulary x = ordNub (words x)
|
||||
countWordInDoc d w = genericLength (filter (==w) d)
|
||||
@ -136,9 +169,6 @@ module Sibe.NaiveBayes
|
||||
let voc = vocabulary x
|
||||
in zip voc $ map (countWordInDoc (words x)) voc
|
||||
|
||||
removeWords list text =
|
||||
unwords $ filter (`notElem` list) (words text)
|
||||
|
||||
concatDocs = concatMap (\(Document text _) -> text ++ " ")
|
||||
|
||||
l :: (Show a) => a -> a
|
||||
@ -151,37 +181,38 @@ module Sibe.NaiveBayes
|
||||
go s (x:xs) = if x `Set.member` s then go s xs
|
||||
else x : go (Set.insert x s) xs
|
||||
|
||||
accuracy :: [(Int, Int)] -> Double
|
||||
accuracy :: [(Int, (Int, Double))] -> Double
|
||||
accuracy results =
|
||||
let correct = filter (uncurry (==)) results
|
||||
let pairs = map (\(a, b) -> (a, fst b)) results
|
||||
correct = filter (uncurry (==)) pairs
|
||||
in genericLength correct / genericLength results
|
||||
|
||||
recall :: [(Int, Int)] -> Double
|
||||
recall :: [(Int, (Int, Double))] -> Double
|
||||
recall results =
|
||||
let classes = ordNub (map fst results)
|
||||
s = sum (map rec classes) / genericLength classes
|
||||
in s
|
||||
where
|
||||
rec a =
|
||||
let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
|
||||
y = genericLength $ filter (\(c, r) -> c == a) results
|
||||
let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
|
||||
y = genericLength $ filter (\(c, (r, _)) -> c == a) results
|
||||
in t / y
|
||||
|
||||
precision :: [(Int, Int)] -> Double
|
||||
precision :: [(Int, (Int, Double))] -> Double
|
||||
precision results =
|
||||
let classes = ordNub (map fst results)
|
||||
s = sum (map prec classes) / genericLength classes
|
||||
in s
|
||||
where
|
||||
prec a =
|
||||
let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
|
||||
y = genericLength $ filter (\(c, r) -> r == a) results
|
||||
let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
|
||||
y = genericLength $ filter (\(c, (r, _)) -> r == a) results
|
||||
in
|
||||
if y == 0
|
||||
then 0
|
||||
else t / y
|
||||
|
||||
fmeasure :: [(Int, Int)] -> Double
|
||||
fmeasure :: [(Int, (Int, Double))] -> Double
|
||||
fmeasure results =
|
||||
let r = recall results
|
||||
p = precision results
|
||||
|
Reference in New Issue
Block a user