feat(stopwords): removeWords and removeStopwords functions as pre-processors

feat(confidence, WIP): calculate confidence of each classification
This commit is contained in:
Mahdi Dibaiee
2016-08-08 10:02:26 +04:30
parent ea1f05f001
commit 099c25e166
3 changed files with 742 additions and 52 deletions

View File

@ -12,6 +12,9 @@ module Sibe.NaiveBayes
stdev,
cleanText,
cleanDocuments,
ngram,
removeWords,
removeStopwords,
)
where
import Data.List
@ -21,7 +24,7 @@ module Sibe.NaiveBayes
import Data.Maybe
import Control.Arrow ((&&&))
import Text.Regex.PCRE
import Data.Char (isSpace)
import Data.Char (isSpace, isNumber)
import NLP.Stemmer
type Class = Int;
@ -36,6 +39,7 @@ module Sibe.NaiveBayes
, megadoc :: String
, cd :: [(Class, [Document])]
, cw :: [(Class, [(String, Int)])]
, cgram :: [(Class, [(String, Int)])]
} deriving (Eq, Show, Read)
train :: [Document] -> [Class] -> NB
@ -49,7 +53,9 @@ module Sibe.NaiveBayes
cd = zip classes (map classDocs classes)
-- (class, [(word, count)])
cw = zip classes $ l (map classWordsCounts classes)
cw = zip classes $ map classWordsCounts classes
cgram = zip classes $ map classNGramCounts classes
in NB { documents = documents
, classes = cls
@ -57,44 +63,60 @@ module Sibe.NaiveBayes
, megadoc = megadoc
, cd = cd
, cw = cw
, cgram = cgram
}
where
concatDocs = concatMap (\(Document text _) -> text ++ " ")
classDocs x = filter ((==x) . c) documents
classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
classWords x = words (classMegadoc x)
classNGram n = ngram n . classMegadoc
classVocabulary x = ordNub (classWords x)
classMegadoc = concatMap (\(Document text _) -> text ++ " ") . classDocs
classWords = words . classMegadoc
classNGram = concatMap (\(Document text _) -> text ++ " ") . ngram 2 . classDocs
classNGramWords = words . classNGram
classVocabulary = ordNub . classWords
classPrior x = genericLength (classDocs x) / genericLength documents
countWordInDoc d w = genericLength (filter (==w) d)
classWordsCounts x =
let voc = classVocabulary x
in zip voc $ map (countWordInDoc (classWords x)) voc
wordsCount ws voc =
zip voc $ map (countWordInDoc ws) voc
classWordsCounts x = wordsCount (classWords x) (classVocabulary x)
classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x)
ngram :: Int -> String -> [String]
ngram n text =
let ws = words text
in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
ngram :: Int -> [Document] -> [Document]
ngram n documents =
map (\(Document text c) -> Document (helper text) c) documents
where
helper text =
let ws = words text
pairs = zip [0..] ws
grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
in unwords ("<b>":grams)
run :: String -> NB -> Class
run text (NB documents classes vocabulary megadoc cd cw) =
run :: String -> NB -> (Class, Double)
run txt (NB documents classes vocabulary megadoc cd cw cgram) =
let scores = map (score . fst) classes
in argmax scores
index = argmax scores
m = maximum scores
confidence = m / sum scores
in (index, 0)
where
score c =
let prior = snd (classes !! c)
in prior * product (map (prob c) (words text))
-- below is the formula according to Multinominal Naive Bayes, but it seems
-- using a uniform prior probability seems to work better
-- in prior * product (map (prob c) (words txt))
in product (map (prob c) (words txt))
prob c w =
let tctM = find ((==w) . fst) (snd (cw !! c))
tct = (snd . fromJust) tctM
cvoc = (genericLength . snd) (cw !! c)
let tctM = find ((== w) . fst) (snd (cw !! c))
tct = if isJust tctM then (snd . fromJust) tctM else 0
cvoc = sum $ map snd (snd (cw !! c))
voc = vocabulary
in
if isJust tctM then
realToFrac (tct + 1) / realToFrac (cvoc + voc)
else
1 / realToFrac (cvoc + voc)
gram = find ((==w) . last . splitOn "_" . fst) (snd (cgram !! c))
pg = if isJust gram then (snd . fromJust) gram else 0
-- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
in realToFrac (tct + 1) / realToFrac (cvoc + voc)
argmax :: (Ord a) => [a] -> Int
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
@ -113,7 +135,8 @@ module Sibe.NaiveBayes
let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
stemmed = unwords $ map (stem Porter) (words spacify)
in stemmed
nonumber = filter (not . isNumber) stemmed
in (unwords . words) nonumber
where
trim = f . f
where
@ -124,11 +147,21 @@ module Sibe.NaiveBayes
cleanDocuments :: [Document] -> [Document]
cleanDocuments documents =
let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
wc = wordCounts (concatDocs cleaned)
in cleaned
removeWords :: [String] -> [Document] -> [Document]
removeWords ws documents =
map (\(Document text c) -> Document (rm ws text) c) documents
where
rm list text =
unwords $ filter (`notElem` list) (words text)
removeStopwords :: Int -> [Document] -> [Document]
removeStopwords i documents =
let wc = wordCounts (concatDocs documents)
wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
stopwords = l $ map fst (take 30 wlist)
wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
in wstopwords
stopwords = map fst (take i wlist)
in removeWords stopwords documents
where
vocabulary x = ordNub (words x)
countWordInDoc d w = genericLength (filter (==w) d)
@ -136,9 +169,6 @@ module Sibe.NaiveBayes
let voc = vocabulary x
in zip voc $ map (countWordInDoc (words x)) voc
removeWords list text =
unwords $ filter (`notElem` list) (words text)
concatDocs = concatMap (\(Document text _) -> text ++ " ")
l :: (Show a) => a -> a
@ -151,37 +181,38 @@ module Sibe.NaiveBayes
go s (x:xs) = if x `Set.member` s then go s xs
else x : go (Set.insert x s) xs
accuracy :: [(Int, Int)] -> Double
accuracy :: [(Int, (Int, Double))] -> Double
accuracy results =
let correct = filter (uncurry (==)) results
let pairs = map (\(a, b) -> (a, fst b)) results
correct = filter (uncurry (==)) pairs
in genericLength correct / genericLength results
recall :: [(Int, Int)] -> Double
recall :: [(Int, (Int, Double))] -> Double
recall results =
let classes = ordNub (map fst results)
s = sum (map rec classes) / genericLength classes
in s
where
rec a =
let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
y = genericLength $ filter (\(c, r) -> c == a) results
let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
y = genericLength $ filter (\(c, (r, _)) -> c == a) results
in t / y
precision :: [(Int, Int)] -> Double
precision :: [(Int, (Int, Double))] -> Double
precision results =
let classes = ordNub (map fst results)
s = sum (map prec classes) / genericLength classes
in s
where
prec a =
let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
y = genericLength $ filter (\(c, r) -> r == a) results
let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
y = genericLength $ filter (\(c, (r, _)) -> r == a) results
in
if y == 0
then 0
else t / y
fmeasure :: [(Int, Int)] -> Double
fmeasure :: [(Int, (Int, Double))] -> Double
fmeasure results =
let r = recall results
p = precision results