diff --git a/examples/naivebayes-doc-classifier.hs b/examples/naivebayes-doc-classifier.hs
index f842e8c..754a134 100644
--- a/examples/naivebayes-doc-classifier.hs
+++ b/examples/naivebayes-doc-classifier.hs
@@ -14,26 +14,20 @@ module Main

   test <- readFile "examples/doc-classifier-data/data-reuters-test"
   classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
+  sws <- lines <$> readFile "examples/stopwords"

   let intClasses = [0..length classes - 1]
-  -- let intClasses = [0, 1]
-      documents = cleanDocuments $ createDocuments classes dataset
-      -- documents = [Document "Chinese Beijing Chinese" 0,
-      --              Document "Chinese Chinese Shanghai" 0,
-      --              Document "Chinese Macao" 0,
-      --              Document "Japan Tokyo Chinese" 1]
-      -- testDocuments = [Document "Chinese Chinese Chinese Japan Tokyo" 0]
+      documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
       testDocuments = cleanDocuments $ createDocuments classes test
       devTestDocuments = take 30 testDocuments
-      -- devTestDocuments = [Document "Chinese Chinese Chinese Tokyo Japan" 0]
       nb = train documents intClasses

       results = map (\(Document text c) -> (c, run text nb)) testDocuments
       -- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments

-  -- print (text $ head documents)
+  print (text $ head documents)

-  let showResults (c, r) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
+  let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
   mapM_ showResults results

   putStrLn $ "Recall: " ++ show (recall results)
diff --git a/examples/stopwords b/examples/stopwords
new file mode 100644
index 0000000..87490d1
--- /dev/null
+++ b/examples/stopwords
@@ -0,0 +1,667 @@
+a
+able
+about
+above
+abst
+accordance
+according
+accordingly
+across
+act
+actually
+added
+adj
+affected
+affecting
+affects
+after
+afterwards
+again
+against
+ah
+all
+almost
+alone
+along
+already
+also
+although
+always
+am
+among
+amongst
+an
+and
+announce
+another
+any
+anybody
+anyhow
+anymore
+anyone
+anything
+anyway
+anyways
+anywhere
+apparently
+approximately
+are
+aren
+arent
+arise
+around
+as
+aside
+ask
+asking
+at
+auth
+available
+away
+awfully
+b
+back
+be
+became
+because
+become
+becomes
+becoming
+been
+before
+beforehand
+begin
+beginning
+beginnings
+begins
+behind
+being
+believe
+below
+beside
+besides
+between
+beyond
+biol
+both
+brief
+briefly
+but
+by
+c
+ca
+came
+can
+cannot
+can't
+cause
+causes
+certain
+certainly
+co
+com
+come
+comes
+contain
+containing
+contains
+could
+couldnt
+d
+date
+did
+didn't
+different
+do
+does
+doesn't
+doing
+done
+don't
+down
+downwards
+due
+during
+e
+each
+ed
+edu
+effect
+eg
+eight
+eighty
+either
+else
+elsewhere
+end
+ending
+enough
+especially
+et
+et-al
+etc
+even
+ever
+every
+everybody
+everyone
+everything
+everywhere
+ex
+except
+f
+far
+few
+ff
+fifth
+first
+five
+fix
+followed
+following
+follows
+for
+former
+formerly
+forth
+found
+four
+from
+further
+furthermore
+g
+gave
+get
+gets
+getting
+give
+given
+gives
+giving
+go
+goes
+gone
+got
+gotten
+h
+had
+happens
+hardly
+has
+hasn't
+have
+haven't
+having
+he
+hed
+hence
+her
+here
+hereafter
+hereby
+herein
+heres
+hereupon
+hers
+herself
+hes
+hi
+hid
+him
+himself
+his
+hither
+home
+how
+howbeit
+however
+hundred
+i
+id
+ie
+if
+i'll
+im
+immediate
+immediately
+importance
+important
+in
+inc
+indeed
+index
+information
+instead
+into
+invention
+inward
+is
+isn't
+it
+itd
+it'll
+its
+itself
+i've
+j
+just
+k
+keep
+keeps
+kept
+kg
+km
+know
+known
+knows
+l
+largely
+last
+lately
+later
+latter
+latterly
+least
+less
+lest
+let
+lets
+like
+liked
+likely
+line
+little
+'ll
+look
+looking
+looks
+ltd
+m
+made
+mainly
+make
+makes
+many
+may
+maybe
+me
+mean
+means
+meantime
+meanwhile
+merely
+mg
+might
+million
+miss
+ml
+more
+moreover
+most
+mostly
+mr
+mrs
+much
+mug
+must
+my
+myself
+n
+na
+name
+namely
+nay
+nd
+near
+nearly
+necessarily
+necessary
+need
+needs
+neither
+never
+nevertheless
+new
+next
+nine
+ninety
+no
+nobody
+non
+none
+nonetheless
+noone
+nor
+normally
+nos
+not
+noted
+nothing
+now
+nowhere
+o
+obtain
+obtained
+obviously
+of
+off
+often
+oh
+ok
+okay
+old
+omitted
+on
+once
+one
+ones
+only
+onto
+or
+ord
+other
+others
+otherwise
+ought
+our
+ours
+ourselves
+out
+outside
+over
+overall
+owing
+own
+p
+page
+pages
+part
+particular
+particularly
+past
+per
+perhaps
+placed
+please
+plus
+poorly
+possible
+possibly
+potentially
+pp
+predominantly
+present
+previously
+primarily
+probably
+promptly
+proud
+provides
+put
+q
+que
+quickly
+quite
+qv
+r
+ran
+rather
+rd
+re
+readily
+really
+recent
+recently
+ref
+refs
+regarding
+regardless
+regards
+related
+relatively
+research
+respectively
+resulted
+resulting
+results
+right
+run
+s
+said
+same
+saw
+say
+saying
+says
+sec
+section
+see
+seeing
+seem
+seemed
+seeming
+seems
+seen
+self
+selves
+sent
+seven
+several
+shall
+she
+shed
+she'll
+shes
+should
+shouldn't
+show
+showed
+shown
+showns
+shows
+significant
+significantly
+similar
+similarly
+since
+six
+slightly
+so
+some
+somebody
+somehow
+someone
+somethan
+something
+sometime
+sometimes
+somewhat
+somewhere
+soon
+sorry
+specifically
+specified
+specify
+specifying
+still
+stop
+strongly
+sub
+substantially
+successfully
+such
+sufficiently
+suggest
+sup
+sure
+t
+take
+taken
+taking
+tell
+tends
+th
+than
+thank
+thanks
+thanx
+that
+that'll
+thats
+that've
+the
+their
+theirs
+them
+themselves
+then
+thence
+there
+thereafter
+thereby
+thered
+therefore
+therein
+there'll
+thereof
+therere
+theres
+thereto
+thereupon
+there've
+these
+they
+theyd
+they'll
+theyre
+they've
+think
+this
+those
+thou
+though
+thoughh
+thousand
+throug
+through
+throughout
+thru
+thus
+til
+tip
+to
+together
+too
+took
+toward
+towards
+tried
+tries
+truly
+try
+trying
+ts
+twice
+two
+u
+un
+under
+unfortunately
+unless
+unlike
+unlikely
+until
+unto
+up
+upon
+ups
+us
+use
+used
+useful
+usefully
+usefulness
+uses
+using
+usually
+v
+value
+various
+'ve
+very
+via
+viz
+vol
+vols
+vs
+w
+want
+wants
+was
+wasnt
+way
+we
+wed
+welcome
+we'll
+went
+were
+werent
+we've
+what
+whatever
+what'll
+whats
+when
+whence
+whenever
+where
+whereafter
+whereas
+whereby
+wherein
+wheres
+whereupon
+wherever
+whether
+which
+while
+whim
+whither
+who
+whod
+whoever
+whole
+who'll
+whom
+whomever
+whos
+whose
+why
+widely
+willing
+wish
+with
+within
+without
+wont
+words
+world
+would
+wouldnt
+www
+x
+y
+yes
+yet
+you
+youd
+you'll
+your
+youre
+yours
+yourself
+yourselves
+you've
+z
+zero
diff --git a/src/Sibe/NaiveBayes.hs b/src/Sibe/NaiveBayes.hs
index 1521a61..0cf17bd 100644
--- a/src/Sibe/NaiveBayes.hs
+++ b/src/Sibe/NaiveBayes.hs
@@ -12,6 +12,9 @@ module Sibe.NaiveBayes
    stdev,
    cleanText,
    cleanDocuments,
+   ngram,
+   removeWords,
+   removeStopwords,
   )
   where
     import Data.List
@@ -21,7 +24,7 @@ module Sibe.NaiveBayes
     import Data.Maybe
     import Control.Arrow ((&&&))
     import Text.Regex.PCRE
-    import Data.Char (isSpace)
+    import Data.Char (isSpace, isNumber)
     import NLP.Stemmer

     type Class = Int;
@@ -36,6 +39,7 @@ module Sibe.NaiveBayes
                  , megadoc :: String
                  , cd :: [(Class, [Document])]
                  , cw :: [(Class, [(String, Int)])]
+                 , cgram :: [(Class, [(String, Int)])]
                  } deriving (Eq, Show, Read)

     train :: [Document] -> [Class] -> NB
@@ -49,7 +53,9 @@ module Sibe.NaiveBayes
           cd = zip classes (map classDocs classes)

           -- (class, [(word, count)])
-          cw = zip classes $ l (map classWordsCounts classes)
+          cw = zip classes $ map classWordsCounts classes
+
+          cgram = zip classes $ map classNGramCounts classes

       in NB { documents = documents
             , classes = cls
@@ -57,44 +63,60 @@ module Sibe.NaiveBayes
             , megadoc = megadoc
             , cd = cd
             , cw = cw
+            , cgram = cgram
             }

       where
         concatDocs = concatMap (\(Document text _) -> text ++ " ")
         classDocs x = filter ((==x) . c) documents
-        classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
-        classWords x = words (classMegadoc x)
-        classNGram n = ngram n . classMegadoc
-        classVocabulary x = ordNub (classWords x)
+        classMegadoc = concatMap (\(Document text _) -> text ++ " ") . classDocs
+        classWords = words . classMegadoc
+        classNGram = concatMap (\(Document text _) -> text ++ " ") . ngram 2 . classDocs
+        classNGramWords = words . classNGram
+        classVocabulary = ordNub . classWords
         classPrior x = genericLength (classDocs x) / genericLength documents
         countWordInDoc d w = genericLength (filter (==w) d)
-        classWordsCounts x =
-          let voc = classVocabulary x
-          in zip voc $ map (countWordInDoc (classWords x)) voc
+        wordsCount ws voc =
+          zip voc $ map (countWordInDoc ws) voc
+        classWordsCounts x = wordsCount (classWords x) (classVocabulary x)
+        classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x)

-    ngram :: Int -> String -> [String]
-    ngram n text =
-      let ws = words text
-      in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
+    ngram :: Int -> [Document] -> [Document]
+    ngram n documents =
+      map (\(Document text c) -> Document (helper text) c) documents
+      where
+        helper text =
+          let ws = words text
+              pairs = zip [0..] ws
+              grams = map (\(i, w) -> intercalate "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
+          in unwords ("":grams)

-    run :: String -> NB -> Class
-    run text (NB documents classes vocabulary megadoc cd cw) =
+    run :: String -> NB -> (Class, Double)
+    run txt (NB documents classes vocabulary megadoc cd cw cgram) =
       let scores = map (score . fst) classes
-      in argmax scores
+          index = argmax scores
+          m = maximum scores
+          confidence = m / sum scores
+      in (index, confidence)
       where
         score c =
           let prior = snd (classes !! c)
-          in prior * product (map (prob c) (words text))
+
+          -- below is the formula according to Multinomial Naive Bayes, but
+          -- using a uniform prior probability seems to work better here
+          -- in prior * product (map (prob c) (words txt))
+
+          in product (map (prob c) (words txt))
+
        prob c w =
-          let tctM = find ((==w) . fst) (snd (cw !! c))
-              tct = (snd . fromJust) tctM
-              cvoc = (genericLength . snd) (cw !! c)
+          let tctM = find ((== w) . fst) (snd (cw !! c))
+              tct = if isJust tctM then (snd . fromJust) tctM else 0
+              cvoc = sum $ map snd (snd (cw !! c))
              voc = vocabulary
-          in
-            if isJust tctM then
-              realToFrac (tct + 1) / realToFrac (cvoc + voc)
-            else
-              1 / realToFrac (cvoc + voc)
+              gram = find ((==w) . last . splitOn "_" . fst) (snd (cgram !! c))
+              pg = if isJust gram then (snd . fromJust) gram else 0
+          -- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
+          in realToFrac (tct + 1) / realToFrac (cvoc + voc)

     argmax :: (Ord a) => [a] -> Int
     argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
@@ -113,7 +135,8 @@ module Sibe.NaiveBayes
       let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
           spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
           stemmed = unwords $ map (stem Porter) (words spacify)
-      in stemmed
+          nonumber = filter (not . isNumber) stemmed
+      in (unwords . words) nonumber
       where
         trim = f . f
           where
@@ -124,11 +147,21 @@ module Sibe.NaiveBayes
     cleanDocuments :: [Document] -> [Document]
     cleanDocuments documents =
       let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
-          wc = wordCounts (concatDocs cleaned)
+      in cleaned
+
+    removeWords :: [String] -> [Document] -> [Document]
+    removeWords ws documents =
+      map (\(Document text c) -> Document (rm ws text) c) documents
+      where
+        rm list text =
+          unwords $ filter (`notElem` list) (words text)
+
+    removeStopwords :: Int -> [Document] -> [Document]
+    removeStopwords i documents =
+      let wc = wordCounts (concatDocs documents)
           wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
-          stopwords = l $ map fst (take 30 wlist)
-          wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
-      in wstopwords
+          stopwords = map fst (take i wlist)
+      in removeWords stopwords documents
       where
         vocabulary x = ordNub (words x)
         countWordInDoc d w = genericLength (filter (==w) d)
@@ -136,9 +169,6 @@ module Sibe.NaiveBayes
           let voc = vocabulary x
           in zip voc $ map (countWordInDoc (words x)) voc

-        removeWords list text =
-          unwords $ filter (`notElem` list) (words text)
-
         concatDocs = concatMap (\(Document text _) -> text ++ " ")

     l :: (Show a) => a -> a
@@ -151,37 +181,38 @@ module Sibe.NaiveBayes
         go s (x:xs) = if x `Set.member` s
                       then go s xs
                       else x : go (Set.insert x s) xs

-    accuracy :: [(Int, Int)] -> Double
+    accuracy :: [(Int, (Int, Double))] -> Double
     accuracy results =
-      let correct = filter (uncurry (==)) results
+      let pairs = map (\(a, b) -> (a, fst b)) results
+          correct = filter (uncurry (==)) pairs
       in genericLength correct / genericLength results

-    recall :: [(Int, Int)] -> Double
+    recall :: [(Int, (Int, Double))] -> Double
     recall results =
       let classes = ordNub (map fst results)
           s = sum (map rec classes) / genericLength classes
       in s
       where
         rec a =
-          let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
-              y = genericLength $ filter (\(c, r) -> c == a) results
+          let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
+              y = genericLength $ filter (\(c, (r, _)) -> c == a) results
          in t / y

-    precision :: [(Int, Int)] -> Double
+    precision :: [(Int, (Int, Double))] -> Double
     precision results =
       let classes = ordNub (map fst results)
           s = sum (map prec classes) / genericLength classes
       in s
       where
         prec a =
-          let t = genericLength $ filter (\(c, r) -> c == r && c == a) results
-              y = genericLength $ filter (\(c, r) -> r == a) results
+          let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
+              y = genericLength $ filter (\(c, (r, _)) -> r == a) results
          in if y == 0
             then 0
             else t / y

-    fmeasure :: [(Int, Int)] -> Double
+    fmeasure :: [(Int, (Int, Double))] -> Double
     fmeasure results =
       let r = recall results
           p = precision results
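
Usage sketch (not part of the patch): a minimal program showing how the changed API fits together, built from the toy corpus in the comments this patch deletes from naivebayes-doc-classifier.hs. The explicit import list is an assumption; it presumes Sibe.NaiveBayes exports Document(..), train, and run, as the example program implies. With run's new type, the second component of the result is the winning class's share of the summed class scores, a rough confidence value.

    module Main where

    import Sibe.NaiveBayes (Document (..), train, run)

    main :: IO ()
    main = do
      -- toy corpus from the removed comments: class 0 ~ Chinese, class 1 ~ Japanese
      let docs = [ Document "Chinese Beijing Chinese" 0
                 , Document "Chinese Chinese Shanghai" 0
                 , Document "Chinese Macao" 0
                 , Document "Japan Tokyo Chinese" 1
                 ]
          nb = train docs [0, 1]
          -- `run` now returns the predicted class index together with a confidence
          (predicted, confidence) = run "Chinese Chinese Chinese Tokyo Japan" nb
      putStrLn $ "predicted class: " ++ show predicted
      putStrLn $ "confidence: " ++ show confidence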
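
A note on the reworked ngram: it now maps over Documents and joins each gram's words with underscores, which is the shape the splitOn "_" lookup against cgram in prob expects; the final word falls out as a bare unigram, and unwords ("":grams) prepends a harmless leading space. A hypothetical GHCi session (assuming Document derives Show and has the text and c fields used elsewhere in the module):

    λ> ngram 2 [Document "new york times" 0]
    [Document {text = " new_york york_times times", c = 0}]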