diff --git a/README.md b/README.md
index fa44744..8a4f5d6 100644
--- a/README.md
+++ b/README.md
@@ -41,8 +41,11 @@ stack exec example-xor
 
 # Naive Bayes document classifier, using Reuters dataset, achieves ~62% accuracy
 # using Porter stemming, stopword elimination and a few custom techniques.
-# the dataset is imbalanced which causes the classifier to be biased towards some classes (earn, acq, ...)
+# The dataset is imbalanced, which causes the classifier to be biased towards some classes (earn, acq, ...)
+# To work around the imbalanced dataset, there is a --top-ten option which classifies only the top 10 most popular
+# classes, with evenly split datasets (100 documents each)
# N-Grams don't seem to help us much here (or maybe my implementation is wrong!), using bigrams increases
 # accuracy, while decreasing F-Measure slightly.
 stack exec example-naivebayes-doc-classifier -- --verbose
+stack exec example-naivebayes-doc-classifier -- --verbose --top-ten
 ```
diff --git a/examples/doc-classifier-data b/examples/doc-classifier-data
deleted file mode 160000
index 5b069a5..0000000
--- a/examples/doc-classifier-data
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 5b069a54a6a68efee0ef4bb15c1aa56414f12c28
diff --git a/examples/naivebayes-doc-classifier.hs b/examples/naivebayes-doc-classifier.hs
index a16071e..f0d4985 100644
--- a/examples/naivebayes-doc-classifier.hs
+++ b/examples/naivebayes-doc-classifier.hs
@@ -8,7 +8,8 @@ module Main
   import Debug.Trace
   import Data.List.Split
   import Control.Arrow ((&&&))
-  import Control.Monad (when)
+  import Control.Monad (when, unless)
+  import Data.Function (on)
   import System.Environment
 
   main = do
@@ -19,32 +20,47 @@ module Main
     classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
     sws <- lines <$> readFile "examples/stopwords"
 
-    let verbose = or [elem "-v" args, elem "--verbose" args]
-    when (not verbose) $ putStrLn "use --verbose to print more information"
+    let verbose = elem "-v" args || elem "--verbose" args
+        topten = elem "-10" args || elem "--top-ten" args
+    unless verbose $ putStrLn "use --verbose to print more information"
 
     let intClasses = [0..length classes - 1]
-        documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
+        documents = cleanDocuments . removeWords sws $ createDocuments classes dataset
         testDocuments = cleanDocuments $ createDocuments classes test
-        devTestDocuments = take 30 testDocuments
+
         nb = train documents intClasses
 
-        results = session testDocuments nb
+        -- top-ten: balance the data by keeping the 10 largest classes, 100 documents each
+        topClasses = take 10 . reverse $ sortBy (compare `on` (length . snd)) (cd nb)
+        filtered = map (\(c, ds) -> (c, take 100 ds)) topClasses
+        filteredClasses = map fst filtered
+        ttDocs = concatMap snd filtered
+        ttNB = train ttDocs filteredClasses
+
+        ttTestDocuments = filter ((`elem` filteredClasses) . c) . cleanDocuments $ createDocuments classes test
+
+        ttResults = session ttTestDocuments ttNB
+        normalResults = session testDocuments nb
+        results = if topten then ttResults else normalResults
+
+        iClasses = if topten then filteredClasses else intClasses
         -- results = session devTestDocuments nb
 
-    when verbose $ print (text $ head documents)
+    when verbose . putStrLn $ "# Example of cleaned document:\n" ++ (show . text $ head documents)
 
     let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
     when verbose $ mapM_ showResults results
 
-    when verbose $
+    when (verbose && not topten) .
putStrLn $ "The training data is imbalanced which causes the classifier to be biased towards\n" ++ "some classes, `earn` is an example, the class alone has around 90% accuracy while\n" ++ "the rest of classes have a much lower accuracy and it's commonly seen that most inputs\n" ++ "are incorrectly classified as `earn`.\n" + ++ "Try running with --top-ten to classify top 10 classes by using evenly split documents\n" let accuracies = - let as = zip intClasses $ map (\c -> filter ((==c) . fst) results) intClasses + let as = zip iClasses $ map (\c -> filter ((==c) . fst) results) iClasses av = filter (not . null . snd) as calculated = map (fst &&& accuracy . snd) av in sortBy (\(_, a) (_, b) -> b `compare` a) calculated diff --git a/examples/sentiment-analysis-data b/examples/sentiment-analysis-data deleted file mode 120000 index dbd75b3..0000000 --- a/examples/sentiment-analysis-data +++ /dev/null @@ -1 +0,0 @@ -../../sibe-repos/sentiment-analysis-data \ No newline at end of file diff --git a/examples/sentiment-analysis-data b/examples/sentiment-analysis-data new file mode 100755 index 0000000..f0a3f50 --- /dev/null +++ b/examples/sentiment-analysis-data @@ -0,0 +1,5 @@ +XSym +0040 +3666c4cacaf995ebd11ef25aab70de99 +../../sibe-repos/sentiment-analysis-data + \ No newline at end of file diff --git a/src/Sibe/NaiveBayes.hs b/src/Sibe/NaiveBayes.hs index 4d72968..7afe1f6 100644 --- a/src/Sibe/NaiveBayes.hs +++ b/src/Sibe/NaiveBayes.hs @@ -14,6 +14,7 @@ module Sibe.NaiveBayes cleanText, cleanDocuments, ngram, + ngramText, removeWords, removeStopwords, ) @@ -84,13 +85,14 @@ module Sibe.NaiveBayes ngram :: Int -> [Document] -> [Document] ngram n documents = - map (\(Document text c) -> Document (helper text) c) documents - where - helper text = - let ws = words text - pairs = zip [0..] ws - grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs - in unwords ("":grams) + map (\(Document text c) -> Document (ngramText n text) c) documents + + ngramText :: Int -> String -> String + ngramText n text = + let ws = words text + pairs = zip [0..] ws + grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs + in unwords ("_":grams) session :: [Document] -> NB -> [(Class, (Class, Double))] session docs nb = @@ -102,10 +104,10 @@ module Sibe.NaiveBayes let scores = map (score . fst) classes index = argmax scores m = maximum scores - in (index, m) + in (fst (classes !! index), m) where score c = - let prior = snd (classes !! c) + let prior = snd (fromJust $ find ((==c) . fst) classes) -- below is the formula according to Multinominal Naive Bayes, but it seems -- using a uniform prior probability seems to work better when working with imbalanced @@ -118,11 +120,13 @@ module Sibe.NaiveBayes in alpha * product (map (prob c) (words txt)) prob c w = - let tctM = find ((== w) . fst) (snd (cw !! c)) + let fcw = fromJust $ find ((==c) . fst) cw + fcg = fromJust $ find ((==c) . fst) cgram + tctM = find ((== w) . fst) (snd fcw) tct = if isJust tctM then (snd . fromJust) tctM else 0 - cvoc = sum $ map snd (snd (cw !! c)) + cvoc = sum $ map snd (snd fcw) voc = vocabulary - gram = find ((==w) . last . splitOn "_" . fst) (snd (cgram !! c)) + gram = find ((==w) . last . splitOn "_" . fst) (snd fcg) pg = if isJust gram then (snd . fromJust) gram else 0 -- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams in realToFrac (tct + 1) / realToFrac (cvoc + voc)