diff --git a/examples/naivebayes-doc-classifier.hs b/examples/naivebayes-doc-classifier.hs index 754a134..a16071e 100644 --- a/examples/naivebayes-doc-classifier.hs +++ b/examples/naivebayes-doc-classifier.hs @@ -8,32 +8,55 @@ module Main import Debug.Trace import Data.List.Split import Control.Arrow ((&&&)) + import Control.Monad (when) + import System.Environment main = do + args <- getArgs dataset <- readFile "examples/doc-classifier-data/data-reuters" test <- readFile "examples/doc-classifier-data/data-reuters-test" classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes" sws <- lines <$> readFile "examples/stopwords" + let verbose = or [elem "-v" args, elem "--verbose" args] + when (not verbose) $ putStrLn "use --verbose to print more information" + let intClasses = [0..length classes - 1] documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset testDocuments = cleanDocuments $ createDocuments classes test devTestDocuments = take 30 testDocuments nb = train documents intClasses - results = map (\(Document text c) -> (c, run text nb)) testDocuments - -- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments + results = session testDocuments nb + -- results = session devTestDocuments nb - print (text $ head documents) + when verbose $ print (text $ head documents) let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r) - mapM_ showResults results + when verbose $ mapM_ showResults results - putStrLn $ "Recall: " ++ show (recall results) - putStrLn $ "Precision: " ++ show (precision results) - putStrLn $ "F Measure: " ++ show (fmeasure results) - putStrLn $ "Accuracy: " ++ show (accuracy results) + when verbose $ + putStrLn $ "The training data is imbalanced which causes the classifier to be biased towards\n" + ++ "some classes, `earn` is an example, the class alone has around 90% accuracy while\n" + ++ "the rest of classes have a much lower accuracy and it's commonly seen that most inputs\n" + ++ "are incorrectly classified as `earn`.\n" + + let + accuracies = + let as = zip intClasses $ map (\c -> filter ((==c) . fst) results) intClasses + av = filter (not . null . snd) as + calculated = map (fst &&& accuracy . snd) av + in sortBy (\(_, a) (_, b) -> b `compare` a) calculated + + when verbose $ + mapM_ (\(c, a) -> putStrLn $ "Accuracy(" ++ classes !! c ++ ") = " ++ show a) accuracies + + putStrLn $ "\nAverages: " + putStrLn $ "Recall = " ++ show (recall results) + putStrLn $ "Precision = " ++ show (precision results) + putStrLn $ "F Measure = " ++ show (fmeasure results) + putStrLn $ "Accuracy = " ++ show (accuracy results) createDocuments classes content = let splitted = splitOn (replicate 10 '-' ++ "\n") content diff --git a/src/Sibe/NaiveBayes.hs b/src/Sibe/NaiveBayes.hs index 0cf17bd..db10970 100644 --- a/src/Sibe/NaiveBayes.hs +++ b/src/Sibe/NaiveBayes.hs @@ -3,6 +3,7 @@ module Sibe.NaiveBayes NB(..), train, run, + session, ordNub, accuracy, precision, @@ -91,22 +92,30 @@ module Sibe.NaiveBayes grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs in unwords ("":grams) + session :: [Document] -> NB -> [(Class, (Class, Double))] + session docs nb = + let results = map (\(Document text c) -> (c, run text nb)) docs + in results + run :: String -> NB -> (Class, Double) run txt (NB documents classes vocabulary megadoc cd cw cgram) = let scores = map (score . fst) classes index = argmax scores m = maximum scores - confidence = m / sum scores - in (index, 0) + in (index, m) where score c = let prior = snd (classes !! c) -- below is the formula according to Multinominal Naive Bayes, but it seems - -- using a uniform prior probability seems to work better + -- using a uniform prior probability seems to work better when working with imbalanced + -- training datasets, instead, we help rare classes get higher scores using + -- alpha = (1 - prior * ALPHA) -- in prior * product (map (prob c) (words txt)) - in product (map (prob c) (words txt)) + alpha = 1 - (log 1 + prior) + + in alpha * product (map (prob c) (words txt)) prob c w = let tctM = find ((== w) . fst) (snd (cw !! c))