feat(verbose): print more information using -v or --verbose flags

Mahdi Dibaiee 2016-08-08 12:35:26 +04:30
parent 099c25e166
commit eebf5e0222
2 changed files with 44 additions and 12 deletions


@@ -8,32 +8,55 @@ module Main
 import Debug.Trace
 import Data.List.Split
 import Control.Arrow ((&&&))
+import Control.Monad (when)
+import System.Environment

 main = do
+  args <- getArgs
+
   dataset <- readFile "examples/doc-classifier-data/data-reuters"
   test <- readFile "examples/doc-classifier-data/data-reuters-test"

   classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
   sws <- lines <$> readFile "examples/stopwords"

+  let verbose = or [elem "-v" args, elem "--verbose" args]
+  when (not verbose) $ putStrLn "use --verbose to print more information"
+
   let intClasses = [0..length classes - 1]
       documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
       testDocuments = cleanDocuments $ createDocuments classes test
       devTestDocuments = take 30 testDocuments
       nb = train documents intClasses

-      results = map (\(Document text c) -> (c, run text nb)) testDocuments
-      -- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments
+      results = session testDocuments nb
+      -- results = session devTestDocuments nb

-  print (text $ head documents)
+  when verbose $ print (text $ head documents)

   let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
-  mapM_ showResults results
+  when verbose $ mapM_ showResults results

-  putStrLn $ "Recall: " ++ show (recall results)
-  putStrLn $ "Precision: " ++ show (precision results)
-  putStrLn $ "F Measure: " ++ show (fmeasure results)
-  putStrLn $ "Accuracy: " ++ show (accuracy results)
+  when verbose $
+    putStrLn $ "The training data is imbalanced, which causes the classifier to be biased towards\n"
+            ++ "some classes; `earn` is an example: that class alone has around 90% accuracy, while\n"
+            ++ "the rest of the classes have much lower accuracy, and most inputs\n"
+            ++ "are incorrectly classified as `earn`.\n"
+
+  let
+    accuracies =
+      let as = zip intClasses $ map (\c -> filter ((==c) . fst) results) intClasses
+          av = filter (not . null . snd) as
+          calculated = map (fst &&& accuracy . snd) av
+      in sortBy (\(_, a) (_, b) -> b `compare` a) calculated
+
+  when verbose $
+    mapM_ (\(c, a) -> putStrLn $ "Accuracy(" ++ classes !! c ++ ") = " ++ show a) accuracies
+
+  putStrLn $ "\nAverages: "
+  putStrLn $ "Recall = " ++ show (recall results)
+  putStrLn $ "Precision = " ++ show (precision results)
+  putStrLn $ "F Measure = " ++ show (fmeasure results)
+  putStrLn $ "Accuracy = " ++ show (accuracy results)

 createDocuments classes content =
   let splitted = splitOn (replicate 10 '-' ++ "\n") content
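
For reference, here is a minimal standalone sketch of the per-class accuracy breakdown introduced above. The Result alias, the sample data, and the stand-in accuracy function are assumptions for illustration only; in the example program the results come from session and accuracy comes from Sibe.NaiveBayes.

    import Data.List (sortBy)
    import Control.Arrow ((&&&))

    -- (expected class, (predicted class, confidence))
    type Result = (Int, (Int, Double))

    -- stand-in for Sibe.NaiveBayes.accuracy: fraction of correct predictions
    accuracy :: [Result] -> Double
    accuracy rs = fromIntegral (length correct) / fromIntegral (length rs)
      where correct = filter (\(c, (r, _)) -> c == r) rs

    -- group results by expected class, drop classes with no test samples,
    -- and sort by per-class accuracy, highest first
    perClassAccuracy :: [Int] -> [Result] -> [(Int, Double)]
    perClassAccuracy intClasses results =
      let as = zip intClasses $ map (\c -> filter ((== c) . fst) results) intClasses
          av = filter (not . null . snd) as
          calculated = map (fst &&& accuracy . snd) av
      in sortBy (\(_, a) (_, b) -> b `compare` a) calculated

    main :: IO ()
    main = mapM_ print (perClassAccuracy [0, 1, 2] sample)
      where sample = [(0, (0, 0.9)), (0, (1, 0.4)), (1, (1, 0.8)), (2, (2, 0.7))]

Running the example with -v or --verbose prints this per-class breakdown, followed by the averaged recall, precision, F measure, and accuracy.
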


@@ -3,6 +3,7 @@ module Sibe.NaiveBayes
   NB(..),
   train,
   run,
+  session,
   ordNub,
   accuracy,
   precision,
@@ -91,22 +92,30 @@ module Sibe.NaiveBayes
       grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
   in unwords ("<b>":grams)

+session :: [Document] -> NB -> [(Class, (Class, Double))]
+session docs nb =
+  let results = map (\(Document text c) -> (c, run text nb)) docs
+  in results
+
 run :: String -> NB -> (Class, Double)
 run txt (NB documents classes vocabulary megadoc cd cw cgram) =
   let scores = map (score . fst) classes
       index = argmax scores
       m = maximum scores
-      confidence = m / sum scores
-  in (index, 0)
+  in (index, m)
   where
     score c =
       let prior = snd (classes !! c)
          -- below is the formula according to Multinomial Naive Bayes, but it seems
-         -- using a uniform prior probability seems to work better
+         -- a uniform prior probability works better when working with imbalanced
+         -- training datasets; instead, we help rare classes get higher scores using
+         -- alpha = (1 - prior * ALPHA)
          -- in prior * product (map (prob c) (words txt))
-      in product (map (prob c) (words txt))
+          alpha = 1 - (log 1 + prior)
+      in alpha * product (map (prob c) (words txt))

     prob c w =
       let tctM = find ((== w) . fst) (snd (cw !! c))
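
The scoring change in run deserves a short note: since log 1 is 0, alpha reduces to 1 - prior, so classes with a large prior (like `earn`) are dampened and rare classes score relatively higher. Below is a minimal sketch of that idea; the toy priors and the stand-in prob function are assumptions for illustration, while the real prob uses the per-class term counts stored in NB.

    -- toy class priors; the list index plays the role of the class id
    priors :: [Double]
    priors = [0.7, 0.2, 0.1]

    -- stand-in for the per-class word probability derived from training counts
    prob :: Int -> String -> Double
    prob c w = 1 / fromIntegral (c + length w + 1)

    -- dampened-prior score mirroring the change above: alpha = 1 - (log 1 + prior) = 1 - prior
    score :: Int -> String -> Double
    score c txt =
      let prior = priors !! c
          alpha = 1 - (log 1 + prior)
      in alpha * product (map (prob c) (words txt))

    main :: IO ()
    main = mapM_ (\c -> print (c, score c "oil prices rise")) [0, 1, 2]

Because alpha shrinks as the prior grows, the dominant class no longer wins on its prior alone; whether this beats the plain multinomial prior is the empirical observation recorded in the code comment.
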