feat(verbose): print more information using -v or --verbose flags
This commit is contained in:
parent
099c25e166
commit
eebf5e0222
@ -8,32 +8,55 @@ module Main
|
|||||||
import Debug.Trace
|
import Debug.Trace
|
||||||
import Data.List.Split
|
import Data.List.Split
|
||||||
import Control.Arrow ((&&&))
|
import Control.Arrow ((&&&))
|
||||||
|
import Control.Monad (when)
|
||||||
|
import System.Environment
|
||||||
|
|
||||||
main = do
|
main = do
|
||||||
|
args <- getArgs
|
||||||
dataset <- readFile "examples/doc-classifier-data/data-reuters"
|
dataset <- readFile "examples/doc-classifier-data/data-reuters"
|
||||||
test <- readFile "examples/doc-classifier-data/data-reuters-test"
|
test <- readFile "examples/doc-classifier-data/data-reuters-test"
|
||||||
|
|
||||||
classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
|
classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
|
||||||
sws <- lines <$> readFile "examples/stopwords"
|
sws <- lines <$> readFile "examples/stopwords"
|
||||||
|
|
||||||
|
let verbose = or [elem "-v" args, elem "--verbose" args]
|
||||||
|
when (not verbose) $ putStrLn "use --verbose to print more information"
|
||||||
|
|
||||||
let intClasses = [0..length classes - 1]
|
let intClasses = [0..length classes - 1]
|
||||||
documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
|
documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
|
||||||
testDocuments = cleanDocuments $ createDocuments classes test
|
testDocuments = cleanDocuments $ createDocuments classes test
|
||||||
devTestDocuments = take 30 testDocuments
|
devTestDocuments = take 30 testDocuments
|
||||||
nb = train documents intClasses
|
nb = train documents intClasses
|
||||||
|
|
||||||
results = map (\(Document text c) -> (c, run text nb)) testDocuments
|
results = session testDocuments nb
|
||||||
-- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments
|
-- results = session devTestDocuments nb
|
||||||
|
|
||||||
print (text $ head documents)
|
when verbose $ print (text $ head documents)
|
||||||
|
|
||||||
let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
|
let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
|
||||||
mapM_ showResults results
|
when verbose $ mapM_ showResults results
|
||||||
|
|
||||||
putStrLn $ "Recall: " ++ show (recall results)
|
when verbose $
|
||||||
putStrLn $ "Precision: " ++ show (precision results)
|
putStrLn $ "The training data is imbalanced which causes the classifier to be biased towards\n"
|
||||||
putStrLn $ "F Measure: " ++ show (fmeasure results)
|
++ "some classes, `earn` is an example, the class alone has around 90% accuracy while\n"
|
||||||
putStrLn $ "Accuracy: " ++ show (accuracy results)
|
++ "the rest of classes have a much lower accuracy and it's commonly seen that most inputs\n"
|
||||||
|
++ "are incorrectly classified as `earn`.\n"
|
||||||
|
|
||||||
|
let
|
||||||
|
accuracies =
|
||||||
|
let as = zip intClasses $ map (\c -> filter ((==c) . fst) results) intClasses
|
||||||
|
av = filter (not . null . snd) as
|
||||||
|
calculated = map (fst &&& accuracy . snd) av
|
||||||
|
in sortBy (\(_, a) (_, b) -> b `compare` a) calculated
|
||||||
|
|
||||||
|
when verbose $
|
||||||
|
mapM_ (\(c, a) -> putStrLn $ "Accuracy(" ++ classes !! c ++ ") = " ++ show a) accuracies
|
||||||
|
|
||||||
|
putStrLn $ "\nAverages: "
|
||||||
|
putStrLn $ "Recall = " ++ show (recall results)
|
||||||
|
putStrLn $ "Precision = " ++ show (precision results)
|
||||||
|
putStrLn $ "F Measure = " ++ show (fmeasure results)
|
||||||
|
putStrLn $ "Accuracy = " ++ show (accuracy results)
|
||||||
|
|
||||||
createDocuments classes content =
|
createDocuments classes content =
|
||||||
let splitted = splitOn (replicate 10 '-' ++ "\n") content
|
let splitted = splitOn (replicate 10 '-' ++ "\n") content
|
||||||
|
@ -3,6 +3,7 @@ module Sibe.NaiveBayes
|
|||||||
NB(..),
|
NB(..),
|
||||||
train,
|
train,
|
||||||
run,
|
run,
|
||||||
|
session,
|
||||||
ordNub,
|
ordNub,
|
||||||
accuracy,
|
accuracy,
|
||||||
precision,
|
precision,
|
||||||
@ -91,22 +92,30 @@ module Sibe.NaiveBayes
|
|||||||
grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
|
grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
|
||||||
in unwords ("<b>":grams)
|
in unwords ("<b>":grams)
|
||||||
|
|
||||||
|
session :: [Document] -> NB -> [(Class, (Class, Double))]
|
||||||
|
session docs nb =
|
||||||
|
let results = map (\(Document text c) -> (c, run text nb)) docs
|
||||||
|
in results
|
||||||
|
|
||||||
run :: String -> NB -> (Class, Double)
|
run :: String -> NB -> (Class, Double)
|
||||||
run txt (NB documents classes vocabulary megadoc cd cw cgram) =
|
run txt (NB documents classes vocabulary megadoc cd cw cgram) =
|
||||||
let scores = map (score . fst) classes
|
let scores = map (score . fst) classes
|
||||||
index = argmax scores
|
index = argmax scores
|
||||||
m = maximum scores
|
m = maximum scores
|
||||||
confidence = m / sum scores
|
in (index, m)
|
||||||
in (index, 0)
|
|
||||||
where
|
where
|
||||||
score c =
|
score c =
|
||||||
let prior = snd (classes !! c)
|
let prior = snd (classes !! c)
|
||||||
|
|
||||||
-- below is the formula according to Multinominal Naive Bayes, but it seems
|
-- below is the formula according to Multinominal Naive Bayes, but it seems
|
||||||
-- using a uniform prior probability seems to work better
|
-- using a uniform prior probability seems to work better when working with imbalanced
|
||||||
|
-- training datasets, instead, we help rare classes get higher scores using
|
||||||
|
-- alpha = (1 - prior * ALPHA)
|
||||||
-- in prior * product (map (prob c) (words txt))
|
-- in prior * product (map (prob c) (words txt))
|
||||||
|
|
||||||
in product (map (prob c) (words txt))
|
alpha = 1 - (log 1 + prior)
|
||||||
|
|
||||||
|
in alpha * product (map (prob c) (words txt))
|
||||||
|
|
||||||
prob c w =
|
prob c w =
|
||||||
let tctM = find ((== w) . fst) (snd (cw !! c))
|
let tctM = find ((== w) . fst) (snd (cw !! c))
|
||||||
|
Loading…
Reference in New Issue
Block a user