feat(topten): top-ten classification with evenly distrubuted data

This commit is contained in:
Mahdi Dibaiee 2016-08-21 00:59:42 +04:30
parent b2888417bb
commit 891f48a2d0
5 changed files with 50 additions and 24 deletions

View File

@ -41,8 +41,11 @@ stack exec example-xor
# Naive Bayes document classifier, using Reuters dataset, achieves ~62% accuracy # Naive Bayes document classifier, using Reuters dataset, achieves ~62% accuracy
# using Porter stemming, stopword elimination and a few custom techniques. # using Porter stemming, stopword elimination and a few custom techniques.
# the dataset is imbalanced which causes the classifier to be biased towards some classes (earn, acq, ...) # The dataset is imbalanced which causes the classifier to be biased towards some classes (earn, acq, ...)
# to workaround the imbalanced dataset problem, there is a --top-ten option which classifies only top 10 popular
# classes, with evenly split datasets (100 for each)
# N-Grams don't seem to help us much here (or maybe my implementation is wrong!), using bigrams increases # N-Grams don't seem to help us much here (or maybe my implementation is wrong!), using bigrams increases
# accuracy, while decreasing F-Measure slightly. # accuracy, while decreasing F-Measure slightly.
stack exec example-naivebayes-doc-classifier -- --verbose stack exec example-naivebayes-doc-classifier -- --verbose
stack exec example-naivebayes-doc-classifier -- --verbose --top-ten
``` ```

@ -1 +0,0 @@
Subproject commit 5b069a54a6a68efee0ef4bb15c1aa56414f12c28

View File

@ -8,7 +8,8 @@ module Main
import Debug.Trace import Debug.Trace
import Data.List.Split import Data.List.Split
import Control.Arrow ((&&&)) import Control.Arrow ((&&&))
import Control.Monad (when) import Control.Monad (when, unless)
import Data.Function (on)
import System.Environment import System.Environment
main = do main = do
@ -19,32 +20,47 @@ module Main
classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes" classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
sws <- lines <$> readFile "examples/stopwords" sws <- lines <$> readFile "examples/stopwords"
let verbose = or [elem "-v" args, elem "--verbose" args] let verbose = elem "-v" args || elem "--verbose" args
when (not verbose) $ putStrLn "use --verbose to print more information" topten = elem "-10" args || elem "--top-ten" args
unless verbose $ putStrLn "use --verbose to print more information"
let intClasses = [0..length classes - 1] let intClasses = [0..length classes - 1]
documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset documents = cleanDocuments . removeWords sws $ createDocuments classes dataset
testDocuments = cleanDocuments $ createDocuments classes test testDocuments = cleanDocuments $ createDocuments classes test
devTestDocuments = take 30 testDocuments
nb = train documents intClasses nb = train documents intClasses
results = session testDocuments nb -- top-ten
topClasses = take 10 . reverse $ sortBy (compare `on` (length . snd)) (cd nb)
filtered = map (\(c, ds) -> (c, take 100 ds)) topClasses
filteredClasses = map fst filtered
ttDocs = concatMap snd filtered
ttNB = train ttDocs filteredClasses
ttTestDocuments = filter ((`elem` filteredClasses) . c) . cleanDocuments $ createDocuments classes test
ttResults = session ttTestDocuments ttNB
normalResults = session testDocuments nb
results = if topten then ttResults else normalResults
iClasses = if topten then filteredClasses else intClasses
-- results = session devTestDocuments nb -- results = session devTestDocuments nb
when verbose $ print (text $ head documents) when verbose . putStrLn $ "# Example of cleaned document:\n" ++ (show . text $ head documents)
let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r) let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
when verbose $ mapM_ showResults results when verbose $ mapM_ showResults results
when verbose $ when (verbose && not topten) .
putStrLn $ "The training data is imbalanced which causes the classifier to be biased towards\n" putStrLn $ "The training data is imbalanced which causes the classifier to be biased towards\n"
++ "some classes, `earn` is an example, the class alone has around 90% accuracy while\n" ++ "some classes, `earn` is an example, the class alone has around 90% accuracy while\n"
++ "the rest of classes have a much lower accuracy and it's commonly seen that most inputs\n" ++ "the rest of classes have a much lower accuracy and it's commonly seen that most inputs\n"
++ "are incorrectly classified as `earn`.\n" ++ "are incorrectly classified as `earn`.\n"
++ "Try running with --top-ten to classify top 10 classes by using evenly split documents\n"
let let
accuracies = accuracies =
let as = zip intClasses $ map (\c -> filter ((==c) . fst) results) intClasses let as = zip iClasses $ map (\c -> filter ((==c) . fst) results) iClasses
av = filter (not . null . snd) as av = filter (not . null . snd) as
calculated = map (fst &&& accuracy . snd) av calculated = map (fst &&& accuracy . snd) av
in sortBy (\(_, a) (_, b) -> b `compare` a) calculated in sortBy (\(_, a) (_, b) -> b `compare` a) calculated

View File

@ -1 +0,0 @@
../../sibe-repos/sentiment-analysis-data

View File

@ -0,0 +1,5 @@
XSym
0040
3666c4cacaf995ebd11ef25aab70de99
../../sibe-repos/sentiment-analysis-data

View File

@ -14,6 +14,7 @@ module Sibe.NaiveBayes
cleanText, cleanText,
cleanDocuments, cleanDocuments,
ngram, ngram,
ngramText,
removeWords, removeWords,
removeStopwords, removeStopwords,
) )
@ -84,13 +85,14 @@ module Sibe.NaiveBayes
ngram :: Int -> [Document] -> [Document] ngram :: Int -> [Document] -> [Document]
ngram n documents = ngram n documents =
map (\(Document text c) -> Document (helper text) c) documents map (\(Document text c) -> Document (ngramText n text) c) documents
where
helper text = ngramText :: Int -> String -> String
ngramText n text =
let ws = words text let ws = words text
pairs = zip [0..] ws pairs = zip [0..] ws
grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
in unwords ("<b>":grams) in unwords ("<b>_":grams)
session :: [Document] -> NB -> [(Class, (Class, Double))] session :: [Document] -> NB -> [(Class, (Class, Double))]
session docs nb = session docs nb =
@ -102,10 +104,10 @@ module Sibe.NaiveBayes
let scores = map (score . fst) classes let scores = map (score . fst) classes
index = argmax scores index = argmax scores
m = maximum scores m = maximum scores
in (index, m) in (fst (classes !! index), m)
where where
score c = score c =
let prior = snd (classes !! c) let prior = snd (fromJust $ find ((==c) . fst) classes)
-- below is the formula according to Multinominal Naive Bayes, but it seems -- below is the formula according to Multinominal Naive Bayes, but it seems
-- using a uniform prior probability seems to work better when working with imbalanced -- using a uniform prior probability seems to work better when working with imbalanced
@ -118,11 +120,13 @@ module Sibe.NaiveBayes
in alpha * product (map (prob c) (words txt)) in alpha * product (map (prob c) (words txt))
prob c w = prob c w =
let tctM = find ((== w) . fst) (snd (cw !! c)) let fcw = fromJust $ find ((==c) . fst) cw
fcg = fromJust $ find ((==c) . fst) cgram
tctM = find ((== w) . fst) (snd fcw)
tct = if isJust tctM then (snd . fromJust) tctM else 0 tct = if isJust tctM then (snd . fromJust) tctM else 0
cvoc = sum $ map snd (snd (cw !! c)) cvoc = sum $ map snd (snd fcw)
voc = vocabulary voc = vocabulary
gram = find ((==w) . last . splitOn "_" . fst) (snd (cgram !! c)) gram = find ((==w) . last . splitOn "_" . fst) (snd fcg)
pg = if isJust gram then (snd . fromJust) gram else 0 pg = if isJust gram then (snd . fromJust) gram else 0
-- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams -- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
in realToFrac (tct + 1) / realToFrac (cvoc + voc) in realToFrac (tct + 1) / realToFrac (cvoc + voc)