feat(topten): top-ten classification with evenly distributed data
This commit is contained in:
parent b2888417bb
commit 891f48a2d0
@@ -41,8 +41,11 @@ stack exec example-xor

 # Naive Bayes document classifier, using Reuters dataset, achieves ~62% accuracy
 # using Porter stemming, stopword elimination and a few custom techniques.
-# the dataset is imbalanced which causes the classifier to be biased towards some classes (earn, acq, ...)
+# The dataset is imbalanced, which causes the classifier to be biased towards some classes (earn, acq, ...)
+# To work around the imbalanced dataset, there is a --top-ten option which classifies only the top 10 most
+# popular classes, with evenly split datasets (100 documents each)
 # N-Grams don't seem to help us much here (or maybe my implementation is wrong!), using bigrams increases
 # accuracy, while decreasing F-Measure slightly.
 stack exec example-naivebayes-doc-classifier -- --verbose
+stack exec example-naivebayes-doc-classifier -- --verbose --top-ten
 ```
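The `--top-ten` selection described above boils down to ranking classes by document count and truncating. A minimal standalone sketch of that idea (the association-list shape and the names `topTenEven`/`byClass` are illustrative, not the library's API):

```haskell
import Data.List (sortBy)
import Data.Ord (comparing, Down(..))

-- Keep the 10 most populous classes and cap each at 100 documents,
-- mirroring the even-split scheme the README describes.
topTenEven :: [(cls, [doc])] -> [(cls, [doc])]
topTenEven byClass =
  map (fmap (take 100))                           -- cap every class at 100 docs
    . take 10                                     -- keep the ten largest classes
    $ sortBy (comparing (Down . length . snd)) byClass
```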
@@ -1 +0,0 @@
-Subproject commit 5b069a54a6a68efee0ef4bb15c1aa56414f12c28
@@ -8,7 +8,8 @@ module Main
 import Debug.Trace
 import Data.List.Split
 import Control.Arrow ((&&&))
-import Control.Monad (when)
+import Control.Monad (when, unless)
+import Data.Function (on)
 import System.Environment

 main = do
@@ -19,32 +20,47 @@ module Main
   classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
   sws <- lines <$> readFile "examples/stopwords"

-  let verbose = or [elem "-v" args, elem "--verbose" args]
-  when (not verbose) $ putStrLn "use --verbose to print more information"
+  let verbose = elem "-v" args || elem "--verbose" args
+      topten = elem "-10" args || elem "--top-ten" args
+
+  unless verbose $ putStrLn "use --verbose to print more information"

   let intClasses = [0..length classes - 1]
-      documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
+      documents = cleanDocuments . removeWords sws $ createDocuments classes dataset
       testDocuments = cleanDocuments $ createDocuments classes test
-      devTestDocuments = take 30 testDocuments
       nb = train documents intClasses

-      results = session testDocuments nb
+      -- top-ten
+      topClasses = take 10 . reverse $ sortBy (compare `on` (length . snd)) (cd nb)
+      filtered = map (\(c, ds) -> (c, take 100 ds)) topClasses
+      filteredClasses = map fst filtered
+      ttDocs = concatMap snd filtered
+      ttNB = train ttDocs filteredClasses
+
+      ttTestDocuments = filter ((`elem` filteredClasses) . c) . cleanDocuments $ createDocuments classes test
+
+      ttResults = session ttTestDocuments ttNB
+      normalResults = session testDocuments nb
+      results = if topten then ttResults else normalResults
+
+      iClasses = if topten then filteredClasses else intClasses
       -- results = session devTestDocuments nb

-  when verbose $ print (text $ head documents)
+  when verbose . putStrLn $ "# Example of cleaned document:\n" ++ (show . text $ head documents)

   let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
   when verbose $ mapM_ showResults results

-  when verbose $
+  when (verbose && not topten) .
     putStrLn $ "The training data is imbalanced which causes the classifier to be biased towards\n"
             ++ "some classes, `earn` is an example, the class alone has around 90% accuracy while\n"
             ++ "the rest of classes have a much lower accuracy and it's commonly seen that most inputs\n"
             ++ "are incorrectly classified as `earn`.\n"
+            ++ "Try running with --top-ten to classify the top 10 classes using evenly split documents\n"

   let
     accuracies =
-      let as = zip intClasses $ map (\c -> filter ((==c) . fst) results) intClasses
+      let as = zip iClasses $ map (\c -> filter ((==c) . fst) results) iClasses
           av = filter (not . null . snd) as
           calculated = map (fst &&& accuracy . snd) av
       in sortBy (\(_, a) (_, b) -> b `compare` a) calculated
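Both descending sorts in this hunk (`reverse $ sortBy (compare `on` (length . snd))` for `topClasses`, and the flipped lambda for `accuracies`) can be spelled more directly with `Data.Ord.Down`; a small self-contained illustration of the equivalence:

```haskell
import Data.List (sortBy)
import Data.Ord (comparing, Down(..))

main :: IO ()
main = do
  let results = [(0, 0.62), (1, 0.90), (2, 0.41)] :: [(Int, Double)]
  -- Descending by the second component: the same ordering as
  -- sortBy (\(_, a) (_, b) -> b `compare` a) results
  print $ sortBy (comparing (Down . snd)) results
  -- [(1,0.9),(0,0.62),(2,0.41)]
```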
@@ -1 +0,0 @@
-../../sibe-repos/sentiment-analysis-data
examples/sentiment-analysis-data (new executable file, 5 lines)
@@ -0,0 +1,5 @@
+XSym
+0040
+3666c4cacaf995ebd11ef25aab70de99
+../../sibe-repos/sentiment-analysis-data
+
@@ -14,6 +14,7 @@ module Sibe.NaiveBayes
     cleanText,
     cleanDocuments,
     ngram,
+    ngramText,
     removeWords,
     removeStopwords,
    )
@@ -84,13 +85,14 @@ module Sibe.NaiveBayes

   ngram :: Int -> [Document] -> [Document]
   ngram n documents =
-    map (\(Document text c) -> Document (helper text) c) documents
-    where
-      helper text =
+    map (\(Document text c) -> Document (ngramText n text) c) documents
+
+  ngramText :: Int -> String -> String
+  ngramText n text =
     let ws = words text
         pairs = zip [0..] ws
         grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
-    in unwords ("<b>":grams)
+    in unwords ("<b>_":grams)

   session :: [Document] -> NB -> [(Class, (Class, Double))]
   session docs nb =
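A quick usage check of the newly exported `ngramText`, traced by hand from the definition above (`<b>_` is the begin-of-text marker this commit switches to):

```
ghci> ngramText 2 "the quick brown fox"
"<b>_ the_quick quick_brown brown_fox fox"
```

Note the final word yields a bare unigram ("fox"): dropping past the end of the word list leaves nothing to pair it with.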
@@ -102,10 +104,10 @@ module Sibe.NaiveBayes
     let scores = map (score . fst) classes
         index = argmax scores
         m = maximum scores
-    in (index, m)
+    in (fst (classes !! index), m)
     where
       score c =
-        let prior = snd (classes !! c)
+        let prior = snd (fromJust $ find ((==c) . fst) classes)

         -- below is the formula according to Multinomial Naive Bayes, but
         -- using a uniform prior probability seems to work better when working with imbalanced
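For reference, the textbook Multinomial Naive Bayes score the comment alludes to is log P(c) + Σ log P(w|c) over the document's words; the code instead keeps a uniform prior. A hedged sketch of the textbook version (`scoreMNB` and its arguments are illustrative names, not the module's actual bindings):

```haskell
-- Textbook Multinomial NB score, in log space to avoid the underflow
-- that the product form (alpha * product ...) can suffer on long texts.
scoreMNB :: Double    -- ^ class prior P(c)
         -> [Double]  -- ^ per-word likelihoods P(w|c) for the document
         -> Double
scoreMNB prior wordProbs = log prior + sum (map log wordProbs)
```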
@@ -118,11 +120,13 @@ module Sibe.NaiveBayes
         in alpha * product (map (prob c) (words txt))

       prob c w =
-        let tctM = find ((== w) . fst) (snd (cw !! c))
+        let fcw = fromJust $ find ((==c) . fst) cw
+            fcg = fromJust $ find ((==c) . fst) cgram
+            tctM = find ((== w) . fst) (snd fcw)
             tct = if isJust tctM then (snd . fromJust) tctM else 0
-            cvoc = sum $ map snd (snd (cw !! c))
+            cvoc = sum $ map snd (snd fcw)
             voc = vocabulary
-            gram = find ((==w) . last . splitOn "_" . fst) (snd (cgram !! c))
+            gram = find ((==w) . last . splitOn "_" . fst) (snd fcg)
             pg = if isJust gram then (snd . fromJust) gram else 0
         -- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
         in realToFrac (tct + 1) / realToFrac (cvoc + voc)
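The final line is Laplace (add-one) smoothing: P(w|c) = (tct + 1) / (cvoc + voc), where tct counts occurrences of w in class c, cvoc is the total token count of class c, and voc is the vocabulary size. A small worked example (the numbers are made up):

```haskell
-- With tct = 3 occurrences, cvoc = 1000 class tokens, voc = 5000 words:
smoothed :: Double
smoothed = (3 + 1) / (1000 + 5000)  -- = 4/6000 ≈ 6.67e-4
-- An unseen word (tct = 0) still gets 1/6000 instead of zero, which keeps
-- the product over a document's words from collapsing to 0.
```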