feat(topten): top-ten classification with evenly distrubuted data
This commit is contained in:
Submodule examples/doc-classifier-data deleted from 5b069a54a6
@ -8,7 +8,8 @@ module Main
|
||||
import Debug.Trace
|
||||
import Data.List.Split
|
||||
import Control.Arrow ((&&&))
|
||||
import Control.Monad (when)
|
||||
import Control.Monad (when, unless)
|
||||
import Data.Function (on)
|
||||
import System.Environment
|
||||
|
||||
main = do
|
||||
@ -19,32 +20,47 @@ module Main
|
||||
classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
|
||||
sws <- lines <$> readFile "examples/stopwords"
|
||||
|
||||
let verbose = or [elem "-v" args, elem "--verbose" args]
|
||||
when (not verbose) $ putStrLn "use --verbose to print more information"
|
||||
let verbose = elem "-v" args || elem "--verbose" args
|
||||
topten = elem "-10" args || elem "--top-ten" args
|
||||
unless verbose $ putStrLn "use --verbose to print more information"
|
||||
|
||||
let intClasses = [0..length classes - 1]
|
||||
documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
|
||||
documents = cleanDocuments . removeWords sws $ createDocuments classes dataset
|
||||
testDocuments = cleanDocuments $ createDocuments classes test
|
||||
devTestDocuments = take 30 testDocuments
|
||||
|
||||
nb = train documents intClasses
|
||||
|
||||
results = session testDocuments nb
|
||||
-- top-ten
|
||||
topClasses = take 10 . reverse $ sortBy (compare `on` (length . snd)) (cd nb)
|
||||
filtered = map (\(c, ds) -> (c, take 100 ds)) topClasses
|
||||
filteredClasses = map fst filtered
|
||||
ttDocs = concatMap snd filtered
|
||||
ttNB = train ttDocs filteredClasses
|
||||
|
||||
ttTestDocuments = filter ((`elem` filteredClasses) . c) . cleanDocuments $ createDocuments classes test
|
||||
|
||||
ttResults = session ttTestDocuments ttNB
|
||||
normalResults = session testDocuments nb
|
||||
results = if topten then ttResults else normalResults
|
||||
|
||||
iClasses = if topten then filteredClasses else intClasses
|
||||
-- results = session devTestDocuments nb
|
||||
|
||||
when verbose $ print (text $ head documents)
|
||||
when verbose . putStrLn $ "# Example of cleaned document:\n" ++ (show . text $ head documents)
|
||||
|
||||
let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
|
||||
when verbose $ mapM_ showResults results
|
||||
|
||||
when verbose $
|
||||
when (verbose && not topten) .
|
||||
putStrLn $ "The training data is imbalanced which causes the classifier to be biased towards\n"
|
||||
++ "some classes, `earn` is an example, the class alone has around 90% accuracy while\n"
|
||||
++ "the rest of classes have a much lower accuracy and it's commonly seen that most inputs\n"
|
||||
++ "are incorrectly classified as `earn`.\n"
|
||||
++ "Try running with --top-ten to classify top 10 classes by using evenly split documents\n"
|
||||
|
||||
let
|
||||
accuracies =
|
||||
let as = zip intClasses $ map (\c -> filter ((==c) . fst) results) intClasses
|
||||
let as = zip iClasses $ map (\c -> filter ((==c) . fst) results) iClasses
|
||||
av = filter (not . null . snd) as
|
||||
calculated = map (fst &&& accuracy . snd) av
|
||||
in sortBy (\(_, a) (_, b) -> b `compare` a) calculated
|
||||
|
@ -1 +0,0 @@
|
||||
../../sibe-repos/sentiment-analysis-data
|
5
examples/sentiment-analysis-data
Executable file
5
examples/sentiment-analysis-data
Executable file
@ -0,0 +1,5 @@
|
||||
XSym
|
||||
0040
|
||||
3666c4cacaf995ebd11ef25aab70de99
|
||||
../../sibe-repos/sentiment-analysis-data
|
||||
|
Reference in New Issue
Block a user