feat(topten): top-ten classification with evenly distrubuted data
This commit is contained in:
		 Submodule examples/doc-classifier-data deleted from 5b069a54a6
									
								
							@@ -8,7 +8,8 @@ module Main
 | 
			
		||||
    import Debug.Trace
 | 
			
		||||
    import Data.List.Split
 | 
			
		||||
    import Control.Arrow ((&&&))
 | 
			
		||||
    import Control.Monad (when)
 | 
			
		||||
    import Control.Monad (when, unless)
 | 
			
		||||
    import Data.Function (on)
 | 
			
		||||
    import System.Environment
 | 
			
		||||
 | 
			
		||||
    main = do
 | 
			
		||||
@@ -19,32 +20,47 @@ module Main
 | 
			
		||||
      classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
 | 
			
		||||
      sws <- lines <$> readFile "examples/stopwords"
 | 
			
		||||
 | 
			
		||||
      let verbose = or [elem "-v" args, elem "--verbose" args]
 | 
			
		||||
      when (not verbose) $ putStrLn "use --verbose to print more information"
 | 
			
		||||
      let verbose = elem "-v" args || elem "--verbose" args
 | 
			
		||||
          topten  = elem "-10" args || elem "--top-ten" args
 | 
			
		||||
      unless verbose $ putStrLn "use --verbose to print more information"
 | 
			
		||||
 | 
			
		||||
      let intClasses = [0..length classes - 1]
 | 
			
		||||
          documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
 | 
			
		||||
          documents = cleanDocuments . removeWords sws $ createDocuments classes dataset
 | 
			
		||||
          testDocuments = cleanDocuments $ createDocuments classes test
 | 
			
		||||
          devTestDocuments = take 30 testDocuments
 | 
			
		||||
 | 
			
		||||
          nb = train documents intClasses
 | 
			
		||||
 | 
			
		||||
          results = session testDocuments nb
 | 
			
		||||
          -- top-ten
 | 
			
		||||
          topClasses = take 10 . reverse $ sortBy (compare `on` (length . snd)) (cd nb)
 | 
			
		||||
          filtered = map (\(c, ds) -> (c, take 100 ds)) topClasses
 | 
			
		||||
          filteredClasses = map fst filtered
 | 
			
		||||
          ttDocs = concatMap snd filtered
 | 
			
		||||
          ttNB = train ttDocs filteredClasses
 | 
			
		||||
 | 
			
		||||
          ttTestDocuments = filter ((`elem` filteredClasses) . c) . cleanDocuments $ createDocuments classes test
 | 
			
		||||
 | 
			
		||||
          ttResults = session ttTestDocuments ttNB
 | 
			
		||||
          normalResults = session testDocuments nb
 | 
			
		||||
          results = if topten then ttResults else normalResults
 | 
			
		||||
 | 
			
		||||
          iClasses = if topten then filteredClasses else intClasses
 | 
			
		||||
          -- results = session devTestDocuments nb
 | 
			
		||||
 | 
			
		||||
      when verbose $ print (text $ head documents)
 | 
			
		||||
      when verbose . putStrLn $ "# Example of cleaned document:\n" ++ (show . text $ head documents)
 | 
			
		||||
 | 
			
		||||
      let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
 | 
			
		||||
      when verbose $ mapM_ showResults results
 | 
			
		||||
 | 
			
		||||
      when verbose $
 | 
			
		||||
      when (verbose && not topten) .
 | 
			
		||||
        putStrLn $ "The training data is imbalanced which causes the classifier to be biased towards\n"
 | 
			
		||||
                ++ "some classes, `earn` is an example, the class alone has around 90% accuracy while\n"
 | 
			
		||||
                ++ "the rest of classes have a much lower accuracy and it's commonly seen that most inputs\n"
 | 
			
		||||
                ++ "are incorrectly classified as `earn`.\n"
 | 
			
		||||
                ++ "Try running with --top-ten to classify top 10 classes by using evenly split documents\n"
 | 
			
		||||
 | 
			
		||||
      let
 | 
			
		||||
        accuracies =
 | 
			
		||||
          let as = zip intClasses $ map (\c -> filter ((==c) . fst) results) intClasses
 | 
			
		||||
          let as = zip iClasses $ map (\c -> filter ((==c) . fst) results) iClasses
 | 
			
		||||
              av = filter (not . null . snd) as
 | 
			
		||||
              calculated = map (fst &&& accuracy . snd) av
 | 
			
		||||
          in sortBy (\(_, a) (_, b) -> b `compare` a) calculated
 | 
			
		||||
 
 | 
			
		||||
@@ -1 +0,0 @@
 | 
			
		||||
../../sibe-repos/sentiment-analysis-data
 | 
			
		||||
							
								
								
									
										5
									
								
								examples/sentiment-analysis-data
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										5
									
								
								examples/sentiment-analysis-data
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,5 @@
 | 
			
		||||
XSym
 | 
			
		||||
0040
 | 
			
		||||
3666c4cacaf995ebd11ef25aab70de99
 | 
			
		||||
../../sibe-repos/sentiment-analysis-data
 | 
			
		||||
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
 | 
			
		||||
		Reference in New Issue
	
	Block a user