feat(verbose): print more information using -v or --verbose flags
This commit is contained in:
		@@ -8,32 +8,55 @@ module Main
 | 
				
			|||||||
    import Debug.Trace
 | 
					    import Debug.Trace
 | 
				
			||||||
    import Data.List.Split
 | 
					    import Data.List.Split
 | 
				
			||||||
    import Control.Arrow ((&&&))
 | 
					    import Control.Arrow ((&&&))
 | 
				
			||||||
 | 
					    import Control.Monad (when)
 | 
				
			||||||
 | 
					    import System.Environment
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    main = do
 | 
					    main = do
 | 
				
			||||||
 | 
					      args <- getArgs
 | 
				
			||||||
      dataset <- readFile "examples/doc-classifier-data/data-reuters"
 | 
					      dataset <- readFile "examples/doc-classifier-data/data-reuters"
 | 
				
			||||||
      test <- readFile "examples/doc-classifier-data/data-reuters-test"
 | 
					      test <- readFile "examples/doc-classifier-data/data-reuters-test"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
 | 
					      classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
 | 
				
			||||||
      sws <- lines <$> readFile "examples/stopwords"
 | 
					      sws <- lines <$> readFile "examples/stopwords"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      let verbose = or [elem "-v" args, elem "--verbose" args]
 | 
				
			||||||
 | 
					      when (not verbose) $ putStrLn "use --verbose to print more information"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      let intClasses = [0..length classes - 1]
 | 
					      let intClasses = [0..length classes - 1]
 | 
				
			||||||
          documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
 | 
					          documents = cleanDocuments $ removeWords sws $ createDocuments classes dataset
 | 
				
			||||||
          testDocuments = cleanDocuments $ createDocuments classes test
 | 
					          testDocuments = cleanDocuments $ createDocuments classes test
 | 
				
			||||||
          devTestDocuments = take 30 testDocuments
 | 
					          devTestDocuments = take 30 testDocuments
 | 
				
			||||||
          nb = train documents intClasses
 | 
					          nb = train documents intClasses
 | 
				
			||||||
 | 
					
 | 
				
			||||||
          results = map (\(Document text c) -> (c, run text nb)) testDocuments
 | 
					          results = session testDocuments nb
 | 
				
			||||||
          -- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments
 | 
					          -- results = session devTestDocuments nb
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      print (text $ head documents)
 | 
					      when verbose $ print (text $ head documents)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
 | 
					      let showResults (c, (r, confidence)) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
 | 
				
			||||||
      mapM_ showResults results
 | 
					      when verbose $ mapM_ showResults results
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      putStrLn $ "Recall: " ++ show (recall results)
 | 
					      when verbose $
 | 
				
			||||||
      putStrLn $ "Precision: " ++ show (precision results)
 | 
					        putStrLn $ "The training data is imbalanced which causes the classifier to be biased towards\n"
 | 
				
			||||||
      putStrLn $ "F Measure: " ++ show (fmeasure results)
 | 
					                ++ "some classes, `earn` is an example, the class alone has around 90% accuracy while\n"
 | 
				
			||||||
      putStrLn $ "Accuracy: " ++ show (accuracy results)
 | 
					                ++ "the rest of classes have a much lower accuracy and it's commonly seen that most inputs\n"
 | 
				
			||||||
 | 
					                ++ "are incorrectly classified as `earn`.\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      let
 | 
				
			||||||
 | 
					        accuracies =
 | 
				
			||||||
 | 
					          let as = zip intClasses $ map (\c -> filter ((==c) . fst) results) intClasses
 | 
				
			||||||
 | 
					              av = filter (not . null . snd) as
 | 
				
			||||||
 | 
					              calculated = map (fst &&& accuracy . snd) av
 | 
				
			||||||
 | 
					          in sortBy (\(_, a) (_, b) -> b `compare` a) calculated
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      when verbose $
 | 
				
			||||||
 | 
					        mapM_ (\(c, a) -> putStrLn $ "Accuracy(" ++ classes !! c ++ ") = " ++ show a) accuracies
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      putStrLn $ "\nAverages: "
 | 
				
			||||||
 | 
					      putStrLn $ "Recall = " ++ show (recall results)
 | 
				
			||||||
 | 
					      putStrLn $ "Precision = " ++ show (precision results)
 | 
				
			||||||
 | 
					      putStrLn $ "F Measure = " ++ show (fmeasure results)
 | 
				
			||||||
 | 
					      putStrLn $ "Accuracy = " ++ show (accuracy results)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    createDocuments classes content =
 | 
					    createDocuments classes content =
 | 
				
			||||||
      let splitted = splitOn (replicate 10 '-' ++ "\n") content
 | 
					      let splitted = splitOn (replicate 10 '-' ++ "\n") content
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -3,6 +3,7 @@ module Sibe.NaiveBayes
 | 
				
			|||||||
   NB(..),
 | 
					   NB(..),
 | 
				
			||||||
   train,
 | 
					   train,
 | 
				
			||||||
   run,
 | 
					   run,
 | 
				
			||||||
 | 
					   session,
 | 
				
			||||||
   ordNub,
 | 
					   ordNub,
 | 
				
			||||||
   accuracy,
 | 
					   accuracy,
 | 
				
			||||||
   precision,
 | 
					   precision,
 | 
				
			||||||
@@ -91,22 +92,30 @@ module Sibe.NaiveBayes
 | 
				
			|||||||
              grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
 | 
					              grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
 | 
				
			||||||
          in unwords ("<b>":grams)
 | 
					          in unwords ("<b>":grams)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    session :: [Document] -> NB -> [(Class, (Class, Double))]
 | 
				
			||||||
 | 
					    session docs nb =
 | 
				
			||||||
 | 
					      let results = map (\(Document text c) -> (c, run text nb)) docs
 | 
				
			||||||
 | 
					      in results
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    run :: String -> NB -> (Class, Double)
 | 
					    run :: String -> NB -> (Class, Double)
 | 
				
			||||||
    run txt (NB documents classes vocabulary megadoc cd cw cgram) =
 | 
					    run txt (NB documents classes vocabulary megadoc cd cw cgram) =
 | 
				
			||||||
      let scores = map (score . fst) classes
 | 
					      let scores = map (score . fst) classes
 | 
				
			||||||
          index = argmax scores
 | 
					          index = argmax scores
 | 
				
			||||||
          m = maximum scores
 | 
					          m = maximum scores
 | 
				
			||||||
          confidence = m / sum scores
 | 
					      in (index, m)
 | 
				
			||||||
      in (index, 0)
 | 
					 | 
				
			||||||
      where
 | 
					      where
 | 
				
			||||||
        score c =
 | 
					        score c =
 | 
				
			||||||
          let prior = snd (classes !! c)
 | 
					          let prior = snd (classes !! c)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
          -- below is the formula according to Multinominal Naive Bayes, but it seems
 | 
					          -- below is the formula according to Multinominal Naive Bayes, but it seems
 | 
				
			||||||
          -- using a uniform prior probability seems to work better
 | 
					          -- using a uniform prior probability seems to work better when working with imbalanced
 | 
				
			||||||
 | 
					          -- training datasets, instead, we help rare classes get higher scores using
 | 
				
			||||||
 | 
					          -- alpha = (1 - prior * ALPHA)
 | 
				
			||||||
          -- in prior * product (map (prob c) (words txt))
 | 
					          -- in prior * product (map (prob c) (words txt))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
          in product (map (prob c) (words txt))
 | 
					              alpha = 1 - (log 1 + prior)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					          in alpha * product (map (prob c) (words txt))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        prob c w =
 | 
					        prob c w =
 | 
				
			||||||
          let tctM = find ((== w) . fst) (snd (cw !! c))
 | 
					          let tctM = find ((== w) . fst) (snd (cw !! c))
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user