fix(naivebayes): fix the algorithm to make it actually work
feat(cleanDocuments): preprocess documents, use stemming and stopword elimination for better accuracy
This commit is contained in:
		@@ -1,12 +1,13 @@
 | 
				
			|||||||
module Main
 | 
					module Main
 | 
				
			||||||
  where
 | 
					  where
 | 
				
			||||||
    import Sibe
 | 
					    -- import Sibe
 | 
				
			||||||
    import Sibe.NaiveBayes
 | 
					    import Sibe.NaiveBayes
 | 
				
			||||||
    import Text.Printf
 | 
					    import Text.Printf
 | 
				
			||||||
    import Data.List
 | 
					    import Data.List
 | 
				
			||||||
    import Data.Maybe
 | 
					    import Data.Maybe
 | 
				
			||||||
    import Debug.Trace
 | 
					    import Debug.Trace
 | 
				
			||||||
    import Data.List.Split
 | 
					    import Data.List.Split
 | 
				
			||||||
 | 
					    import Control.Arrow ((&&&))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    main = do
 | 
					    main = do
 | 
				
			||||||
      dataset <- readFile "examples/doc-classifier-data/data-reuters"
 | 
					      dataset <- readFile "examples/doc-classifier-data/data-reuters"
 | 
				
			||||||
@@ -15,18 +16,33 @@ module Main
 | 
				
			|||||||
      classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
 | 
					      classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      let intClasses = [0..length classes - 1]
 | 
					      let intClasses = [0..length classes - 1]
 | 
				
			||||||
          documents = createDocuments classes dataset
 | 
					      -- let intClasses = [0, 1]
 | 
				
			||||||
          testDocuments = createDocuments classes test
 | 
					          documents = cleanDocuments $ createDocuments classes dataset
 | 
				
			||||||
          devTestDocuments = take 20 testDocuments
 | 
					          -- documents = [Document "Chinese Beijing Chinese" 0,
 | 
				
			||||||
          nb = initialize documents
 | 
					          --              Document "Chinese Chinese Shanghai" 0,
 | 
				
			||||||
 | 
					          --              Document "Chinese Macao" 0,
 | 
				
			||||||
 | 
					          --              Document "Japan Tokyo Chinese" 1]
 | 
				
			||||||
 | 
					          -- testDocuments = [Document "Chinese Chinese Chinese Japan Tokyo" 0]
 | 
				
			||||||
 | 
					          testDocuments = cleanDocuments $ createDocuments classes test
 | 
				
			||||||
 | 
					          devTestDocuments = take 30 testDocuments
 | 
				
			||||||
 | 
					          -- devTestDocuments = [Document "Chinese Chinese Chinese Tokyo Japan" 0]
 | 
				
			||||||
 | 
					          nb = train documents intClasses
 | 
				
			||||||
 | 
					
 | 
				
			||||||
          results = map (\(Document text c) -> (c, determine text nb intClasses documents)) testDocuments
 | 
					          results = map (\(Document text c) -> (c, run text nb)) testDocuments
 | 
				
			||||||
          -- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments
 | 
					          -- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      -- print (text $ head documents)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      let showResults (c, r) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
 | 
					      let showResults (c, r) = putStrLn (classes !! c ++ " ~ " ++ classes !! r)
 | 
				
			||||||
      mapM_ showResults results
 | 
					      mapM_ showResults results
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      putStrLn $ "Recall: " ++ show (recall results)
 | 
					      putStrLn $ "Recall: " ++ show (recall results)
 | 
				
			||||||
      putStrLn $ "Precision: " ++ show (precision results)
 | 
					      putStrLn $ "Precision: " ++ show (precision results)
 | 
				
			||||||
      putStrLn $ "F Measure: " ++ show (fmeasure (precision results) (recall results))
 | 
					      putStrLn $ "F Measure: " ++ show (fmeasure results)
 | 
				
			||||||
      putStrLn $ "Accuracy: " ++ show (accuracy results)
 | 
					      putStrLn $ "Accuracy: " ++ show (accuracy results)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    createDocuments classes content =
 | 
				
			||||||
 | 
					      let splitted = splitOn (replicate 10 '-' ++ "\n") content
 | 
				
			||||||
 | 
					          pairs = map ((head . lines) &&& (unwords . tail . lines)) splitted
 | 
				
			||||||
 | 
					          documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs
 | 
				
			||||||
 | 
					      in documents
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										54
									
								
								examples/naivebayes-sentiment-analysis.hs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								examples/naivebayes-sentiment-analysis.hs
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,54 @@
 | 
				
			|||||||
 | 
					{-# LANGUAGE BangPatterns #-}
 | 
				
			||||||
 | 
					module Main
 | 
				
			||||||
 | 
					  where
 | 
				
			||||||
 | 
					    -- import Sibe
 | 
				
			||||||
 | 
					    import Sibe.NaiveBayes
 | 
				
			||||||
 | 
					    import Text.Printf
 | 
				
			||||||
 | 
					    import Data.List
 | 
				
			||||||
 | 
					    import Data.Maybe
 | 
				
			||||||
 | 
					    import Debug.Trace
 | 
				
			||||||
 | 
					    import Data.List.Split
 | 
				
			||||||
 | 
					    import System.Directory
 | 
				
			||||||
 | 
					    import Control.DeepSeq
 | 
				
			||||||
 | 
					    import System.IO
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    main = do
 | 
				
			||||||
 | 
					      putStr "Reading documents... "
 | 
				
			||||||
 | 
					      neg_documents <- createDocuments "examples/sentiment-analysis-data/train/neg/"
 | 
				
			||||||
 | 
					      pos_documents <- createDocuments "examples/sentiment-analysis-data/train/pos/"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      test_neg_documents  <- createDocuments "examples/sentiment-analysis-data/test/neg/"
 | 
				
			||||||
 | 
					      test_pos_documents <- createDocuments "examples/sentiment-analysis-data/test/pos/"
 | 
				
			||||||
 | 
					      putStrLn "done"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      let classes = [0..9] -- rating, from 0 to 9 (1 to 10)
 | 
				
			||||||
 | 
					          documents = neg_documents ++ pos_documents
 | 
				
			||||||
 | 
					          nb = train documents classes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					          testDocuments = neg_documents ++ pos_documents
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					          results = map (\(Document text c) -> (c, run text nb)) testDocuments
 | 
				
			||||||
 | 
					          -- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments
 | 
				
			||||||
 | 
					      print results
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      -- let showResults (c, r) = putStrLn (show (classes !! c) ++ " ~ " ++ show (classes !! r))
 | 
				
			||||||
 | 
					      -- mapM_ showResults results
 | 
				
			||||||
 | 
					      --
 | 
				
			||||||
 | 
					      -- putStrLn $ "Recall: " ++ show (recall results)
 | 
				
			||||||
 | 
					      -- putStrLn $ "Precision: " ++ show (precision results)
 | 
				
			||||||
 | 
					      -- putStrLn $ "F Measure: " ++ show (fmeasure results)
 | 
				
			||||||
 | 
					      -- putStrLn $ "Accuracy: " ++ show (accuracy results)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    createDocuments :: FilePath -> IO [Document]
 | 
				
			||||||
 | 
					    createDocuments path = do
 | 
				
			||||||
 | 
					      files <- drop 2 <$> getDirectoryContents path
 | 
				
			||||||
 | 
					      let ratings = map (subtract 1 . read . take 1 . last . splitOn "_") files :: [Int]
 | 
				
			||||||
 | 
					      contents <- mapM (forceReadFile . (path ++)) files
 | 
				
			||||||
 | 
					      return $ zipWith Document contents ratings
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    forceReadFile :: FilePath -> IO String
 | 
				
			||||||
 | 
					    forceReadFile file = do
 | 
				
			||||||
 | 
					      handle <- openFile file ReadMode
 | 
				
			||||||
 | 
					      content <- hGetContents handle
 | 
				
			||||||
 | 
					      content `deepseq` hClose handle
 | 
				
			||||||
 | 
					      return content
 | 
				
			||||||
							
								
								
									
										1
									
								
								examples/sentiment-analysis-data
									
									
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								examples/sentiment-analysis-data
									
									
									
									
									
										Symbolic link
									
								
							@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					../../sibe-repos/sentiment-analysis-data
 | 
				
			||||||
@@ -7,7 +7,7 @@ module Main where
 | 
				
			|||||||
  main = do
 | 
					  main = do
 | 
				
			||||||
    let learning_rate = 0.5
 | 
					    let learning_rate = 0.5
 | 
				
			||||||
        (iterations, epochs) = (2, 1000)
 | 
					        (iterations, epochs) = (2, 1000)
 | 
				
			||||||
        a = (logistic, logistic')
 | 
					        a = (sigmoid, sigmoid')
 | 
				
			||||||
        rnetwork = randomNetwork 0 2 [(8, a)] (1, a) -- two inputs, 8 nodes in a single hidden layer, 1 output
 | 
					        rnetwork = randomNetwork 0 2 [(8, a)] (1, a) -- two inputs, 8 nodes in a single hidden layer, 1 output
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        inputs = [vector [0, 1], vector [1, 0], vector [1, 1], vector [0, 0]]
 | 
					        inputs = [vector [0, 1], vector [1, 0], vector [1, 1], vector [0, 0]]
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										6
									
								
								profiling/run
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								profiling/run
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,6 @@
 | 
				
			|||||||
 | 
					#!/bin/bash
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					PROG==geniconvert
 | 
				
			||||||
 | 
					VIEW==open
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					stack build --profile
 | 
				
			||||||
							
								
								
									
										6
									
								
								profiling/setup
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								profiling/setup
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,6 @@
 | 
				
			|||||||
 | 
					#!/bin/bash
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					chmod u+x profiling/setup
 | 
				
			||||||
 | 
					chmod u+x profiling/run
 | 
				
			||||||
 | 
					chmod u+x profiling/compare
 | 
				
			||||||
 | 
					chmod u+x profiling/save
 | 
				
			||||||
							
								
								
									
										17
									
								
								sibe.cabal
									
									
									
									
									
								
							
							
						
						
									
										17
									
								
								sibe.cabal
									
									
									
									
									
								
							@@ -22,6 +22,10 @@ library
 | 
				
			|||||||
                     , deepseq
 | 
					                     , deepseq
 | 
				
			||||||
                     , containers
 | 
					                     , containers
 | 
				
			||||||
                     , split
 | 
					                     , split
 | 
				
			||||||
 | 
					                     , regex-base
 | 
				
			||||||
 | 
					                     , regex-pcre
 | 
				
			||||||
 | 
					                     , text
 | 
				
			||||||
 | 
					                     , stemmer
 | 
				
			||||||
  default-language:    Haskell2010
 | 
					  default-language:    Haskell2010
 | 
				
			||||||
 | 
					
 | 
				
			||||||
executable sibe-exe
 | 
					executable sibe-exe
 | 
				
			||||||
@@ -53,6 +57,19 @@ executable example-naivebayes-doc-classifier
 | 
				
			|||||||
                     , split
 | 
					                     , split
 | 
				
			||||||
  default-language:    Haskell2010
 | 
					  default-language:    Haskell2010
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					executable example-naivebayes-sentiment-analysis
 | 
				
			||||||
 | 
					  hs-source-dirs:      examples
 | 
				
			||||||
 | 
					  main-is:             naivebayes-sentiment-analysis.hs
 | 
				
			||||||
 | 
					  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
 | 
				
			||||||
 | 
					  build-depends:       base
 | 
				
			||||||
 | 
					                     , sibe
 | 
				
			||||||
 | 
					                     , hmatrix
 | 
				
			||||||
 | 
					                     , containers
 | 
				
			||||||
 | 
					                     , split
 | 
				
			||||||
 | 
					                     , directory
 | 
				
			||||||
 | 
					                     , deepseq
 | 
				
			||||||
 | 
					  default-language:    Haskell2010
 | 
				
			||||||
 | 
					
 | 
				
			||||||
test-suite sibe-test
 | 
					test-suite sibe-test
 | 
				
			||||||
  type:                exitcode-stdio-1.0
 | 
					  type:                exitcode-stdio-1.0
 | 
				
			||||||
  hs-source-dirs:      test
 | 
					  hs-source-dirs:      test
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										29
									
								
								src/Sibe.hs
									
									
									
									
									
								
							
							
						
						
									
										29
									
								
								src/Sibe.hs
									
									
									
									
									
								
							@@ -17,8 +17,10 @@ module Sibe
 | 
				
			|||||||
     train,
 | 
					     train,
 | 
				
			||||||
     session,
 | 
					     session,
 | 
				
			||||||
     shuffle,
 | 
					     shuffle,
 | 
				
			||||||
     logistic,
 | 
					     sigmoid,
 | 
				
			||||||
     logistic',
 | 
					     sigmoid',
 | 
				
			||||||
 | 
					     relu,
 | 
				
			||||||
 | 
					     relu',
 | 
				
			||||||
     crossEntropy,
 | 
					     crossEntropy,
 | 
				
			||||||
     genSeed,
 | 
					     genSeed,
 | 
				
			||||||
     replaceVector
 | 
					     replaceVector
 | 
				
			||||||
@@ -88,11 +90,17 @@ module Sibe
 | 
				
			|||||||
        randomLayer seed (input, h) a :-
 | 
					        randomLayer seed (input, h) a :-
 | 
				
			||||||
        randomNetwork (seed + 1) h hs output
 | 
					        randomNetwork (seed + 1) h hs output
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      logistic :: Vector Double -> Vector Double
 | 
					      sigmoid :: Vector Double -> Vector Double
 | 
				
			||||||
      logistic x = 1 / (1 + exp (-x))
 | 
					      sigmoid x = 1 / max (1 + exp (-x)) 1e-10
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      logistic' :: Vector Double -> Vector Double
 | 
					      sigmoid' :: Vector Double -> Vector Double
 | 
				
			||||||
      logistic' x = logistic x * (1 - logistic x)
 | 
					      sigmoid' x = sigmoid x * (1 - sigmoid x)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      relu :: Vector Double -> Vector Double
 | 
				
			||||||
 | 
					      relu x = log (max (1 + exp x) 1e-10)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      relu' :: Vector Double -> Vector Double
 | 
				
			||||||
 | 
					      relu' = sigmoid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      crossEntropy :: Output -> Output -> Double
 | 
					      crossEntropy :: Output -> Output -> Double
 | 
				
			||||||
      crossEntropy output target =
 | 
					      crossEntropy output target =
 | 
				
			||||||
@@ -100,7 +108,7 @@ module Sibe
 | 
				
			|||||||
            n = fromIntegral (length pairs)
 | 
					            n = fromIntegral (length pairs)
 | 
				
			||||||
        in (-1 / n) * sum (map f pairs)
 | 
					        in (-1 / n) * sum (map f pairs)
 | 
				
			||||||
        where
 | 
					        where
 | 
				
			||||||
          f (a, y) = y * log a + (1 - y) * log (1 - a)
 | 
					          f (a, y) = y * log (max 1e-10 a) + (1 - y) * log (max (1 - a) 1e-10)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      train :: Input
 | 
					      train :: Input
 | 
				
			||||||
            -> Network
 | 
					            -> Network
 | 
				
			||||||
@@ -114,8 +122,8 @@ module Sibe
 | 
				
			|||||||
            let y = runLayer input l
 | 
					            let y = runLayer input l
 | 
				
			||||||
                o = fn y
 | 
					                o = fn y
 | 
				
			||||||
                delta = o - target
 | 
					                delta = o - target
 | 
				
			||||||
                -- de = delta * fn' y -- quadratic cost
 | 
					                de = delta * fn' y
 | 
				
			||||||
                de = delta -- cross entropy cost
 | 
					                -- de = delta -- cross entropy cost
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                biases'  = biases  - scale alpha de
 | 
					                biases'  = biases  - scale alpha de
 | 
				
			||||||
                weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
 | 
					                weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
 | 
				
			||||||
@@ -174,3 +182,6 @@ module Sibe
 | 
				
			|||||||
          rrow index (x:xs)
 | 
					          rrow index (x:xs)
 | 
				
			||||||
            | index == index = value:xs
 | 
					            | index == index = value:xs
 | 
				
			||||||
            | otherwise = x : rrow (index + 1) xs
 | 
					            | otherwise = x : rrow (index + 1) xs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      clip :: Double -> (Double, Double) -> Double
 | 
				
			||||||
 | 
					      clip x (l, u) = min u (max l x)
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,15 +1,17 @@
 | 
				
			|||||||
module Sibe.NaiveBayes
 | 
					module Sibe.NaiveBayes
 | 
				
			||||||
  (Document(..),
 | 
					  (Document(..),
 | 
				
			||||||
   NB(..),
 | 
					   NB(..),
 | 
				
			||||||
   createDocuments,
 | 
					   train,
 | 
				
			||||||
   initialize,
 | 
					   run,
 | 
				
			||||||
   calculate,
 | 
					 | 
				
			||||||
   determine,
 | 
					 | 
				
			||||||
   ordNub,
 | 
					   ordNub,
 | 
				
			||||||
   accuracy,
 | 
					   accuracy,
 | 
				
			||||||
   precision,
 | 
					   precision,
 | 
				
			||||||
   recall,
 | 
					   recall,
 | 
				
			||||||
   fmeasure,
 | 
					   fmeasure,
 | 
				
			||||||
 | 
					   mean,
 | 
				
			||||||
 | 
					   stdev,
 | 
				
			||||||
 | 
					   cleanText,
 | 
				
			||||||
 | 
					   cleanDocuments,
 | 
				
			||||||
  )
 | 
					  )
 | 
				
			||||||
  where
 | 
					  where
 | 
				
			||||||
    import Data.List
 | 
					    import Data.List
 | 
				
			||||||
@@ -18,47 +20,126 @@ module Sibe.NaiveBayes
 | 
				
			|||||||
    import Data.List.Split
 | 
					    import Data.List.Split
 | 
				
			||||||
    import Data.Maybe
 | 
					    import Data.Maybe
 | 
				
			||||||
    import Control.Arrow ((&&&))
 | 
					    import Control.Arrow ((&&&))
 | 
				
			||||||
    type Class = Int
 | 
					    import Text.Regex.PCRE
 | 
				
			||||||
 | 
					    import Data.Char (isSpace)
 | 
				
			||||||
 | 
					    import NLP.Stemmer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    type Class = Int;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    data Document = Document { text :: String
 | 
					    data Document = Document { text :: String
 | 
				
			||||||
                             , c    :: Class
 | 
					                             , c    :: Class
 | 
				
			||||||
                             } deriving (Eq, Show, Read)
 | 
					                             } deriving (Eq, Show, Read)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    data NB = NB { vocabulary :: Double
 | 
					    data NB = NB { documents  :: [Document]
 | 
				
			||||||
 | 
					                 , classes    :: [(Class, Double)]
 | 
				
			||||||
 | 
					                 , vocabulary :: Int
 | 
				
			||||||
                 , megadoc    :: String
 | 
					                 , megadoc    :: String
 | 
				
			||||||
                 }
 | 
					                 , cd         :: [(Class, [Document])]
 | 
				
			||||||
 | 
					                 , cw         :: [(Class, [(String, Int)])]
 | 
				
			||||||
 | 
					                 } deriving (Eq, Show, Read)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    initialize :: [Document] -> NB
 | 
					    train :: [Document] -> [Class] -> NB
 | 
				
			||||||
    initialize documents =
 | 
					    train documents classes =
 | 
				
			||||||
      let megadoc = concatMap (\(Document text _) -> text ++ " ") documents
 | 
					      let megadoc = concatDocs documents
 | 
				
			||||||
          vocabulary = genericLength ((ordNub . words) megadoc)
 | 
					          vocabulary = genericLength ((ordNub . words) megadoc)
 | 
				
			||||||
      in NB vocabulary megadoc
 | 
					          -- (class, prior probability)
 | 
				
			||||||
 | 
					          cls = zip classes (map classPrior classes)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    determine :: String -> NB -> [Class] -> [Document] -> Class
 | 
					          -- (class, [document])
 | 
				
			||||||
    determine text nb classes documents =
 | 
					          cd = zip classes (map classDocs classes)
 | 
				
			||||||
      let scores = zip [0..] (map (\cls -> calculate text nb cls documents) classes)
 | 
					 | 
				
			||||||
          m = maximumBy (\(i0, c0) (i1, c1) -> c0 `compare` c1) scores
 | 
					 | 
				
			||||||
      in fst m
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    calculate :: String -> NB -> Class -> [Document] -> Double
 | 
					          -- (class, [(word, count)])
 | 
				
			||||||
    calculate text (NB vocabulary megadoc) cls documents =
 | 
					          cw = zip classes $ l (map classWordsCounts classes)
 | 
				
			||||||
      let docs = filter (\(Document text c) -> c == cls) documents
 | 
					
 | 
				
			||||||
          texts = map (\(Document text _) -> text ++ " ") docs
 | 
					      in NB { documents  = documents
 | 
				
			||||||
          classText = concat texts
 | 
					            , classes    = cls
 | 
				
			||||||
          classWords = words classText
 | 
					            , vocabulary = vocabulary
 | 
				
			||||||
          c = genericLength classWords
 | 
					            , megadoc    = megadoc
 | 
				
			||||||
          pc = genericLength docs / genericLength documents
 | 
					            , cd         = cd
 | 
				
			||||||
      in pc * product (map (cword classWords c) (words text))
 | 
					            , cw         = cw
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
      where
 | 
					      where
 | 
				
			||||||
        cword classWords c word =
 | 
					        concatDocs = concatMap (\(Document text _) -> text ++ " ")
 | 
				
			||||||
          let wc = genericLength (filter (==word) classWords)
 | 
					 | 
				
			||||||
          in (wc + 1) / (c + vocabulary)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    createDocuments classes content =
 | 
					        classDocs x = filter ((==x) . c) documents
 | 
				
			||||||
      let splitted = splitOn (replicate 10 '-' ++ "\n") content
 | 
					        classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x)
 | 
				
			||||||
          pairs = map ((head . lines) &&& (concat . tail . lines)) splitted
 | 
					        classWords x = words (classMegadoc x)
 | 
				
			||||||
          documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs
 | 
					        classNGram n = ngram n . classMegadoc
 | 
				
			||||||
      in documents
 | 
					        classVocabulary x = ordNub (classWords x)
 | 
				
			||||||
 | 
					        classPrior x = genericLength (classDocs x) / genericLength documents
 | 
				
			||||||
 | 
					        countWordInDoc d w = genericLength (filter (==w) d)
 | 
				
			||||||
 | 
					        classWordsCounts x =
 | 
				
			||||||
 | 
					          let voc = classVocabulary x
 | 
				
			||||||
 | 
					          in zip voc $ map (countWordInDoc (classWords x)) voc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    ngram :: Int -> String -> [String]
 | 
				
			||||||
 | 
					    ngram n text =
 | 
				
			||||||
 | 
					      let ws = words text
 | 
				
			||||||
 | 
					      in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    run :: String -> NB -> Class
 | 
				
			||||||
 | 
					    run text (NB documents classes vocabulary megadoc cd cw) =
 | 
				
			||||||
 | 
					      let scores = map (score . fst) classes
 | 
				
			||||||
 | 
					      in argmax scores
 | 
				
			||||||
 | 
					      where
 | 
				
			||||||
 | 
					        score c =
 | 
				
			||||||
 | 
					          let prior = snd (classes !! c)
 | 
				
			||||||
 | 
					          in prior * product (map (prob c) (words text))
 | 
				
			||||||
 | 
					        prob c w =
 | 
				
			||||||
 | 
					          let tctM = find ((==w) . fst) (snd (cw !! c))
 | 
				
			||||||
 | 
					              tct  = (snd . fromJust) tctM
 | 
				
			||||||
 | 
					              cvoc = (genericLength . snd) (cw !! c)
 | 
				
			||||||
 | 
					              voc  = vocabulary
 | 
				
			||||||
 | 
					          in
 | 
				
			||||||
 | 
					            if isJust tctM then
 | 
				
			||||||
 | 
					              realToFrac (tct + 1) / realToFrac (cvoc + voc)
 | 
				
			||||||
 | 
					            else
 | 
				
			||||||
 | 
					              1 / realToFrac (cvoc + voc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    argmax :: (Ord a) => [a] -> Int
 | 
				
			||||||
 | 
					    argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    mean :: [Double] -> Double
 | 
				
			||||||
 | 
					    mean x = sum x / genericLength x
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    stdev :: [Double] -> Double
 | 
				
			||||||
 | 
					    stdev x =
 | 
				
			||||||
 | 
					      let avg = mean x
 | 
				
			||||||
 | 
					          variance = sum (map ((^2) . subtract avg) x) / (genericLength x - 1)
 | 
				
			||||||
 | 
					      in sqrt variance
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cleanText :: String -> String
 | 
				
			||||||
 | 
					    cleanText string =
 | 
				
			||||||
 | 
					      let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
 | 
				
			||||||
 | 
					          spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
 | 
				
			||||||
 | 
					          stemmed = unwords $ map (stem Porter) (words spacify)
 | 
				
			||||||
 | 
					      in stemmed
 | 
				
			||||||
 | 
					      where
 | 
				
			||||||
 | 
					        trim = f . f
 | 
				
			||||||
 | 
					          where
 | 
				
			||||||
 | 
					            f = reverse . dropWhile isSpace
 | 
				
			||||||
 | 
					        replace needle replacement =
 | 
				
			||||||
 | 
					          map (\c -> if c == needle then replacement else c)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cleanDocuments :: [Document] -> [Document]
 | 
				
			||||||
 | 
					    cleanDocuments documents =
 | 
				
			||||||
 | 
					      let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
 | 
				
			||||||
 | 
					          wc = wordCounts (concatDocs cleaned)
 | 
				
			||||||
 | 
					          wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
 | 
				
			||||||
 | 
					          stopwords = l $ map fst (take 30 wlist)
 | 
				
			||||||
 | 
					          wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned
 | 
				
			||||||
 | 
					      in wstopwords
 | 
				
			||||||
 | 
					      where
 | 
				
			||||||
 | 
					        vocabulary x = ordNub (words x)
 | 
				
			||||||
 | 
					        countWordInDoc d w = genericLength (filter (==w) d)
 | 
				
			||||||
 | 
					        wordCounts x =
 | 
				
			||||||
 | 
					          let voc = vocabulary x
 | 
				
			||||||
 | 
					          in zip voc $ map (countWordInDoc (words x)) voc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        removeWords list text =
 | 
				
			||||||
 | 
					          unwords $ filter (`notElem` list) (words text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        concatDocs = concatMap (\(Document text _) -> text ++ " ")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    l :: (Show a) => a -> a
 | 
					    l :: (Show a) => a -> a
 | 
				
			||||||
    l a = trace (show a) a
 | 
					    l a = trace (show a) a
 | 
				
			||||||
@@ -100,5 +181,8 @@ module Sibe.NaiveBayes
 | 
				
			|||||||
              then 0
 | 
					              then 0
 | 
				
			||||||
              else t / y
 | 
					              else t / y
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    fmeasure :: Double -> Double -> Double
 | 
					    fmeasure :: [(Int, Int)] -> Double
 | 
				
			||||||
    fmeasure r p = (2 * p * r) / (p + r)
 | 
					    fmeasure results =
 | 
				
			||||||
 | 
					      let r = recall results
 | 
				
			||||||
 | 
					          p = precision results
 | 
				
			||||||
 | 
					      in (2 * p * r) / (p + r)
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -41,7 +41,10 @@ packages:
 | 
				
			|||||||
    commit: 42a88fbcb6bd1d2c4dc18fae5e962bd34fb316a1
 | 
					    commit: 42a88fbcb6bd1d2c4dc18fae5e962bd34fb316a1
 | 
				
			||||||
  subdirs:
 | 
					  subdirs:
 | 
				
			||||||
    - packages/base
 | 
					    - packages/base
 | 
				
			||||||
- '.'
 | 
					- .
 | 
				
			||||||
 | 
					- http://hackage.haskell.org/package/containers-0.5.7.1/containers-0.5.7.1.tar.gz
 | 
				
			||||||
 | 
					- http://hackage.haskell.org/package/text-1.2.2.1/text-1.2.2.1.tar.gz
 | 
				
			||||||
 | 
					- http://hackage.haskell.org/package/stemmer-0.5.2/stemmer-0.5.2.tar.gz
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Dependency packages to be pulled from upstream that are not in the resolver
 | 
					# Dependency packages to be pulled from upstream that are not in the resolver
 | 
				
			||||||
# (e.g., acme-missiles-0.3)
 | 
					# (e.g., acme-missiles-0.3)
 | 
				
			||||||
@@ -70,3 +73,5 @@ extra-package-dbs: []
 | 
				
			|||||||
#
 | 
					#
 | 
				
			||||||
# Allow a newer minor version of GHC than the snapshot specifies
 | 
					# Allow a newer minor version of GHC than the snapshot specifies
 | 
				
			||||||
# compiler-check: newer-minor
 | 
					# compiler-check: newer-minor
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					system-ghc: false
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user