From ea1f05f00181d0e6265bc6e5e25075b38b3b1ba3 Mon Sep 17 00:00:00 2001 From: Mahdi Dibaiee Date: Fri, 5 Aug 2016 23:54:36 +0430 Subject: [PATCH] fix(naivebayes): fix the algorithm to make it actually work feat(cleanDocuments): preprocess documents, use stemming and stopword elimination for better accuracy --- examples/naivebayes-doc-classifier.hs | 32 +++-- examples/naivebayes-sentiment-analysis.hs | 54 ++++++++ examples/sentiment-analysis-data | 1 + examples/xor.hs | 2 +- profiling/run | 6 + profiling/setup | 6 + sibe.cabal | 17 +++ src/Sibe.hs | 29 ++-- src/Sibe/NaiveBayes.hs | 154 +++++++++++++++++----- stack.yaml | 7 +- 10 files changed, 254 insertions(+), 54 deletions(-) create mode 100644 examples/naivebayes-sentiment-analysis.hs create mode 120000 examples/sentiment-analysis-data create mode 100644 profiling/run create mode 100644 profiling/setup diff --git a/examples/naivebayes-doc-classifier.hs b/examples/naivebayes-doc-classifier.hs index 82becbf..f842e8c 100644 --- a/examples/naivebayes-doc-classifier.hs +++ b/examples/naivebayes-doc-classifier.hs @@ -1,12 +1,13 @@ module Main where - import Sibe + -- import Sibe import Sibe.NaiveBayes import Text.Printf import Data.List import Data.Maybe import Debug.Trace import Data.List.Split + import Control.Arrow ((&&&)) main = do dataset <- readFile "examples/doc-classifier-data/data-reuters" @@ -15,18 +16,33 @@ module Main classes <- map (filter (/= ' ')) . lines <$> readFile "examples/doc-classifier-data/data-classes" let intClasses = [0..length classes - 1] - documents = createDocuments classes dataset - testDocuments = createDocuments classes test - devTestDocuments = take 20 testDocuments - nb = initialize documents + -- let intClasses = [0, 1] + documents = cleanDocuments $ createDocuments classes dataset + -- documents = [Document "Chinese Beijing Chinese" 0, + -- Document "Chinese Chinese Shanghai" 0, + -- Document "Chinese Macao" 0, + -- Document "Japan Tokyo Chinese" 1] + -- testDocuments = [Document "Chinese Chinese Chinese Japan Tokyo" 0] + testDocuments = cleanDocuments $ createDocuments classes test + devTestDocuments = take 30 testDocuments + -- devTestDocuments = [Document "Chinese Chinese Chinese Tokyo Japan" 0] + nb = train documents intClasses - results = map (\(Document text c) -> (c, determine text nb intClasses documents)) testDocuments - -- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments + results = map (\(Document text c) -> (c, run text nb)) testDocuments + -- results = map (\(Document text c) -> (c, run text nb)) devTestDocuments + + -- print (text $ head documents) let showResults (c, r) = putStrLn (classes !! c ++ " ~ " ++ classes !! r) mapM_ showResults results putStrLn $ "Recall: " ++ show (recall results) putStrLn $ "Precision: " ++ show (precision results) - putStrLn $ "F Measure: " ++ show (fmeasure (precision results) (recall results)) + putStrLn $ "F Measure: " ++ show (fmeasure results) putStrLn $ "Accuracy: " ++ show (accuracy results) + + createDocuments classes content = + let splitted = splitOn (replicate 10 '-' ++ "\n") content + pairs = map ((head . lines) &&& (unwords . tail . lines)) splitted + documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs + in documents diff --git a/examples/naivebayes-sentiment-analysis.hs b/examples/naivebayes-sentiment-analysis.hs new file mode 100644 index 0000000..b97f83d --- /dev/null +++ b/examples/naivebayes-sentiment-analysis.hs @@ -0,0 +1,54 @@ +{-# LANGUAGE BangPatterns #-} +module Main + where + -- import Sibe + import Sibe.NaiveBayes + import Text.Printf + import Data.List + import Data.Maybe + import Debug.Trace + import Data.List.Split + import System.Directory + import Control.DeepSeq + import System.IO + + main = do + putStr "Reading documents... " + neg_documents <- createDocuments "examples/sentiment-analysis-data/train/neg/" + pos_documents <- createDocuments "examples/sentiment-analysis-data/train/pos/" + + test_neg_documents <- createDocuments "examples/sentiment-analysis-data/test/neg/" + test_pos_documents <- createDocuments "examples/sentiment-analysis-data/test/pos/" + putStrLn "done" + + let classes = [0..9] -- rating, from 0 to 9 (1 to 10) + documents = neg_documents ++ pos_documents + nb = train documents classes + + testDocuments = neg_documents ++ pos_documents + + results = map (\(Document text c) -> (c, run text nb)) testDocuments + -- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments + print results + + -- let showResults (c, r) = putStrLn (show (classes !! c) ++ " ~ " ++ show (classes !! r)) + -- mapM_ showResults results + -- + -- putStrLn $ "Recall: " ++ show (recall results) + -- putStrLn $ "Precision: " ++ show (precision results) + -- putStrLn $ "F Measure: " ++ show (fmeasure results) + -- putStrLn $ "Accuracy: " ++ show (accuracy results) + + createDocuments :: FilePath -> IO [Document] + createDocuments path = do + files <- drop 2 <$> getDirectoryContents path + let ratings = map (subtract 1 . read . take 1 . last . splitOn "_") files :: [Int] + contents <- mapM (forceReadFile . (path ++)) files + return $ zipWith Document contents ratings + + forceReadFile :: FilePath -> IO String + forceReadFile file = do + handle <- openFile file ReadMode + content <- hGetContents handle + content `deepseq` hClose handle + return content diff --git a/examples/sentiment-analysis-data b/examples/sentiment-analysis-data new file mode 120000 index 0000000..dbd75b3 --- /dev/null +++ b/examples/sentiment-analysis-data @@ -0,0 +1 @@ +../../sibe-repos/sentiment-analysis-data \ No newline at end of file diff --git a/examples/xor.hs b/examples/xor.hs index f58acba..5ed8d27 100644 --- a/examples/xor.hs +++ b/examples/xor.hs @@ -7,7 +7,7 @@ module Main where main = do let learning_rate = 0.5 (iterations, epochs) = (2, 1000) - a = (logistic, logistic') + a = (sigmoid, sigmoid') rnetwork = randomNetwork 0 2 [(8, a)] (1, a) -- two inputs, 8 nodes in a single hidden layer, 1 output inputs = [vector [0, 1], vector [1, 0], vector [1, 1], vector [0, 0]] diff --git a/profiling/run b/profiling/run new file mode 100644 index 0000000..b6c2ad8 --- /dev/null +++ b/profiling/run @@ -0,0 +1,6 @@ +#!/bin/bash + +PROG==geniconvert +VIEW==open + +stack build --profile diff --git a/profiling/setup b/profiling/setup new file mode 100644 index 0000000..feb97cd --- /dev/null +++ b/profiling/setup @@ -0,0 +1,6 @@ +#!/bin/bash + +chmod u+x profiling/setup +chmod u+x profiling/run +chmod u+x profiling/compare +chmod u+x profiling/save diff --git a/sibe.cabal b/sibe.cabal index 4a74e99..d401501 100644 --- a/sibe.cabal +++ b/sibe.cabal @@ -22,6 +22,10 @@ library , deepseq , containers , split + , regex-base + , regex-pcre + , text + , stemmer default-language: Haskell2010 executable sibe-exe @@ -53,6 +57,19 @@ executable example-naivebayes-doc-classifier , split default-language: Haskell2010 +executable example-naivebayes-sentiment-analysis + hs-source-dirs: examples + main-is: naivebayes-sentiment-analysis.hs + ghc-options: -threaded -rtsopts -with-rtsopts=-N + build-depends: base + , sibe + , hmatrix + , containers + , split + , directory + , deepseq + default-language: Haskell2010 + test-suite sibe-test type: exitcode-stdio-1.0 hs-source-dirs: test diff --git a/src/Sibe.hs b/src/Sibe.hs index 5f78556..79d4f08 100644 --- a/src/Sibe.hs +++ b/src/Sibe.hs @@ -17,8 +17,10 @@ module Sibe train, session, shuffle, - logistic, - logistic', + sigmoid, + sigmoid', + relu, + relu', crossEntropy, genSeed, replaceVector @@ -88,11 +90,17 @@ module Sibe randomLayer seed (input, h) a :- randomNetwork (seed + 1) h hs output - logistic :: Vector Double -> Vector Double - logistic x = 1 / (1 + exp (-x)) + sigmoid :: Vector Double -> Vector Double + sigmoid x = 1 / max (1 + exp (-x)) 1e-10 - logistic' :: Vector Double -> Vector Double - logistic' x = logistic x * (1 - logistic x) + sigmoid' :: Vector Double -> Vector Double + sigmoid' x = sigmoid x * (1 - sigmoid x) + + relu :: Vector Double -> Vector Double + relu x = log (max (1 + exp x) 1e-10) + + relu' :: Vector Double -> Vector Double + relu' = sigmoid crossEntropy :: Output -> Output -> Double crossEntropy output target = @@ -100,7 +108,7 @@ module Sibe n = fromIntegral (length pairs) in (-1 / n) * sum (map f pairs) where - f (a, y) = y * log a + (1 - y) * log (1 - a) + f (a, y) = y * log (max 1e-10 a) + (1 - y) * log (max (1 - a) 1e-10) train :: Input -> Network @@ -114,8 +122,8 @@ module Sibe let y = runLayer input l o = fn y delta = o - target - -- de = delta * fn' y -- quadratic cost - de = delta -- cross entropy cost + de = delta * fn' y + -- de = delta -- cross entropy cost biases' = biases - scale alpha de weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly @@ -174,3 +182,6 @@ module Sibe rrow index (x:xs) | index == index = value:xs | otherwise = x : rrow (index + 1) xs + + clip :: Double -> (Double, Double) -> Double + clip x (l, u) = min u (max l x) diff --git a/src/Sibe/NaiveBayes.hs b/src/Sibe/NaiveBayes.hs index 7a7cd05..1521a61 100644 --- a/src/Sibe/NaiveBayes.hs +++ b/src/Sibe/NaiveBayes.hs @@ -1,15 +1,17 @@ module Sibe.NaiveBayes (Document(..), NB(..), - createDocuments, - initialize, - calculate, - determine, + train, + run, ordNub, accuracy, precision, recall, fmeasure, + mean, + stdev, + cleanText, + cleanDocuments, ) where import Data.List @@ -18,47 +20,126 @@ module Sibe.NaiveBayes import Data.List.Split import Data.Maybe import Control.Arrow ((&&&)) - type Class = Int + import Text.Regex.PCRE + import Data.Char (isSpace) + import NLP.Stemmer + + type Class = Int; data Document = Document { text :: String , c :: Class } deriving (Eq, Show, Read) - data NB = NB { vocabulary :: Double + data NB = NB { documents :: [Document] + , classes :: [(Class, Double)] + , vocabulary :: Int , megadoc :: String - } + , cd :: [(Class, [Document])] + , cw :: [(Class, [(String, Int)])] + } deriving (Eq, Show, Read) - initialize :: [Document] -> NB - initialize documents = - let megadoc = concatMap (\(Document text _) -> text ++ " ") documents + train :: [Document] -> [Class] -> NB + train documents classes = + let megadoc = concatDocs documents vocabulary = genericLength ((ordNub . words) megadoc) - in NB vocabulary megadoc + -- (class, prior probability) + cls = zip classes (map classPrior classes) - determine :: String -> NB -> [Class] -> [Document] -> Class - determine text nb classes documents = - let scores = zip [0..] (map (\cls -> calculate text nb cls documents) classes) - m = maximumBy (\(i0, c0) (i1, c1) -> c0 `compare` c1) scores - in fst m + -- (class, [document]) + cd = zip classes (map classDocs classes) - calculate :: String -> NB -> Class -> [Document] -> Double - calculate text (NB vocabulary megadoc) cls documents = - let docs = filter (\(Document text c) -> c == cls) documents - texts = map (\(Document text _) -> text ++ " ") docs - classText = concat texts - classWords = words classText - c = genericLength classWords - pc = genericLength docs / genericLength documents - in pc * product (map (cword classWords c) (words text)) + -- (class, [(word, count)]) + cw = zip classes $ l (map classWordsCounts classes) + + in NB { documents = documents + , classes = cls + , vocabulary = vocabulary + , megadoc = megadoc + , cd = cd + , cw = cw + } where - cword classWords c word = - let wc = genericLength (filter (==word) classWords) - in (wc + 1) / (c + vocabulary) + concatDocs = concatMap (\(Document text _) -> text ++ " ") - createDocuments classes content = - let splitted = splitOn (replicate 10 '-' ++ "\n") content - pairs = map ((head . lines) &&& (concat . tail . lines)) splitted - documents = map (\(topic, text) -> Document text (fromJust $ elemIndex topic classes)) pairs - in documents + classDocs x = filter ((==x) . c) documents + classMegadoc x = concatMap (\(Document text _) -> text ++ " ") (classDocs x) + classWords x = words (classMegadoc x) + classNGram n = ngram n . classMegadoc + classVocabulary x = ordNub (classWords x) + classPrior x = genericLength (classDocs x) / genericLength documents + countWordInDoc d w = genericLength (filter (==w) d) + classWordsCounts x = + let voc = classVocabulary x + in zip voc $ map (countWordInDoc (classWords x)) voc + + ngram :: Int -> String -> [String] + ngram n text = + let ws = words text + in map (\(i, w) -> unwords $ w:((take (n - 1) . drop (i+1)) ws)) (zip [0..] ws) + + run :: String -> NB -> Class + run text (NB documents classes vocabulary megadoc cd cw) = + let scores = map (score . fst) classes + in argmax scores + where + score c = + let prior = snd (classes !! c) + in prior * product (map (prob c) (words text)) + prob c w = + let tctM = find ((==w) . fst) (snd (cw !! c)) + tct = (snd . fromJust) tctM + cvoc = (genericLength . snd) (cw !! c) + voc = vocabulary + in + if isJust tctM then + realToFrac (tct + 1) / realToFrac (cvoc + voc) + else + 1 / realToFrac (cvoc + voc) + + argmax :: (Ord a) => [a] -> Int + argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x) + + mean :: [Double] -> Double + mean x = sum x / genericLength x + + stdev :: [Double] -> Double + stdev x = + let avg = mean x + variance = sum (map ((^2) . subtract avg) x) / (genericLength x - 1) + in sqrt variance + + cleanText :: String -> String + cleanText string = + let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string) + spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r'] + stemmed = unwords $ map (stem Porter) (words spacify) + in stemmed + where + trim = f . f + where + f = reverse . dropWhile isSpace + replace needle replacement = + map (\c -> if c == needle then replacement else c) + + cleanDocuments :: [Document] -> [Document] + cleanDocuments documents = + let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents + wc = wordCounts (concatDocs cleaned) + wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc + stopwords = l $ map fst (take 30 wlist) + wstopwords = map (\(Document text c) -> Document (removeWords stopwords text) c) cleaned + in wstopwords + where + vocabulary x = ordNub (words x) + countWordInDoc d w = genericLength (filter (==w) d) + wordCounts x = + let voc = vocabulary x + in zip voc $ map (countWordInDoc (words x)) voc + + removeWords list text = + unwords $ filter (`notElem` list) (words text) + + concatDocs = concatMap (\(Document text _) -> text ++ " ") l :: (Show a) => a -> a l a = trace (show a) a @@ -100,5 +181,8 @@ module Sibe.NaiveBayes then 0 else t / y - fmeasure :: Double -> Double -> Double - fmeasure r p = (2 * p * r) / (p + r) + fmeasure :: [(Int, Int)] -> Double + fmeasure results = + let r = recall results + p = precision results + in (2 * p * r) / (p + r) diff --git a/stack.yaml b/stack.yaml index 305964d..d02fe3e 100644 --- a/stack.yaml +++ b/stack.yaml @@ -41,7 +41,10 @@ packages: commit: 42a88fbcb6bd1d2c4dc18fae5e962bd34fb316a1 subdirs: - packages/base -- '.' +- . +- http://hackage.haskell.org/package/containers-0.5.7.1/containers-0.5.7.1.tar.gz +- http://hackage.haskell.org/package/text-1.2.2.1/text-1.2.2.1.tar.gz +- http://hackage.haskell.org/package/stemmer-0.5.2/stemmer-0.5.2.tar.gz # Dependency packages to be pulled from upstream that are not in the resolver # (e.g., acme-missiles-0.3) @@ -70,3 +73,5 @@ extra-package-dbs: [] # # Allow a newer minor version of GHC than the snapshot specifies # compiler-check: newer-minor + +system-ghc: false