diff --git a/.gitmodules b/.gitmodules index ff2a935..17f4304 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "examples/doc-classifier-data"] path = examples/doc-classifier-data url = git@github.com:mdibaiee/doc-classifier-data +[submodule "examples/notMNIST"] + path = examples/notMNIST + url = git@github.com:mdibaiee/notMNIST diff --git a/examples/424encoder.hs b/examples/424encoder.hs new file mode 100644 index 0000000..4b93941 --- /dev/null +++ b/examples/424encoder.hs @@ -0,0 +1,49 @@ +module Main where + import Sibe + import Numeric.LinearAlgebra + import Data.List + import Debug.Trace + import Data.Default.Class + + main = do + let alpha = 0.5 + epochs = 1000 + a = (sigmoid, sigmoid') + rnetwork = randomNetwork 0 (-0.1, 0.1) 4 [(2, a)] (4, a) + + inputs = [vector [1, 0, 0, 0], + vector [0, 1, 0, 0], + vector [0, 0, 1, 0], + vector [0, 0, 0, 1]] + + labels = [vector [1, 0, 0, 0], + vector [0, 1, 0, 0], + vector [0, 0, 1, 0], + vector [0, 0, 0, 1]] + + session = def { network = rnetwork + , learningRate = 0.5 + , epochs = 1000 + , training = zip inputs labels + , test = zip inputs labels + } :: Session + + let initialCost = crossEntropy session + + newsession <- run gd session + + let results = map (`forward` newsession) inputs + rounded = map (map round . toList) results + + cost = crossEntropy newsession + + putStrLn "parameters: " + putStrLn $ "- inputs: " ++ show inputs + putStrLn $ "- labels: " ++ show labels + putStrLn $ "- learning rate: " ++ show alpha + putStrLn $ "- epochs: " ++ show epochs + putStrLn $ "- initial cost (cross-entropy): " ++ show initialCost + putStrLn "results: " + putStrLn $ "- actual result: " ++ show results + putStrLn $ "- rounded result: " ++ show rounded + putStrLn $ "- cost (cross-entropy): " ++ show cost diff --git a/examples/naivebayes-doc-classifier.hs b/examples/naivebayes-doc-classifier.hs index f0d4985..0e7581e 100644 --- a/examples/naivebayes-doc-classifier.hs +++ b/examples/naivebayes-doc-classifier.hs @@ -1,6 +1,7 @@ module Main where -- import Sibe + import Sibe.NLP import Sibe.NaiveBayes import Text.Printf import Data.List @@ -28,14 +29,14 @@ module Main documents = cleanDocuments . removeWords sws $ createDocuments classes dataset testDocuments = cleanDocuments $ createDocuments classes test - nb = train documents intClasses + nb = initialize documents intClasses -- top-ten topClasses = take 10 . reverse $ sortBy (compare `on` (length . snd)) (cd nb) filtered = map (\(c, ds) -> (c, take 100 ds)) topClasses filteredClasses = map fst filtered ttDocs = concatMap snd filtered - ttNB = train ttDocs filteredClasses + ttNB = initialize ttDocs filteredClasses ttTestDocuments = filter ((`elem` filteredClasses) . c) . cleanDocuments $ createDocuments classes test diff --git a/examples/naivebayes-sentiment-analysis.hs b/examples/naivebayes-sentiment-analysis.hs deleted file mode 100644 index b97f83d..0000000 --- a/examples/naivebayes-sentiment-analysis.hs +++ /dev/null @@ -1,54 +0,0 @@ -{-# LANGUAGE BangPatterns #-} -module Main - where - -- import Sibe - import Sibe.NaiveBayes - import Text.Printf - import Data.List - import Data.Maybe - import Debug.Trace - import Data.List.Split - import System.Directory - import Control.DeepSeq - import System.IO - - main = do - putStr "Reading documents... 
" - neg_documents <- createDocuments "examples/sentiment-analysis-data/train/neg/" - pos_documents <- createDocuments "examples/sentiment-analysis-data/train/pos/" - - test_neg_documents <- createDocuments "examples/sentiment-analysis-data/test/neg/" - test_pos_documents <- createDocuments "examples/sentiment-analysis-data/test/pos/" - putStrLn "done" - - let classes = [0..9] -- rating, from 0 to 9 (1 to 10) - documents = neg_documents ++ pos_documents - nb = train documents classes - - testDocuments = neg_documents ++ pos_documents - - results = map (\(Document text c) -> (c, run text nb)) testDocuments - -- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments - print results - - -- let showResults (c, r) = putStrLn (show (classes !! c) ++ " ~ " ++ show (classes !! r)) - -- mapM_ showResults results - -- - -- putStrLn $ "Recall: " ++ show (recall results) - -- putStrLn $ "Precision: " ++ show (precision results) - -- putStrLn $ "F Measure: " ++ show (fmeasure results) - -- putStrLn $ "Accuracy: " ++ show (accuracy results) - - createDocuments :: FilePath -> IO [Document] - createDocuments path = do - files <- drop 2 <$> getDirectoryContents path - let ratings = map (subtract 1 . read . take 1 . last . splitOn "_") files :: [Int] - contents <- mapM (forceReadFile . (path ++)) files - return $ zipWith Document contents ratings - - forceReadFile :: FilePath -> IO String - forceReadFile file = do - handle <- openFile file ReadMode - content <- hGetContents handle - content `deepseq` hClose handle - return content diff --git a/examples/notMNIST b/examples/notMNIST new file mode 160000 index 0000000..0dbdfd4 --- /dev/null +++ b/examples/notMNIST @@ -0,0 +1 @@ +Subproject commit 0dbdfd43ffb8e90a3657ed040fd1fb3d25654b51 diff --git a/examples/notmnist.hs b/examples/notmnist.hs new file mode 100644 index 0000000..d913377 --- /dev/null +++ b/examples/notmnist.hs @@ -0,0 +1,111 @@ +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE ScopedTypeVariables #-} + +module Main where + import Sibe + import Numeric.LinearAlgebra + import Data.List + import Debug.Trace + import System.IO + import System.Directory + import Codec.Picture + import Codec.Picture.Types + import qualified Data.Vector.Storable as V + import Data.Either + import System.Random + import System.Random.Shuffle + import Data.Default.Class + + import qualified Graphics.Rendering.Chart.Easy as Chart + import Graphics.Rendering.Chart.Backend.Cairo + + main = do + setStdGen (mkStdGen 100) + + let a = (sigmoid, sigmoid') + o = (softmax, one) + rnetwork = randomNetwork 0 (-1, 1) (28*28) [(100, a)] (10, a) + + (inputs, labels) <- dataset + + let trp = length inputs * 70 `div` 100 + tep = length inputs * 30 `div` 100 + + -- training data + trinputs = take trp inputs + trlabels = take trp labels + + -- test data + teinputs = take tep . drop trp $ inputs + telabels = take tep . drop trp $ labels + + let session = def { learningRate = 0.5 + , batchSize = 32 + , epochs = 35 + , network = rnetwork + , training = zip trinputs trlabels + , test = zip teinputs telabels + } :: Session + + let initialCost = crossEntropy session + + newsession <- run (sgd . 
learningRateDecay (1.1, 5e-2)) session + + let el = map (\(e, l, _) -> (e, l)) (chart newsession) + ea = map (\(e, _, a) -> (e, a)) (chart newsession) + toFile Chart.def "notmnist.png" $ do + Chart.layoutlr_title Chart..= "loss over time" + Chart.plotLeft (Chart.line "loss" [el]) + Chart.plotRight (Chart.line "learningRate" [ea]) + + + let cost = crossEntropy newsession + + putStrLn "parameters: " + putStrLn $ "- batch size: " ++ show (batchSize session) + putStrLn $ "- learning rate: " ++ show (learningRate session) + putStrLn $ "- epochs: " ++ show (epochs session) + putStrLn $ "- initial cost (cross-entropy): " ++ show initialCost + putStrLn "results: " + putStrLn $ "- accuracy: " ++ show (accuracy newsession) + putStrLn $ "- cost (cross-entropy): " ++ show cost + + dataset :: IO ([Vector Double], [Vector Double]) + dataset = do + let dir = "examples/notMNIST/" + + groups <- filter ((/= '.') . head) <$> listDirectory dir + + inputFiles <- mapM (listDirectory . (dir ++)) groups + + let n = 512 {-- minimum (map length inputFiles) --} + numbers = map (`div` n) [0..n * length groups - 1] + inputFilesFull = map (\(i, g) -> map ((dir ++ i ++ "/") ++) g) (zip groups inputFiles) + + + inputImages <- mapM (mapM readImage . take n) inputFilesFull + + let names = map (take n) inputFilesFull + + let (l, r) = partitionEithers $ concat inputImages + inputs = map (fromPixels . convertRGB8) r + labels = map (\i -> V.replicate i 0 `V.snoc` 1 V.++ V.replicate (9 - i) 0) numbers + + pairs = zip inputs labels + + shuffled <- shuffleM pairs + return (map fst shuffled, map snd shuffled) + + where + fromPixels :: Image PixelRGB8 -> Vector Double + fromPixels img@Image { .. } = + let pairs = [(x, y) | x <- [0..imageWidth - 1], y <- [0..imageHeight - 1]] + in V.fromList $ map iter pairs + where + iter (x, y) = + let (PixelRGB8 r g b) = convertPixel $ pixelAt img x y + in + if r == 0 && g == 0 && b == 0 then 0 else 1 + + diff --git a/examples/recurrent-doc-classifier.hs b/examples/recurrent-doc-classifier.hs new file mode 100644 index 0000000..e69de29 diff --git a/examples/sentiment-analysis-data b/examples/sentiment-analysis-data deleted file mode 100755 index f0a3f50..0000000 --- a/examples/sentiment-analysis-data +++ /dev/null @@ -1,5 +0,0 @@ -XSym -0040 -3666c4cacaf995ebd11ef25aab70de99 -../../sibe-repos/sentiment-analysis-data - \ No newline at end of file diff --git a/examples/sin.hs b/examples/sin.hs new file mode 100644 index 0000000..8dc2616 --- /dev/null +++ b/examples/sin.hs @@ -0,0 +1,35 @@ +module Main where + import Sibe + import Numeric.LinearAlgebra + import Data.List + import Debug.Trace + + main = do + let alpha = 0.5 + epochs = 1000 + a = (sigmoid, sigmoid') + lo = (sigmoid, (\_ -> 1)) -- cross entropy + -- a = (relu, relu') + rnetwork = randomNetwork 0 (-1, 1) 1 [(50, a)] (1, lo) + + inputs = map (\a -> vector [a]) (reverse [0, 30, 45, 60, 90]) + labels = map (\deg -> vector $ [sin $ deg * pi/180]) (reverse [0, 30, 45, 60, 90]) + + initial_cost = zipWith crossEntropy (map (`forward` rnetwork) inputs) labels + + network <- run session inputs rnetwork labels alpha epochs + + let results = map (`forward` network) inputs + rounded = map (map round . 
toList) results + + cost = zipWith crossEntropy (map (`forward` network) inputs) labels + + putStrLn "parameters: " + putStrLn $ "- inputs: " ++ show inputs + putStrLn $ "- labels: " ++ show labels + putStrLn $ "- learning rate: " ++ show alpha + putStrLn $ "- epochs: " ++ show epochs + {-putStrLn $ "- initial cost (cross-entropy): " ++ show initial_cost-} + putStrLn "results: " + putStrLn $ "- actual result: " ++ show results + {-putStrLn $ "- cost (cross-entropy): " ++ show cost-} diff --git a/examples/xor.hs b/examples/xor.hs index 5ed8d27..92eedcf 100644 --- a/examples/xor.hs +++ b/examples/xor.hs @@ -3,30 +3,37 @@ module Main where import Numeric.LinearAlgebra import Data.List import Debug.Trace + import Data.Default.Class main = do - let learning_rate = 0.5 - (iterations, epochs) = (2, 1000) - a = (sigmoid, sigmoid') - rnetwork = randomNetwork 0 2 [(8, a)] (1, a) -- two inputs, 8 nodes in a single hidden layer, 1 output + let a = (sigmoid, sigmoid') + rnetwork = randomNetwork 0 (-1, 1) 2 [(2, a)] (1, a) -- two inputs, 8 nodes in a single hidden layer, 1 output inputs = [vector [0, 1], vector [1, 0], vector [1, 1], vector [0, 0]] labels = [vector [1], vector [1], vector [0], vector [0]] - initial_cost = zipWith crossEntropy (map (`forward` rnetwork) inputs) labels + session = def { network = rnetwork + , learningRate = 0.5 + , epochs = 1000 + , training = zip inputs labels + , test = zip inputs labels + } :: Session - network = session inputs rnetwork labels learning_rate (iterations, epochs) - results = map (`forward` network) inputs + initialCost = crossEntropy session + + newsession <- run gd session + + let results = map (`forward` newsession) inputs rounded = map (map round . toList) results - cost = zipWith crossEntropy (map (`forward` network) inputs) labels + cost = crossEntropy newsession putStrLn "parameters: " putStrLn $ "- inputs: " ++ show inputs putStrLn $ "- labels: " ++ show labels - putStrLn $ "- learning rate: " ++ show learning_rate - putStrLn $ "- iterations/epochs: " ++ show (iterations, epochs) - putStrLn $ "- initial cost (cross-entropy): " ++ show initial_cost + putStrLn $ "- learning rate: " ++ show (learningRate session) + putStrLn $ "- epochs: " ++ show (epochs session) + putStrLn $ "- initial cost (cross-entropy): " ++ show initialCost putStrLn "results: " putStrLn $ "- actual result: " ++ show results putStrLn $ "- rounded result: " ++ show rounded diff --git a/notmnist-0.png b/notmnist-0.png new file mode 100644 index 0000000..1bc6967 Binary files /dev/null and b/notmnist-0.png differ diff --git a/notmnist-1.png b/notmnist-1.png new file mode 100644 index 0000000..a6aa761 Binary files /dev/null and b/notmnist-1.png differ diff --git a/notmnist-2.png b/notmnist-2.png new file mode 100644 index 0000000..c3252d9 Binary files /dev/null and b/notmnist-2.png differ diff --git a/notmnist.png b/notmnist.png new file mode 100644 index 0000000..867fd5f Binary files /dev/null and b/notmnist.png differ diff --git a/sgd.png b/sgd.png new file mode 100644 index 0000000..1c0c683 Binary files /dev/null and b/sgd.png differ diff --git a/sibe.cabal b/sibe.cabal index d401501..876c55b 100644 --- a/sibe.cabal +++ b/sibe.cabal @@ -15,7 +15,7 @@ cabal-version: >=1.10 library hs-source-dirs: src - exposed-modules: Sibe, Sibe.NaiveBayes + exposed-modules: Sibe, Sibe.NaiveBayes, Sibe.NLP build-depends: base >= 4.7 && < 5 , hmatrix , random @@ -26,16 +26,21 @@ library , regex-pcre , text , stemmer + , vector + , random-shuffle + , data-default-class + , Chart + , Chart-cairo 
default-language: Haskell2010 -executable sibe-exe - hs-source-dirs: app - main-is: Main.hs - ghc-options: -threaded -rtsopts -with-rtsopts=-N - build-depends: base - , sibe - , hmatrix - default-language: Haskell2010 +--executable sibe-exe + --hs-source-dirs: app + --main-is: Main.hs + --ghc-options: -threaded -rtsopts -with-rtsopts=-N + --build-depends: base + --, sibe + --, hmatrix + --default-language: Haskell2010 executable example-xor hs-source-dirs: examples @@ -44,6 +49,43 @@ executable example-xor build-depends: base , sibe , hmatrix + , data-default-class + default-language: Haskell2010 + +--executable example-sin + --hs-source-dirs: examples + --main-is: sin.hs + --ghc-options: -threaded -rtsopts -with-rtsopts=-N + --build-depends: base + --, sibe + --, hmatrix + --default-language: Haskell2010 + +executable example-424 + hs-source-dirs: examples + main-is: 424encoder.hs + ghc-options: -threaded -rtsopts -with-rtsopts=-N + build-depends: base + , sibe + , hmatrix + , data-default-class + default-language: Haskell2010 + +executable example-notmnist + hs-source-dirs: examples + main-is: notmnist.hs + ghc-options: -threaded -rtsopts -with-rtsopts=-N + build-depends: base + , sibe + , hmatrix + , directory >= 1.2.5.0 + , JuicyPixels == 3.2.7.2 + , vector == 0.11.0.0 + , random + , random-shuffle + , data-default-class + , Chart + , Chart-cairo default-language: Haskell2010 executable example-naivebayes-doc-classifier @@ -57,19 +99,6 @@ executable example-naivebayes-doc-classifier , split default-language: Haskell2010 -executable example-naivebayes-sentiment-analysis - hs-source-dirs: examples - main-is: naivebayes-sentiment-analysis.hs - ghc-options: -threaded -rtsopts -with-rtsopts=-N - build-depends: base - , sibe - , hmatrix - , containers - , split - , directory - , deepseq - default-language: Haskell2010 - test-suite sibe-test type: exitcode-stdio-1.0 hs-source-dirs: test diff --git a/src/Sibe.hs b/src/Sibe.hs index 79d4f08..46a0673 100644 --- a/src/Sibe.hs +++ b/src/Sibe.hs @@ -15,22 +15,37 @@ module Sibe saveNetwork, loadNetwork, train, - session, - shuffle, + gd, + sgd, + run, sigmoid, sigmoid', + softmax, + softmax', + one, relu, relu', crossEntropy, genSeed, - replaceVector + replaceVector, + Session(..), + accuracy, + learningRateDecay ) where import Numeric.LinearAlgebra import System.Random + import System.Random.Shuffle import Debug.Trace - import Data.List (foldl', sortBy) + import Data.List (foldl', sortBy, genericLength, permutations) import System.IO import Control.DeepSeq + import Control.Monad + import qualified Data.Vector.Storable as V + import Data.Default.Class + import System.Exit + + import qualified Graphics.Rendering.Chart.Easy as Chart + import Graphics.Rendering.Chart.Backend.Cairo type LearningRate = Double type Input = Vector Double @@ -48,8 +63,33 @@ module Sibe data Network = O Layer | Layer :- Network deriving (Show) + infixr 5 :- + data Session = Session { network :: Network + , training :: [(Vector Double, Vector Double)] + , test :: [(Vector Double, Vector Double)] + , learningRate :: Double + , epochs :: Int + , epoch :: Int + , batchSize :: Int + , chart :: [(Int, Double, Double)] + , momentum :: Double + } + + emptyNetwork = randomNetwork 0 (0, 0) 0 [] (0, (id, id)) + instance Default Session where + def = Session { network = seq (die "You have not specified a network parameter") emptyNetwork + , training = seq (die "You have not specified training data") [] + , test = seq (die "You have not specified test data") [] + , learningRate = 0.5 + , 
epochs = 35 + , epoch = 0 + , batchSize = 0 + , chart = [] + , momentum = 0 + } + saveNetwork :: Network -> String -> IO () saveNetwork network file = writeFile file ((show . reverse) (gen network [])) @@ -73,22 +113,24 @@ module Sibe runLayer :: Input -> Layer -> Output runLayer input (Layer !biases !weights _) = input <# weights + biases - forward :: Input -> Network -> Output - forward input (O l@(Layer _ _ (fn, _))) = fn $ runLayer input l - forward input (l@(Layer _ _ (fn, _)) :- n) = forward ((fst . activation $ l) $ runLayer input l) n + forward :: Input -> Session -> Output + forward input session = compute input (network session) + where + compute input (O l@(Layer _ _ (fn, _))) = fn $ runLayer input l + compute input (l@(Layer _ _ (fn, _)) :- n) = compute ((fst . activation $ l) $ runLayer input l) n - randomLayer :: Seed -> (Int, Int) -> Activation -> Layer - randomLayer seed (wr, wc) = - let weights = uniformSample seed wr $ replicate wc (-1, 1) - biases = randomVector seed Uniform wc * 2 - 1 + randomLayer :: Seed -> (Int, Int) -> (Double, Double) -> Activation -> Layer + randomLayer seed (wr, wc) (l, u) = + let weights = uniformSample seed wr $ replicate wc (l, u) + biases = randomVector seed Uniform wc * realToFrac u - realToFrac l in Layer biases weights - randomNetwork :: Seed -> Int -> [(Int, Activation)] -> (Int, Activation) -> Network - randomNetwork seed input [] (output, a) = - O $ randomLayer seed (input, output) a - randomNetwork seed input ((h, a):hs) output = - randomLayer seed (input, h) a :- - randomNetwork (seed + 1) h hs output + randomNetwork :: Seed -> (Double, Double) -> Int -> [(Int, Activation)] -> (Int, Activation) -> Network + randomNetwork seed bound input [] (output, a) = + O $ randomLayer seed (input, output) bound a + randomNetwork seed bound input ((h, a):hs) output = + randomLayer seed (input, h) bound a :- + randomNetwork (seed + 1) bound h hs output sigmoid :: Vector Double -> Vector Double sigmoid x = 1 / max (1 + exp (-x)) 1e-10 @@ -96,18 +138,37 @@ module Sibe sigmoid' :: Vector Double -> Vector Double sigmoid' x = sigmoid x * (1 - sigmoid x) + softmax :: Vector Double -> Vector Double + softmax x = cmap (\a -> exp a / s) x + where + s = V.sum $ exp x + + one :: a -> Double + one x = 1 + + softmax' :: Vector Double -> Vector Double + softmax' x = softmax x * (1 - softmax x) + relu :: Vector Double -> Vector Double - relu x = log (max (1 + exp x) 1e-10) + relu = cmap (max 0.1) relu' :: Vector Double -> Vector Double - relu' = sigmoid + relu' = cmap dev + where dev x + | x < 0 = 0 + | otherwise = 1 - crossEntropy :: Output -> Output -> Double - crossEntropy output target = - let pairs = zip (toList output) (toList target) - n = fromIntegral (length pairs) - in (-1 / n) * sum (map f pairs) + crossEntropy :: Session -> Double + crossEntropy session = + let inputs = map fst (test session) + labels = map (toList . snd) (test session) + outputs = map (toList . 
(`forward` session)) inputs + pairs = zip outputs labels + n = genericLength pairs + + in sum (map set pairs) / n where + set (os, ls) = (-1 / genericLength os) * sum (zipWith (curry f) os ls) f (a, y) = y * log (max 1e-10 a) + (1 - y) * log (max (1 - a) 1e-10) train :: Input @@ -138,35 +199,137 @@ module Sibe o = fn y (n', delta) = run o n - de = delta * fn' y -- quadratic cost + de = delta * fn' y - biases' = biases - scale alpha de - weights' = weights - scale alpha (input `outer` de) + biases' = biases - cmap (*alpha) de + weights' = weights - cmap (*alpha) (input `outer` de) layer = Layer biases' weights' (fn, fn') pass = weights #> de -- pass = weights #> de in (layer :- n', pass) - session :: [Input] -> Network -> [Output] -> Double -> (Int, Int) -> Network - session inputs network labels alpha (iterations, epochs) = - let n = length inputs - indexes = shuffle n (map (`mod` n) [0..n * epochs]) - in foldl' iter network indexes + {-trainMomentum :: Input + -> Network + -> Output -- target + -> Double -- learning rate + -> (Double, Double) -- momentum + -> Network -- network's output + trainMomentum input network target alpha (m, v) = fst $ run input network where - iter net i = - let n = length inputs - index = i `mod` n - input = inputs !! index - label = labels !! index - in foldl' (\net _ -> train input net label alpha) net [0..iterations] + run :: Input -> Network -> (Network, Vector Double) + run input (O l@(Layer biases weights (fn, fn'))) = + let y = runLayer input l + o = fn y + delta = o - target + de = delta * fn' y + v = + -- de = delta -- cross entropy cost - shuffle :: Seed -> [a] -> [a] - shuffle seed list = - let ords = map ord $ take (length list) (randomRs (0, 1) (mkStdGen seed) :: [Int]) - in map snd $ sortBy (\x y -> fst x) (zip ords list) - where ord x | x == 0 = LT - | x == 1 = GT + biases' = biases - scale alpha de + weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly + layer = Layer biases' weights' (fn, fn') -- updated layer + + pass = weights #> de + -- pass = weights #> de + + in (O layer, pass) + run input (l@(Layer biases weights (fn, fn')) :- n) = + let y = runLayer input l + o = fn y + (n', delta) = run o n + + de = delta * fn' y + + biases' = biases - cmap (*alpha) de + weights' = weights - cmap (*alpha) (input `outer` de) + layer = Layer biases' weights' (fn, fn') + + pass = weights #> de + -- pass = weights #> de + in (layer :- n', pass)-} + + gd :: Session -> IO Session + gd session = do + seed <- newStdGen + + let pairs = training session + alpha = learningRate session + net = network session + + let n = length pairs + + shuffled <- shuffleM pairs + + let newnet = foldl' (\n (input, label) -> train input n label alpha) net pairs + + return session { network = newnet + , epoch = epoch session + 1 + } + + sgd :: Session -> IO Session + sgd session = do + seed <- newStdGen + + let pairs = training session + bsize = batchSize session + alpha = learningRate session + net = network session + + let n = length pairs + iterations = n `div` bsize - 1 + + shuffled <- shuffleM pairs + + let iter net i = + let n = length pairs + batch = take bsize . 
drop (i * bsize) $ shuffled + batchInputs = map fst batch + batchLabels = map snd batch + batchPair = zip batchInputs batchLabels + in foldl' (\n (input, label) -> train input n label alpha) net batchPair + + let newnet = foldl' iter net [0..iterations] + cost = crossEntropy (session { network = newnet }) + + let el = map (\(e, l, _) -> (e, l)) (chart session) + ea = map (\(e, _, a) -> (e, a)) (chart session) + + putStrLn $ (show $ epoch session) ++ " => " ++ (show cost) ++ " @ " ++ (show $ learningRate session) + + toFile Chart.def "sgd.png" $ do + Chart.layoutlr_title Chart..= "loss over time" + Chart.plotLeft (Chart.line "loss" [el]) + Chart.plotRight (Chart.line "learningRate" [ea]) + + return session { network = newnet + , epoch = epoch session + 1 + , chart = (epoch session, cost, learningRate session):chart session + } + + + accuracy :: Session -> Double + accuracy session = + let inputs = map fst (test session) + labels = map snd (test session) + + results = map (`forward` session) inputs + rounded = map (map round . toList) results + + equals = zipWith (==) rounded (map (map round . toList) labels) + in genericLength (filter (== True) equals) / genericLength inputs + + learningRateDecay :: (Double, Double) -> Session -> Session + learningRateDecay (step, m) session = + session { learningRate = max m $ learningRate session / step } + + run :: (Session -> IO Session) + -> Session -> IO Session + run fn session = foldM (\s i -> fn s) session [0..epochs session] + + factorial :: Int -> Int + factorial 0 = 1 + factorial x = x * factorial (x - 1) genSeed :: IO Seed genSeed = do @@ -176,12 +339,7 @@ module Sibe replaceVector :: Vector Double -> Int -> Double -> Vector Double replaceVector vec index value = let list = toList vec - in fromList $ rrow index list - where - rrow index [] = [] - rrow index (x:xs) - | index == index = value:xs - | otherwise = x : rrow (index + 1) xs + in fromList $ take index list ++ value : drop (index + 1) list clip :: Double -> (Double, Double) -> Double clip x (l, u) = min u (max l x) diff --git a/src/Sibe/LogisticRegression.hs b/src/Sibe/LogisticRegression.hs new file mode 100644 index 0000000..e69de29 diff --git a/src/Sibe/NLP.hs b/src/Sibe/NLP.hs new file mode 100644 index 0000000..29e63c5 --- /dev/null +++ b/src/Sibe/NLP.hs @@ -0,0 +1,129 @@ +module Sibe.NLP + (Class, + Document(..), + ordNub, + accuracy, + recall, + precision, + fmeasure, + cleanText, + cleanDocuments, + removeWords, + removeStopwords, + ngram, + ngramText, + ) + where + import Data.List + import Debug.Trace + import qualified Data.Set as Set + import Data.List.Split + import Data.Maybe + import Control.Arrow ((&&&)) + import Text.Regex.PCRE + import Data.Char (isSpace, isNumber, toLower) + import NLP.Stemmer + + type Class = Int; + + data Document = Document { text :: String + , c :: Class + } deriving (Eq, Show, Read) + + + cleanText :: String -> String + cleanText string = + let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string) + spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r'] + stemmed = unwords $ map (stem Porter) (words spacify) + nonumber = filter (not . isNumber) stemmed + lower = map toLower nonumber + in (unwords . words) lower -- remove unnecessary spaces + where + trim = f . f + where + f = reverse . 
dropWhile isSpace + replace needle replacement = + map (\c -> if c == needle then replacement else c) + + cleanDocuments :: [Document] -> [Document] + cleanDocuments documents = + let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents + in cleaned + + removeWords :: [String] -> [Document] -> [Document] + removeWords ws documents = + map (\(Document text c) -> Document (rm ws text) c) documents + where + rm list text = + unwords $ filter (`notElem` list) (words text) + + removeStopwords :: Int -> [Document] -> [Document] + removeStopwords i documents = + let wc = wordCounts (concatDocs documents) + wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc + stopwords = map fst (take i wlist) + in removeWords stopwords documents + where + vocabulary x = ordNub (words x) + countWordInDoc d w = genericLength (filter (==w) d) + wordCounts x = + let voc = vocabulary x + in zip voc $ map (countWordInDoc (words x)) voc + + concatDocs = concatMap (\(Document text _) -> text ++ " ") + + ordNub :: (Ord a) => [a] -> [a] + ordNub = go Set.empty + where + go _ [] = [] + go s (x:xs) = if x `Set.member` s then go s xs + else x : go (Set.insert x s) xs + + accuracy :: [(Int, (Int, Double))] -> Double + accuracy results = + let pairs = map (\(a, b) -> (a, fst b)) results + correct = filter (uncurry (==)) pairs + in genericLength correct / genericLength results + + recall :: [(Int, (Int, Double))] -> Double + recall results = + let classes = ordNub (map fst results) + s = sum (map rec classes) / genericLength classes + in s + where + rec a = + let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results + y = genericLength $ filter (\(c, (r, _)) -> c == a) results + in t / y + + precision :: [(Int, (Int, Double))] -> Double + precision results = + let classes = ordNub (map fst results) + s = sum (map prec classes) / genericLength classes + in s + where + prec a = + let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results + y = genericLength $ filter (\(c, (r, _)) -> r == a) results + in + if y == 0 + then 0 + else t / y + + fmeasure :: [(Int, (Int, Double))] -> Double + fmeasure results = + let r = recall results + p = precision results + in (2 * p * r) / (p + r) + + ngram :: Int -> [Document] -> [Document] + ngram n documents = + map (\(Document text c) -> Document (ngramText n text) c) documents + + ngramText :: Int -> String -> String + ngramText n text = + let ws = words text + pairs = zip [0..] ws + grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . 
drop (i+1)) ws)) pairs + in unwords ("_":grams) diff --git a/src/Sibe/NaiveBayes.hs b/src/Sibe/NaiveBayes.hs index 7afe1f6..85657c4 100644 --- a/src/Sibe/NaiveBayes.hs +++ b/src/Sibe/NaiveBayes.hs @@ -1,7 +1,7 @@ module Sibe.NaiveBayes (Document(..), NB(..), - train, + initialize, run, session, ordNub, @@ -19,21 +19,13 @@ module Sibe.NaiveBayes removeStopwords, ) where + import Sibe.NLP import Data.List import Debug.Trace import qualified Data.Set as Set import Data.List.Split import Data.Maybe import Control.Arrow ((&&&)) - import Text.Regex.PCRE - import Data.Char (isSpace, isNumber, toLower) - import NLP.Stemmer - - type Class = Int; - - data Document = Document { text :: String - , c :: Class - } deriving (Eq, Show, Read) data NB = NB { documents :: [Document] , classes :: [(Class, Double)] @@ -44,8 +36,8 @@ module Sibe.NaiveBayes , cgram :: [(Class, [(String, Int)])] } deriving (Eq, Show, Read) - train :: [Document] -> [Class] -> NB - train documents classes = + initialize :: [Document] -> [Class] -> NB + initialize documents classes = let megadoc = concatDocs documents vocabulary = genericLength ((ordNub . words) megadoc) -- (class, prior probability) @@ -83,17 +75,6 @@ module Sibe.NaiveBayes classWordsCounts x = wordsCount (classWords x) (classVocabulary x) classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x) - ngram :: Int -> [Document] -> [Document] - ngram n documents = - map (\(Document text c) -> Document (ngramText n text) c) documents - - ngramText :: Int -> String -> String - ngramText n text = - let ws = words text - pairs = zip [0..] ws - grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs - in unwords ("_":grams) - session :: [Document] -> NB -> [(Class, (Class, Double))] session docs nb = let results = map (\(Document text c) -> (c, run text nb)) docs @@ -143,91 +124,5 @@ module Sibe.NaiveBayes variance = sum (map ((^2) . subtract avg) x) / (genericLength x - 1) in sqrt variance - cleanText :: String -> String - cleanText string = - let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string) - spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r'] - stemmed = unwords $ map (stem Porter) (words spacify) - nonumber = filter (not . isNumber) stemmed - lower = map toLower nonumber - in (unwords . words) lower -- remove unnecessary spaces - where - trim = f . f - where - f = reverse . 
dropWhile isSpace - replace needle replacement = - map (\c -> if c == needle then replacement else c) - - cleanDocuments :: [Document] -> [Document] - cleanDocuments documents = - let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents - in cleaned - - removeWords :: [String] -> [Document] -> [Document] - removeWords ws documents = - map (\(Document text c) -> Document (rm ws text) c) documents - where - rm list text = - unwords $ filter (`notElem` list) (words text) - - removeStopwords :: Int -> [Document] -> [Document] - removeStopwords i documents = - let wc = wordCounts (concatDocs documents) - wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc - stopwords = map fst (take i wlist) - in removeWords stopwords documents - where - vocabulary x = ordNub (words x) - countWordInDoc d w = genericLength (filter (==w) d) - wordCounts x = - let voc = vocabulary x - in zip voc $ map (countWordInDoc (words x)) voc - - concatDocs = concatMap (\(Document text _) -> text ++ " ") - l :: (Show a) => a -> a l a = trace (show a) a - - ordNub :: (Ord a) => [a] -> [a] - ordNub = go Set.empty - where - go _ [] = [] - go s (x:xs) = if x `Set.member` s then go s xs - else x : go (Set.insert x s) xs - - accuracy :: [(Int, (Int, Double))] -> Double - accuracy results = - let pairs = map (\(a, b) -> (a, fst b)) results - correct = filter (uncurry (==)) pairs - in genericLength correct / genericLength results - - recall :: [(Int, (Int, Double))] -> Double - recall results = - let classes = ordNub (map fst results) - s = sum (map rec classes) / genericLength classes - in s - where - rec a = - let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results - y = genericLength $ filter (\(c, (r, _)) -> c == a) results - in t / y - - precision :: [(Int, (Int, Double))] -> Double - precision results = - let classes = ordNub (map fst results) - s = sum (map prec classes) / genericLength classes - in s - where - prec a = - let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results - y = genericLength $ filter (\(c, (r, _)) -> r == a) results - in - if y == 0 - then 0 - else t / y - - fmeasure :: [(Int, (Int, Double))] -> Double - fmeasure results = - let r = recall results - p = precision results - in (2 * p * r) / (p + r) diff --git a/stack.yaml b/stack.yaml index d02fe3e..5960cdb 100644 --- a/stack.yaml +++ b/stack.yaml @@ -1,40 +1,5 @@ -# This file was automatically generated by 'stack init' -# -# Some commonly used options have been documented as comments in this file. -# For advanced use and comprehensive documentation of the format, please see: -# http://docs.haskellstack.org/en/stable/yaml_configuration/ - -# Resolver to choose a 'specific' stackage snapshot or a compiler version. -# A snapshot resolver dictates the compiler version and the set of packages -# to be used for project dependencies. For example: -# -# resolver: lts-3.5 -# resolver: nightly-2015-09-21 -# resolver: ghc-7.10.2 -# resolver: ghcjs-0.1.0_ghc-7.10.2 -# resolver: -# name: custom-snapshot -# location: "./custom-snapshot.yaml" resolver: lts-6.7 -# User packages to be built. -# Various formats can be used as shown in the example below. 
-# -# packages: -# - some-directory -# - https://example.com/foo/bar/baz-0.0.2.tar.gz -# - location: -# git: https://github.com/commercialhaskell/stack.git -# commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a -# - location: https://github.com/commercialhaskell/stack/commit/e7b331f14bcffb8367cd58fbfc8b40ec7642100a -# extra-dep: true -# subdirs: -# - auto-update -# - wai -# -# A package marked 'extra-dep: true' will only be built if demanded by a -# non-dependency (i.e. a user package), and its test suites and benchmarks -# will not be run. This is useful for tweaking upstream packages. packages: - location: git: git@github.com:albertoruiz/hmatrix.git @@ -42,36 +7,11 @@ packages: subdirs: - packages/base - . -- http://hackage.haskell.org/package/containers-0.5.7.1/containers-0.5.7.1.tar.gz -- http://hackage.haskell.org/package/text-1.2.2.1/text-1.2.2.1.tar.gz -- http://hackage.haskell.org/package/stemmer-0.5.2/stemmer-0.5.2.tar.gz -# Dependency packages to be pulled from upstream that are not in the resolver -# (e.g., acme-missiles-0.3) -extra-deps: [] - -# Override default flag values for local packages and extra-deps -flags: {} - -# Extra package databases containing global packages -extra-package-dbs: [] - -# Control whether we use the GHC we find on the path -# system-ghc: true -# -# Require a specific version of stack, using version ranges -# require-stack-version: -any # Default -# require-stack-version: ">=1.1" -# -# Override the architecture used by stack, especially useful on Windows -# arch: i386 -# arch: x86_64 -# -# Extra directories used by stack for building -# extra-include-dirs: [/path/to/dir] -# extra-lib-dirs: [/path/to/dir] -# -# Allow a newer minor version of GHC than the snapshot specifies -# compiler-check: newer-minor - -system-ghc: false +extra-deps: + - directory-1.2.7.0 + - text-1.2.2.1 + - stemmer-0.5.2 + - containers-0.5.7.1 + - Chart-1.8 + - Chart-cairo-1.8
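
Review note: the central API change in this patch replaces the old entry point, session inputs network labels alpha (iterations, epochs), with a Session record driven by run. Below is a minimal usage sketch distilled from examples/xor.hs above; every name comes from Sibe's API as exercised by the examples in this diff, nothing here is new. (Aside: xor.hs shrinks the hidden layer to two units but keeps the old "8 nodes in a single hidden layer" comment, which is worth updating.)

    module Main where
      import Sibe
      import Numeric.LinearAlgebra
      import Data.Default.Class

      main :: IO ()
      main = do
        let act = (sigmoid, sigmoid')
            -- 2 inputs, one hidden layer of 2 units, 1 output
            net = randomNetwork 0 (-1, 1) 2 [(2, act)] (1, act)
            samples = zip [vector [0, 1], vector [1, 0]]
                          [vector [1],    vector [1]]
            sess = def { network  = net
                       , training = samples
                       , test     = samples
                       } :: Session
        trained <- run gd sess            -- full-batch gradient descent
        print (crossEntropy trained)      -- cost over the test pairs

The same driver composes with the pure decay step, as notmnist.hs does: run (sgd . learningRateDecay (1.1, 5e-2)) sess type-checks because learningRateDecay returns a plain Session that sgd then consumes in IO (set a non-zero batchSize first, since sgd divides by it; the default is 0). One detail worth a second look: run folds over [0..epochs session], which performs epochs + 1 passes; [1..epochs session] would match the field's name.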
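
Review note: in src/Sibe.hs, gd binds seed <- newStdGen and shuffled <- shuffleM pairs but then folds over the original pairs, so neither binding has any effect. If a fresh shuffle per epoch is the intent (as the binding suggests), the fold should consume shuffled; a sketch of the one-line fix in context, using only names already imported in that module:

    gd :: Session -> IO Session
    gd session = do
      let alpha = learningRate session
          net   = network session
      shuffled <- shuffleM (training session)   -- actually use the shuffle
      let newnet = foldl' (\n (input, label) -> train input n label alpha)
                          net shuffled
      return session { network = newnet
                     , epoch   = epoch session + 1
                     }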
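
Review note: the patch replaces the old softplus-style relu with relu = cmap (max 0.1), which clamps every activation below 0.1 up to 0.1, while the new relu' reports a zero derivative only for x < 0; on 0 <= x < 0.1 the function is constant but its stated derivative is 1. If the 0.1 floor is not an intentional experiment, the conventional rectifier consistent with relu' only changes the constant:

    import Numeric.LinearAlgebra (Vector, cmap)

    relu :: Vector Double -> Vector Double
    relu = cmap (max 0)   -- zero for x < 0, identity otherwise; matches relu'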
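
Review note: the new softmax computes exp a / sum (exp x) on the raw logits, which overflows for large inputs (exp 710 is already Infinity for Double). A numerically safer variant, offered as a suggestion rather than what the diff ships, shifts by the maximum first; the result is mathematically identical:

    import Numeric.LinearAlgebra (Vector, cmap, maxElement, scalar)
    import qualified Data.Vector.Storable as V

    softmax :: Vector Double -> Vector Double
    softmax x = cmap (/ s) e
      where
        e = exp (x - scalar (maxElement x))  -- largest exponent is now 0
        s = V.sum e

Relatedly, one :: a -> Double cannot serve as the derivative half of an Activation, which needs a Vector Double -> Vector Double; notmnist.hs compiles only because its o = (softmax, one) binding is never used. Giving one the type Vector Double -> Vector Double with one = cmap (const 1) would make the pair usable.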
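
Review note: accuracy rounds every output component and compares whole vectors, so a softmax-style output such as [0.35, 0.3, ...] rounds to all zeros and can never equal a one-hot label. For one-hot targets the usual measure compares argmaxes instead; a sketch against the Session API above (accuracyArgmax is a hypothetical name, not part of this diff):

    import Sibe
    import Data.List (genericLength)
    import Numeric.LinearAlgebra (maxIndex)

    accuracyArgmax :: Session -> Double
    accuracyArgmax sess =
      let preds  = map (maxIndex . (`forward` sess) . fst) (test sess)
          truths = map (maxIndex . snd) (test sess)
          hits   = filter id (zipWith (==) preds truths)
      in  genericLength hits / genericLength preds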