feat(rnn): recurrent neural networks, experimental

WIP: runs out of memory quickly
2016-10-25 20:23:55 +03:30
parent 44f2ae372a
commit 728df02fbd
7 changed files with 20359 additions and 7 deletions
--- a/examples/recurrent.hs
+++ b/examples/recurrent.hs
@@ -0,0 +1,65 @@
 {-# LANGUAGE RecordWildCards #-}
 {-# LANGUAGE FlexibleContexts #-}
 {-# LANGUAGE ScopedTypeVariables #-}
 module Main where
  import Numeric.LinearAlgebra
  import Numeric.Sibe.Recurrent
  import Numeric.Sibe.Utils
  import System.IO
  import Data.Default.Class
  import Data.List (genericLength)
  import qualified Data.ByteString.Lazy.Char8 as BSL
  main = do
    texts <- lines <$> readFile "examples/reddit.csv"
    let (vocabulary, indexes) = processData texts
    let settings = def { wordD = length vocabulary }
        r = randomRecurrent 0 settings
    let x0 = reverse . drop 1 . reverse $ indexes !! 0
        y0 = drop 1 $ indexes !! 0
    print $ x0
    print $ y0
    let xs = map (reverse . drop 1 . reverse) indexes
        ys = map (drop 1) indexes
    let tov = fromList . map fromIntegral
    let vys = map tov ys
    let newr = sgd r (take 1 xs) (take 1 vys) 0.005 1
    saveRecurrent "recurrent.trained" (show newr) 512
    --writeFile "recurrent.trained" (show newr)
    let newpredicted = predict newr x0
    print $ y0
    print $ newpredicted
    print $ loss (tov y0) (tov newpredicted)
    {-let (dU, dV, dW) = backprop r x0 (fromList $ map fromIntegral y0)-}
    {-print $ seq u "u"-}
    {-print $ seq v "v"-}
    {-print $ seq w "w"-}
    --print $ dW
    print "done"
  saveRecurrent :: FilePath -> String -> Int -> IO ()
  saveRecurrent path str chunkSize = do
    handle <- openFile path AppendMode
    hSetBuffering handle NoBuffering
    loop handle str
    hClose handle
    where
      loop _ [] = return ()
      loop handle s = do
        hPutStr handle $ take chunkSize s
        hFlush handle
        putStr $ take chunkSize s
        loop handle $ drop chunkSize s
--- a/examples/reddit.csv
+++ b/examples/reddit.csv
--- a/examples/word2vec.hs
+++ b/examples/word2vec.hs
@@ -31,7 +31,7 @@ module Main where
    setStdGen (mkStdGen 100)
    sws <- lines <$> readFile "examples/stopwords"
-    -- real data, takes a lot of time to train
+    -- real data, currently faces a memory problem
    {-ds <- do-}
        {-files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
        {-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}
--- a/sibe.cabal
+++ b/sibe.cabal
@@ -1,5 +1,5 @@
 name:                sibe
-version:             0.2.0.0
+version:             0.2.0.1
 synopsis:            Machine Learning algorithms
 description:         Haskell Machine Learning
 homepage:            https://github.com/mdibaiee/sibe
@@ -15,7 +15,13 @@ cabal-version:       >=1.10
 library
  hs-source-dirs:      src
-  exposed-modules:     Numeric.Sibe, Numeric.Sibe.NaiveBayes, Numeric.Sibe.NLP, Numeric.Sibe.Word2Vec, Numeric.Sibe.Utils
+  exposed-modules:     Numeric.Sibe,
                       Numeric.Sibe.NaiveBayes,
                       Numeric.Sibe.NLP,
                       Numeric.Sibe.Word2Vec,
                       Numeric.Sibe.Utils,
                       Numeric.Sibe.Recurrent
  build-depends:       base >= 4.7 && < 5
                     , hmatrix
                     , random
@@ -29,8 +35,8 @@ library
                     , vector
                     , random-shuffle
                     , data-default-class
-                     , Chart
+                     , Chart >= 1.8 && < 2
-                     , Chart-cairo
+                     , Chart-cairo >= 1.8 && < 2
                     , lens
  default-language:    Haskell2010
@@ -58,6 +64,21 @@ executable example-word2vec
                     , random
  default-language:    Haskell2010
 executable example-recurrent
  hs-source-dirs:      examples
  main-is:             recurrent.hs
  ghc-options:         -threaded -rtsopts -with-rtsopts=-N -O2
  build-depends:       base
                     , sibe
                     , hmatrix
                     , data-default-class
                     , split
                     , vector
                     , directory
                     , random
                     , bytestring
  default-language:    Haskell2010
 executable example-424
  hs-source-dirs:      examples
  main-is:             424encoder.hs
--- a/src/Numeric/Sibe/NaiveBayes.hs
+++ b/src/Numeric/Sibe/NaiveBayes.hs
@@ -112,8 +112,8 @@ module Numeric.Sibe.NaiveBayes
          -- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
          in realToFrac (tct + 1) / realToFrac (cvoc + voc)
-    argmax :: (Ord a) => [a] -> Int
+    {-argmax :: (Ord a) => [a] -> Int-}
-    argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
+    {-argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)-}
    mean :: [Double] -> Double
    mean x = sum x / genericLength x
--- a/src/Numeric/Sibe/Recurrent.hs
+++ b/src/Numeric/Sibe/Recurrent.hs
@@ -0,0 +1,144 @@
 {-# LANGUAGE GADTs #-}
 {-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE DataKinds #-}
 {-# LANGUAGE TypeOperators #-}
 module Numeric.Sibe.Recurrent
  ( Recurrent (..)
  , randomRecurrent
  , processData
  , forward
  , predict
  , loss
  , backprop
  , sgd
  ) where
    import Numeric.LinearAlgebra
    import System.Random
    import System.Random.Shuffle
    import Debug.Trace
    import qualified Data.List as L
    import Data.Maybe
    import System.IO
    import Control.DeepSeq
    import Control.Monad
    import qualified Data.Vector.Storable as V
    import Data.Default.Class
    import qualified Graphics.Rendering.Chart.Easy as Chart
    import Graphics.Rendering.Chart.Backend.Cairo
    import Numeric.Sibe.Utils
    import Debug.Trace
    processData :: [String] -> ([(Int, String)], [[Int]])
    processData x =
      let setokens = map (\a -> " <start> " ++ a ++ " <end> ") x
          tokenized = map tokenize setokens
          vocabulary = zip [0..] (unique . concat $ tokenized)
          indexes = map (\a -> fst . fromJust $ L.find ((==a) . snd) vocabulary)
      in (vocabulary, map indexes tokenized)
    data Recurrent = Recurrent { bpttThreshold :: Int
                               , wordD :: Int
                               , hiddenD :: Int
                               , u :: Matrix Double
                               , v :: Matrix Double
                               , w :: Matrix Double
                               } deriving (Show, Read)
    instance Default Recurrent where
      def = Recurrent { bpttThreshold = 3
                      , hiddenD = 100
                      }
    randomRecurrent :: Seed -> Recurrent -> Recurrent
    randomRecurrent seed r = r { u = randomMatrix (wordD r, hiddenD r) (bounds $ wordD r)
                               , v = randomMatrix (hiddenD r, wordD r) (bounds $ hiddenD r)
                               , w = randomMatrix (hiddenD r, hiddenD r) (bounds $ hiddenD r)
                               }
      where
        randomMatrix (wr, wc) (l, u) = uniformSample (seed + wr + wc) wr $ replicate wc (l, u)
        bounds x = (negate . sqrt $ 1 / fromIntegral x, sqrt $ 1 / fromIntegral x)
    forward :: Recurrent -> [Int] -> (Matrix Double, Matrix Double)
    forward r input = 
      let (h, o) = helper [vector (replicate (hiddenD r) 0)] [] input
      in (fromRows h, fromRows o)
      where
        helper hs os [] = (hs, os)
        helper (h:hs) os (i:is) =
          let k = w r #> h
              newh = V.map tanh $ (u r ! i) + k
              o = softmax $ newh <# v r
          in helper (newh:h:hs) (o:os) is
    predict :: Recurrent -> [Int] -> [Int]
    predict r i = 
      let (_, o) = forward r i
      in map argmax (toLists o)
    backprop :: Recurrent -> [Int] -> Vector Double -> (Matrix Double, Matrix Double, Matrix Double)
    backprop r input y =
      let dU = zero (u r)
          dV = zero (v r)
          dW = zero (w r)
      in bp dU dV dW (zip [0..] input)
      where
        (hs, os) = forward r input
        -- delta
        dO = fromColumns $ zipWith (\i o -> if i `V.elem` y then o - 1 else o) [0..] (toColumns os)
        bp dU dV dW [] = (dU, dV, dW)
        bp dU dV dW ((i,x):xs) =
          let ndV = dV + (hs ! i) `outer` (dO ! i)
              dT = (v r) #> (dO ! i) -- * (1 - (hs ! i)^2)
              threshold = bpttThreshold r
              (ndU, ndW) = tt dU dW dT [max 0 (i-threshold)..i]
           in bp ndU ndV ndW xs
           where
             tt dU dW dT [] = (dU, dW)
             tt dU dW dT (c:cs) =
               let ndW = dW + (dT `outer` (hs ! (max 0 $ c - 1)))
                   zdT = vector $ replicate (V.length dT) 0
                   mdT = fromRows $ replicate (max 0 $ c - 1) zdT ++ [dT] ++ replicate (min (rows dU - 1) $ rows dU - c) zdT
                   ndU = dU + mdT
                   ndT = (w r) #> dT
               in tt ndU ndW ndT cs
        zero m = ((rows m)><(cols m)) $ repeat 0
    {-gradientCheck :: Recurrent -> [Int] -> Vector Double -> Double-}
    sgdStep :: Recurrent -> [Int] -> Vector Double -> Double -> Recurrent
    sgdStep r input y learningRate =
      let (dU, dV, dW) = backprop r input y
      in r { u = (u r) - scale learningRate dU
           , v = (v r) - scale learningRate dV
           , w = (w r) - scale learningRate dW
           }
    sgd :: Recurrent -> [[Int]] -> [Vector Double] -> Double -> Int -> Recurrent
    sgd r input y learningRate epochs = run [0..epochs] r
      where
        run [] r = r
        run (i:is) r = run is $ train (zip input y) r
        train [] r = r
        train ((x, y):xs) r = train xs $ sgdStep r x y learningRate
    softmax :: Vector Double -> Vector Double
    softmax x = cmap (\a -> exp a / s) x
      where
        s = V.sum $ exp x
    softmax' :: Vector Double -> Vector Double
    softmax' = cmap (\a -> sig a * (1 - sig a))
      where
        sig x = 1 / max (1 + exp (-x)) 1e-10
    -- cross-entropy
    loss :: Vector Double -> Vector Double -> Double
    loss ys os = (-1 / fromIntegral (V.length os)) * V.sum (V.zipWith f os ys)
      where
        f a y = y * log (max 1e-10 a)
--- a/src/Numeric/Sibe/Utils.hs
+++ b/src/Numeric/Sibe/Utils.hs
@@ -4,10 +4,19 @@ module Numeric.Sibe.Utils
  , onehot
  , average
  , pca
  , tokenize
  , frequency
  , unique
  , argmax
  , shape
  ) where
    import qualified Data.Vector.Storable as V
    import qualified Data.Set as Set
    import Numeric.LinearAlgebra
    import Data.List.Split
    import Data.Char (isSpace, isNumber, toLower)
    import Control.Arrow ((&&&))
    import Data.List
    similarity :: Vector Double -> Vector Double -> Double
    similarity a b = (V.sum $ a * b) / (magnitude a * magnitude b)
@@ -24,6 +33,8 @@ module Numeric.Sibe.Utils
        go _ [] = []
        go s (x:xs) = if x `Set.member` s then go s xs
                                          else x : go (Set.insert x s) xs
    unique :: (Ord a) => [a] -> [a]
    unique = ordNub
    average :: Vector Double -> Vector Double
    average v = cmap (/ (V.sum v)) v
@@ -39,3 +50,27 @@ module Numeric.Sibe.Utils
          diagS = diagRect 0 s (rows mat) (cols mat)
      in u ?? (All, Take d) <> diagS ?? (Take d, Take d)
    tokenize :: String -> [String]
    tokenize str = 
      let spaced = spacify str
          ws = words spaced
      in ws
      where
        puncs = ['!', '"', '#', '$', '%', '(', ')', '.', '?', ',', '\'', '/', '-']
        replace needle replacement =
          concatMap (\c -> if c == needle then replacement else c)
        spacify = foldl (\acc c -> if c `elem` puncs then acc ++ [' ', c, ' '] else acc ++ [c]) ""
    frequency :: (Ord a) => [a] -> [(a, Int)]
    frequency = map (head &&& length) . group . sort
    argmax :: (Foldable t, Num a, Fractional a, Ord a) => t a -> Int
    argmax v = snd $ foldl mx ((-1/0), 0) v
      where
        mx (a, i) b
          | b > a = (b, i + 1)
          | otherwise = (a, i)
    shape :: Matrix a -> (Int, Int)
    shape x = (rows x, cols x)