feat(rnn): recurrent neural networks, experimental
WIP: runs out of memory quickly
This commit is contained in:
parent
44f2ae372a
commit
728df02fbd
65 examples/recurrent.hs Normal file
@@ -0,0 +1,65 @@
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}

module Main where

import Numeric.LinearAlgebra
import Numeric.Sibe.Recurrent
import Numeric.Sibe.Utils
import System.IO
import Data.Default.Class
import Data.List (genericLength)
import qualified Data.ByteString.Lazy.Char8 as BSL

main :: IO ()
main = do
  texts <- lines <$> readFile "examples/reddit.csv"
  let (vocabulary, indexes) = processData texts

  -- the vocabulary size fixes the input/output dimension of the network
  let settings = def { wordD = length vocabulary }
      r = randomRecurrent 0 settings

  -- x: a sentence minus its last token; y: the same sentence shifted left
  -- by one, so the target at every position is the next word
  let x0 = init (indexes !! 0)
      y0 = drop 1 (indexes !! 0)

  print x0
  print y0

  let xs = map init indexes
      ys = map (drop 1) indexes

  let tov = fromList . map fromIntegral
  let vys = map tov ys

  -- train on a single example for one epoch, learning rate 0.005
  let newr = sgd r (take 1 xs) (take 1 vys) 0.005 1

  saveRecurrent "recurrent.trained" (show newr) 512
  --writeFile "recurrent.trained" (show newr)
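  -- To map predicted indexes back to words, the vocabulary association
  -- list can be searched in the other direction; a minimal sketch,
  -- illustrative only (assumes `import Data.List (find)`):
  -- let decode = map (\i -> maybe "<unk>" snd (find ((== i) . fst) vocabulary))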

  let newpredicted = predict newr x0
  print y0
  print newpredicted

  -- compare targets against predictions (both as index vectors)
  print $ loss (tov y0) (tov newpredicted)

  {-let (dU, dV, dW) = backprop r x0 (fromList $ map fromIntegral y0)-}
  {-print $ seq u "u"-}
  {-print $ seq v "v"-}
  {-print $ seq w "w"-}

  --print $ dW
  print "done"

-- write a long String to disk in fixed-size chunks, so the whole
-- serialized network is never held in memory at once
saveRecurrent :: FilePath -> String -> Int -> IO ()
saveRecurrent path str chunkSize = do
  handle <- openFile path AppendMode
  hSetBuffering handle NoBuffering
  loop handle str
  hClose handle
  where
    loop _ [] = return ()
    loop handle s = do
      hPutStr handle $ take chunkSize s
      hFlush handle
      putStr $ take chunkSize s
      loop handle $ drop chunkSize s
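-- An alternative to the manual chunk loop is lazy ByteString output,
-- which also streams to disk; a sketch using the BSL import above:
-- saveRecurrent' :: FilePath -> String -> IO ()
-- saveRecurrent' path = BSL.writeFile path . BSL.pack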
20087 examples/reddit.csv Normal file

File diff suppressed because it is too large
@@ -31,7 +31,7 @@ module Main where
   setStdGen (mkStdGen 100)
   sws <- lines <$> readFile "examples/stopwords"

-  -- real data, takes a lot of time to train
+  -- real data, currently faces a memory problem
   {-ds <- do-}
   {-files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
   {-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}
29 sibe.cabal
@@ -1,5 +1,5 @@
 name:                sibe
-version:             0.2.0.0
+version:             0.2.0.1
 synopsis:            Machine Learning algorithms
 description:         Haskell Machine Learning
 homepage:            https://github.com/mdibaiee/sibe

@@ -15,7 +15,13 @@ cabal-version: >=1.10

 library
   hs-source-dirs:      src
-  exposed-modules:     Numeric.Sibe, Numeric.Sibe.NaiveBayes, Numeric.Sibe.NLP, Numeric.Sibe.Word2Vec, Numeric.Sibe.Utils
+  exposed-modules:     Numeric.Sibe,
+                       Numeric.Sibe.NaiveBayes,
+                       Numeric.Sibe.NLP,
+                       Numeric.Sibe.Word2Vec,
+                       Numeric.Sibe.Utils,
+                       Numeric.Sibe.Recurrent

   build-depends:       base >= 4.7 && < 5
                      , hmatrix
                      , random
@@ -29,8 +35,8 @@ library
                      , vector
                      , random-shuffle
                      , data-default-class
-                     , Chart
-                     , Chart-cairo
+                     , Chart >= 1.8 && < 2
+                     , Chart-cairo >= 1.8 && < 2
                      , lens
   default-language:    Haskell2010

@@ -58,6 +64,21 @@ executable example-word2vec
                      , random
   default-language:    Haskell2010

+executable example-recurrent
+  hs-source-dirs:      examples
+  main-is:             recurrent.hs
+  ghc-options:         -threaded -rtsopts -with-rtsopts=-N -O2
+  build-depends:       base
+                     , sibe
+                     , hmatrix
+                     , data-default-class
+                     , split
+                     , vector
+                     , directory
+                     , random
+                     , bytestring
+  default-language:    Haskell2010
+
 executable example-424
   hs-source-dirs:      examples
   main-is:             424encoder.hs
@@ -112,8 +112,8 @@ module Numeric.Sibe.NaiveBayes
     -- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
     in realToFrac (tct + 1) / realToFrac (cvoc + voc)

-  argmax :: (Ord a) => [a] -> Int
-  argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
+  {-argmax :: (Ord a) => [a] -> Int-}
+  {-argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)-}

   mean :: [Double] -> Double
   mean x = sum x / genericLength x
144 src/Numeric/Sibe/Recurrent.hs Normal file
@@ -0,0 +1,144 @@
{-# LANGUAGE GADTs #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TypeOperators #-}

module Numeric.Sibe.Recurrent
  ( Recurrent (..)
  , randomRecurrent
  , processData
  , forward
  , predict
  , loss
  , backprop
  , sgd
  ) where

import Numeric.LinearAlgebra
import System.Random
import System.Random.Shuffle
import Debug.Trace
import qualified Data.List as L
import Data.Maybe
import System.IO
import Control.DeepSeq
import Control.Monad
import qualified Data.Vector.Storable as V
import Data.Default.Class

import qualified Graphics.Rendering.Chart.Easy as Chart
import Graphics.Rendering.Chart.Backend.Cairo
import Numeric.Sibe.Utils

-- wrap each sentence in <start>/<end> markers, tokenize it, build a
-- vocabulary of the unique tokens, and map every sentence to word indexes
processData :: [String] -> ([(Int, String)], [[Int]])
processData x =
  let setokens = map (\a -> " <start> " ++ a ++ " <end> ") x
      tokenized = map tokenize setokens
      vocabulary = zip [0..] (unique . concat $ tokenized)
      indexes = map (\a -> fst . fromJust $ L.find ((== a) . snd) vocabulary)
  in (vocabulary, map indexes tokenized)
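-- e.g. processData ["hi there"] tokenizes to
-- ["<start>", "hi", "there", "<end>"], giving the vocabulary
-- [(0, "<start>"), (1, "hi"), (2, "there"), (3, "<end>")] and the
-- index sequences [[0, 1, 2, 3]]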

data Recurrent = Recurrent { bpttThreshold :: Int -- truncation depth for BPTT
                           , wordD :: Int         -- vocabulary size
                           , hiddenD :: Int       -- hidden layer size
                           , u :: Matrix Double   -- input weights, wordD x hiddenD
                           , v :: Matrix Double   -- output weights, hiddenD x wordD
                           , w :: Matrix Double   -- recurrent weights, hiddenD x hiddenD
                           } deriving (Show, Read)

-- the default record is deliberately partial: wordD, u, v and w must be
-- filled in (e.g. via randomRecurrent) before the network is used
instance Default Recurrent where
  def = Recurrent { bpttThreshold = 3
                  , hiddenD = 100
                  }
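-- usage sketch, assuming the defaults above: a network over an
-- 8000-word vocabulary with a 100-unit hidden layer:
--   let r = randomRecurrent 0 (def { wordD = 8000 })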

-- initialize each weight matrix uniformly in (-1/sqrt n, 1/sqrt n),
-- where n is the matrix's fan-in
randomRecurrent :: Seed -> Recurrent -> Recurrent
randomRecurrent seed r = r { u = randomMatrix (wordD r, hiddenD r) (bounds $ wordD r)
                           , v = randomMatrix (hiddenD r, wordD r) (bounds $ hiddenD r)
                           , w = randomMatrix (hiddenD r, hiddenD r) (bounds $ hiddenD r)
                           }
  where
    randomMatrix (wr, wc) (l, u) = uniformSample (seed + wr + wc) wr $ replicate wc (l, u)
    bounds x = (negate . sqrt $ 1 / fromIntegral x, sqrt $ 1 / fromIntegral x)

-- run the network over a sequence of word indexes; returns the hidden
-- states and the softmax outputs, one row per time step, in
-- chronological order
forward :: Recurrent -> [Int] -> (Matrix Double, Matrix Double)
forward r input =
  let (h, o) = helper [vector (replicate (hiddenD r) 0)] [] input
  in (fromRows h, fromRows o)
  where
    -- states are accumulated newest-first; drop the initial zero state
    -- and restore chronological order before returning
    helper hs os [] = (reverse (init hs), reverse os)
    helper (h:hs) os (i:is) =
      let k = w r #> h
          newh = V.map tanh $ (u r ! i) + k
          o = softmax $ newh <# v r
      in helper (newh:h:hs) (o:os) is
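-- per time step t, the recursion above computes
--   s_t = tanh (U[x_t] + W s_{t-1}),  s_{-1} = 0
--   o_t = softmax (s_t V)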

-- greedy decoding: at every step, predict the word index with the
-- highest output probability
predict :: Recurrent -> [Int] -> [Int]
predict r i =
  let (_, o) = forward r i
  in map argmax (toLists o)

-- backpropagation through time, truncated bpttThreshold steps back
backprop :: Recurrent -> [Int] -> Vector Double -> (Matrix Double, Matrix Double, Matrix Double)
backprop r input y =
  let dU = zero (u r)
      dV = zero (v r)
      dW = zero (w r)
  in bp dU dV dW (zip [0..] input)
  where
    (hs, os) = forward r input
    -- delta at the output layer: softmax output minus the one-hot
    -- target, one row per time step
    dO = fromRows $ zipWith (\t o -> o - oneAt (V.length o) (round $ y V.! t)) [0..] (toRows os)
    oneAt n j = V.generate n (\k -> if k == j then 1 else 0)

    bp dU dV dW [] = (dU, dV, dW)
    bp dU dV dW ((i,x):xs) =
      let ndV = dV + (hs ! i) `outer` (dO ! i)
          dT = (v r) #> (dO ! i) -- * (1 - (hs ! i)^2)
          threshold = bpttThreshold r
          -- walk backwards from step i, propagating the delta through w
          (ndU, ndW) = tt dU dW dT (reverse [max 0 (i-threshold)..i])
      in bp ndU ndV ndW xs
      where
        tt dU dW dT [] = (dU, dW)
        tt dU dW dT (c:cs) =
          let ndW = dW + (dT `outer` (hs ! (max 0 $ c - 1)))
              -- the input-weight gradient only touches the row of u
              -- belonging to the word seen at step c
              ndU = accum dU (+) [ ((input !! c, j), dT V.! j)
                                 | j <- [0 .. V.length dT - 1] ]
              ndT = (w r) #> dT
          in tt ndU ndW ndT cs

    zero m = ((rows m)><(cols m)) $ repeat 0
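-- per step i, and unroll step c inside the truncation window, the
-- updates above correspond to
--   dV += s_i ⊗ δo_i
--   dW += δ_c ⊗ s_{c-1},  dU[x_c] += δ_c,  δ_{c-1} = W δ_c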

{-gradientCheck :: Recurrent -> [Int] -> Vector Double -> Double-}
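-- a minimal sketch of how gradientCheck could look: perturb one entry
-- of u by ±eps and compare the centred difference of the loss against
-- the matching entry of backprop's dU; `gradientCheckU` and `oneHot`
-- are hypothetical names, `accum` and `atIndex` are hmatrix functions
-- gradientCheckU :: Recurrent -> [Int] -> Vector Double -> (Int, Int) -> Double
-- gradientCheckU r input y (i, j) =
--   let eps = 1e-4
--       bump d = r { u = accum (u r) (+) [((i, j), d)] }
--       oneHot n k = V.generate n (\z -> if z == k then 1 else 0)
--       lossAt m = let (_, o) = forward m input
--                  in sum [ loss (oneHot (cols o) (round $ y V.! t)) (o ! t)
--                         | t <- [0 .. rows o - 1] ]
--       numeric = (lossAt (bump eps) - lossAt (bump (-eps))) / (2 * eps)
--       (dU, _, _) = backprop r input y
--   in abs (numeric - atIndex dU (i, j))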
-- a single SGD update: one truncated-BPTT pass, then step each weight
-- matrix against its gradient
sgdStep :: Recurrent -> [Int] -> Vector Double -> Double -> Recurrent
sgdStep r input y learningRate =
  let (dU, dV, dW) = backprop r input y
  in r { u = (u r) - scale learningRate dU
       , v = (v r) - scale learningRate dV
       , w = (w r) - scale learningRate dW
       }

-- run sgdStep over every (sentence, target) pair, for the given number
-- of epochs
sgd :: Recurrent -> [[Int]] -> [Vector Double] -> Double -> Int -> Recurrent
sgd r input y learningRate epochs = run [1..epochs] r
  where
    run [] r = r
    run (i:is) r = run is $ train (zip input y) r

    train [] r = r
    train ((x, y):xs) r = train xs $ sgdStep r x y learningRate

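-- the System.Random.Shuffle import above is currently unused; one way
-- to use it would be reshuffling the training pairs each epoch
-- (sketch; shuffle' is that module's export):
-- run (i:is) r = run is $ train (shuffle' ps (length ps) (mkStdGen i)) r
--   where ps = zip input y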
-- numerically stable softmax: shift by the maximum before exponentiating
softmax :: Vector Double -> Vector Double
softmax x = cmap (\a -> exp (a - m) / s) x
  where
    m = V.maximum x
    s = V.sum $ cmap (\a -> exp (a - m)) x

-- elementwise logistic derivative (not the full softmax Jacobian);
-- currently unused
softmax' :: Vector Double -> Vector Double
softmax' = cmap (\a -> sig a * (1 - sig a))
  where
    sig x = 1 / (1 + exp (-x))
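-- e.g. softmax (vector [1, 2, 3]) ≈ vector [0.0900, 0.2447, 0.6652]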

-- average cross-entropy between a target distribution ys and the
-- network output os; outputs are clamped at 1e-10 to avoid log 0
loss :: Vector Double -> Vector Double -> Double
loss ys os = (-1 / fromIntegral (V.length os)) * V.sum (V.zipWith f os ys)
  where
    f a y = y * log (max 1e-10 a)
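-- e.g. loss (vector [0, 1, 0]) (softmax (vector [1, 2, 3]))
--        == (-1/3) * log 0.2447 ≈ 0.4694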

@@ -4,10 +4,19 @@ module Numeric.Sibe.Utils
   , onehot
   , average
   , pca
+  , tokenize
+  , frequency
+  , unique
+  , argmax
+  , shape
   ) where
   import qualified Data.Vector.Storable as V
   import qualified Data.Set as Set
   import Numeric.LinearAlgebra
+  import Data.List.Split
+  import Data.Char (isSpace, isNumber, toLower)
+  import Control.Arrow ((&&&))
+  import Data.List

   similarity :: Vector Double -> Vector Double -> Double
   similarity a b = (V.sum $ a * b) / (magnitude a * magnitude b)

@@ -24,6 +33,8 @@ module Numeric.Sibe.Utils
       go _ [] = []
       go s (x:xs) = if x `Set.member` s then go s xs
                     else x : go (Set.insert x s) xs
+  unique :: (Ord a) => [a] -> [a]
+  unique = ordNub

   average :: Vector Double -> Vector Double
   average v = cmap (/ (V.sum v)) v

@@ -39,3 +50,27 @@ module Numeric.Sibe.Utils
       diagS = diagRect 0 s (rows mat) (cols mat)

     in u ?? (All, Take d) <> diagS ?? (Take d, Take d)

+  -- split a string into words, treating punctuation marks as tokens of
+  -- their own
+  tokenize :: String -> [String]
+  tokenize str =
+    let spaced = spacify str
+        ws = words spaced
+    in ws
+    where
+      puncs = ['!', '"', '#', '$', '%', '(', ')', '.', '?', ',', '\'', '/', '-']
+      replace needle replacement =
+        concatMap (\c -> if c == needle then replacement else [c])
+      spacify = foldl (\acc c -> if c `elem` puncs then acc ++ [' ', c, ' '] else acc ++ [c]) ""
+
+  -- count the occurrences of each distinct element
+  frequency :: (Ord a) => [a] -> [(a, Int)]
+  frequency = map (head &&& length) . group . sort
+
+  -- index of the largest element
+  argmax :: (Foldable t, Num a, Fractional a, Ord a) => t a -> Int
+  argmax v = best $ foldl mx (-1/0, 0, 0) v
+    where
+      mx (a, bi, i) b
+        | b > a     = (b, i, i + 1)
+        | otherwise = (a, bi, i + 1)
+      best (_, bi, _) = bi
+
+  shape :: Matrix a -> (Int, Int)
+  shape x = (rows x, cols x)
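  -- worked examples for the helpers above:
  --   tokenize "hi, there!"     == ["hi", ",", "there", "!"]
  --   frequency ["a", "b", "a"] == [("a", 2), ("b", 1)]
  --   argmax [0.1, 0.7, 0.2]    == 1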