feat(rnn): recurrent neural networks, experimental

WIP: runs out of memory quickly
This commit is contained in:
Mahdi Dibaiee 2016-10-25 20:23:55 +03:30
parent 44f2ae372a
commit 728df02fbd
7 changed files with 20359 additions and 7 deletions

65
examples/recurrent.hs Normal file
View File

@ -0,0 +1,65 @@
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Main where
import Numeric.LinearAlgebra
import Numeric.Sibe.Recurrent
import Numeric.Sibe.Utils
import System.IO
import Data.Default.Class
import Data.List (genericLength)
import qualified Data.ByteString.Lazy.Char8 as BSL
main = do
texts <- lines <$> readFile "examples/reddit.csv"
let (vocabulary, indexes) = processData texts
let settings = def { wordD = length vocabulary }
r = randomRecurrent 0 settings
let x0 = reverse . drop 1 . reverse $ indexes !! 0
y0 = drop 1 $ indexes !! 0
print $ x0
print $ y0
let xs = map (reverse . drop 1 . reverse) indexes
ys = map (drop 1) indexes
let tov = fromList . map fromIntegral
let vys = map tov ys
let newr = sgd r (take 1 xs) (take 1 vys) 0.005 1
saveRecurrent "recurrent.trained" (show newr) 512
--writeFile "recurrent.trained" (show newr)
let newpredicted = predict newr x0
print $ y0
print $ newpredicted
print $ loss (tov y0) (tov newpredicted)
{-let (dU, dV, dW) = backprop r x0 (fromList $ map fromIntegral y0)-}
{-print $ seq u "u"-}
{-print $ seq v "v"-}
{-print $ seq w "w"-}
--print $ dW
print "done"
saveRecurrent :: FilePath -> String -> Int -> IO ()
saveRecurrent path str chunkSize = do
handle <- openFile path AppendMode
hSetBuffering handle NoBuffering
loop handle str
hClose handle
where
loop _ [] = return ()
loop handle s = do
hPutStr handle $ take chunkSize s
hFlush handle
putStr $ take chunkSize s
loop handle $ drop chunkSize s

20087
examples/reddit.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -31,7 +31,7 @@ module Main where
setStdGen (mkStdGen 100)
sws <- lines <$> readFile "examples/stopwords"
-- real data, takes a lot of time to train
-- real data, currently faces a memory problem
{-ds <- do-}
{-files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
{-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}

View File

@ -1,5 +1,5 @@
name: sibe
version: 0.2.0.0
version: 0.2.0.1
synopsis: Machine Learning algorithms
description: Haskell Machine Learning
homepage: https://github.com/mdibaiee/sibe
@ -15,7 +15,13 @@ cabal-version: >=1.10
library
hs-source-dirs: src
exposed-modules: Numeric.Sibe, Numeric.Sibe.NaiveBayes, Numeric.Sibe.NLP, Numeric.Sibe.Word2Vec, Numeric.Sibe.Utils
exposed-modules: Numeric.Sibe,
Numeric.Sibe.NaiveBayes,
Numeric.Sibe.NLP,
Numeric.Sibe.Word2Vec,
Numeric.Sibe.Utils,
Numeric.Sibe.Recurrent
build-depends: base >= 4.7 && < 5
, hmatrix
, random
@ -29,8 +35,8 @@ library
, vector
, random-shuffle
, data-default-class
, Chart
, Chart-cairo
, Chart >= 1.8 && < 2
, Chart-cairo >= 1.8 && < 2
, lens
default-language: Haskell2010
@ -58,6 +64,21 @@ executable example-word2vec
, random
default-language: Haskell2010
executable example-recurrent
hs-source-dirs: examples
main-is: recurrent.hs
ghc-options: -threaded -rtsopts -with-rtsopts=-N -O2
build-depends: base
, sibe
, hmatrix
, data-default-class
, split
, vector
, directory
, random
, bytestring
default-language: Haskell2010
executable example-424
hs-source-dirs: examples
main-is: 424encoder.hs

View File

@ -112,8 +112,8 @@ module Numeric.Sibe.NaiveBayes
-- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
in realToFrac (tct + 1) / realToFrac (cvoc + voc)
argmax :: (Ord a) => [a] -> Int
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
{-argmax :: (Ord a) => [a] -> Int-}
{-argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)-}
mean :: [Double] -> Double
mean x = sum x / genericLength x

View File

@ -0,0 +1,144 @@
{-# LANGUAGE GADTs #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TypeOperators #-}
module Numeric.Sibe.Recurrent
( Recurrent (..)
, randomRecurrent
, processData
, forward
, predict
, loss
, backprop
, sgd
) where
import Numeric.LinearAlgebra
import System.Random
import System.Random.Shuffle
import Debug.Trace
import qualified Data.List as L
import Data.Maybe
import System.IO
import Control.DeepSeq
import Control.Monad
import qualified Data.Vector.Storable as V
import Data.Default.Class
import qualified Graphics.Rendering.Chart.Easy as Chart
import Graphics.Rendering.Chart.Backend.Cairo
import Numeric.Sibe.Utils
import Debug.Trace
processData :: [String] -> ([(Int, String)], [[Int]])
processData x =
let setokens = map (\a -> " <start> " ++ a ++ " <end> ") x
tokenized = map tokenize setokens
vocabulary = zip [0..] (unique . concat $ tokenized)
indexes = map (\a -> fst . fromJust $ L.find ((==a) . snd) vocabulary)
in (vocabulary, map indexes tokenized)
data Recurrent = Recurrent { bpttThreshold :: Int
, wordD :: Int
, hiddenD :: Int
, u :: Matrix Double
, v :: Matrix Double
, w :: Matrix Double
} deriving (Show, Read)
instance Default Recurrent where
def = Recurrent { bpttThreshold = 3
, hiddenD = 100
}
randomRecurrent :: Seed -> Recurrent -> Recurrent
randomRecurrent seed r = r { u = randomMatrix (wordD r, hiddenD r) (bounds $ wordD r)
, v = randomMatrix (hiddenD r, wordD r) (bounds $ hiddenD r)
, w = randomMatrix (hiddenD r, hiddenD r) (bounds $ hiddenD r)
}
where
randomMatrix (wr, wc) (l, u) = uniformSample (seed + wr + wc) wr $ replicate wc (l, u)
bounds x = (negate . sqrt $ 1 / fromIntegral x, sqrt $ 1 / fromIntegral x)
forward :: Recurrent -> [Int] -> (Matrix Double, Matrix Double)
forward r input =
let (h, o) = helper [vector (replicate (hiddenD r) 0)] [] input
in (fromRows h, fromRows o)
where
helper hs os [] = (hs, os)
helper (h:hs) os (i:is) =
let k = w r #> h
newh = V.map tanh $ (u r ! i) + k
o = softmax $ newh <# v r
in helper (newh:h:hs) (o:os) is
predict :: Recurrent -> [Int] -> [Int]
predict r i =
let (_, o) = forward r i
in map argmax (toLists o)
backprop :: Recurrent -> [Int] -> Vector Double -> (Matrix Double, Matrix Double, Matrix Double)
backprop r input y =
let dU = zero (u r)
dV = zero (v r)
dW = zero (w r)
in bp dU dV dW (zip [0..] input)
where
(hs, os) = forward r input
-- delta
dO = fromColumns $ zipWith (\i o -> if i `V.elem` y then o - 1 else o) [0..] (toColumns os)
bp dU dV dW [] = (dU, dV, dW)
bp dU dV dW ((i,x):xs) =
let ndV = dV + (hs ! i) `outer` (dO ! i)
dT = (v r) #> (dO ! i) -- * (1 - (hs ! i)^2)
threshold = bpttThreshold r
(ndU, ndW) = tt dU dW dT [max 0 (i-threshold)..i]
in bp ndU ndV ndW xs
where
tt dU dW dT [] = (dU, dW)
tt dU dW dT (c:cs) =
let ndW = dW + (dT `outer` (hs ! (max 0 $ c - 1)))
zdT = vector $ replicate (V.length dT) 0
mdT = fromRows $ replicate (max 0 $ c - 1) zdT ++ [dT] ++ replicate (min (rows dU - 1) $ rows dU - c) zdT
ndU = dU + mdT
ndT = (w r) #> dT
in tt ndU ndW ndT cs
zero m = ((rows m)><(cols m)) $ repeat 0
{-gradientCheck :: Recurrent -> [Int] -> Vector Double -> Double-}
sgdStep :: Recurrent -> [Int] -> Vector Double -> Double -> Recurrent
sgdStep r input y learningRate =
let (dU, dV, dW) = backprop r input y
in r { u = (u r) - scale learningRate dU
, v = (v r) - scale learningRate dV
, w = (w r) - scale learningRate dW
}
sgd :: Recurrent -> [[Int]] -> [Vector Double] -> Double -> Int -> Recurrent
sgd r input y learningRate epochs = run [0..epochs] r
where
run [] r = r
run (i:is) r = run is $ train (zip input y) r
train [] r = r
train ((x, y):xs) r = train xs $ sgdStep r x y learningRate
softmax :: Vector Double -> Vector Double
softmax x = cmap (\a -> exp a / s) x
where
s = V.sum $ exp x
softmax' :: Vector Double -> Vector Double
softmax' = cmap (\a -> sig a * (1 - sig a))
where
sig x = 1 / max (1 + exp (-x)) 1e-10
-- cross-entropy
loss :: Vector Double -> Vector Double -> Double
loss ys os = (-1 / fromIntegral (V.length os)) * V.sum (V.zipWith f os ys)
where
f a y = y * log (max 1e-10 a)

View File

@ -4,10 +4,19 @@ module Numeric.Sibe.Utils
, onehot
, average
, pca
, tokenize
, frequency
, unique
, argmax
, shape
) where
import qualified Data.Vector.Storable as V
import qualified Data.Set as Set
import Numeric.LinearAlgebra
import Data.List.Split
import Data.Char (isSpace, isNumber, toLower)
import Control.Arrow ((&&&))
import Data.List
similarity :: Vector Double -> Vector Double -> Double
similarity a b = (V.sum $ a * b) / (magnitude a * magnitude b)
@ -24,6 +33,8 @@ module Numeric.Sibe.Utils
go _ [] = []
go s (x:xs) = if x `Set.member` s then go s xs
else x : go (Set.insert x s) xs
unique :: (Ord a) => [a] -> [a]
unique = ordNub
average :: Vector Double -> Vector Double
average v = cmap (/ (V.sum v)) v
@ -39,3 +50,27 @@ module Numeric.Sibe.Utils
diagS = diagRect 0 s (rows mat) (cols mat)
in u ?? (All, Take d) <> diagS ?? (Take d, Take d)
tokenize :: String -> [String]
tokenize str =
let spaced = spacify str
ws = words spaced
in ws
where
puncs = ['!', '"', '#', '$', '%', '(', ')', '.', '?', ',', '\'', '/', '-']
replace needle replacement =
concatMap (\c -> if c == needle then replacement else c)
spacify = foldl (\acc c -> if c `elem` puncs then acc ++ [' ', c, ' '] else acc ++ [c]) ""
frequency :: (Ord a) => [a] -> [(a, Int)]
frequency = map (head &&& length) . group . sort
argmax :: (Foldable t, Num a, Fractional a, Ord a) => t a -> Int
argmax v = snd $ foldl mx ((-1/0), 0) v
where
mx (a, i) b
| b > a = (b, i + 1)
| otherwise = (a, i)
shape :: Matrix a -> (Int, Int)
shape x = (rows x, cols x)