Compare commits
10 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
d22d91bd6a | ||
|
83f9ff1c44 | ||
|
fd987ad490 | ||
|
172875e4c2 | ||
|
58d62a06fc | ||
|
854bfe5805 | ||
|
b382d63994 | ||
|
b7b62223d6 | ||
|
94e11e5b7c | ||
|
fa84c7efe9 |
6
LICENSE
6
LICENSE
@ -631,8 +631,8 @@ to attach them to the start of each source file to most effectively
|
|||||||
state the exclusion of warranty; and each file should have at least
|
state the exclusion of warranty; and each file should have at least
|
||||||
the "copyright" line and a pointer to where the full notice is found.
|
the "copyright" line and a pointer to where the full notice is found.
|
||||||
|
|
||||||
{one line to give the program's name and a brief idea of what it does.}
|
<one line to give the program's name and a brief idea of what it does.>
|
||||||
Copyright (C) {year} {name of author}
|
Copyright (C) <year> <name of author>
|
||||||
|
|
||||||
This program is free software: you can redistribute it and/or modify
|
This program is free software: you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -652,7 +652,7 @@ Also add information on how to contact you by electronic and paper mail.
|
|||||||
If the program does terminal interaction, make it output a short
|
If the program does terminal interaction, make it output a short
|
||||||
notice like this when it starts in an interactive mode:
|
notice like this when it starts in an interactive mode:
|
||||||
|
|
||||||
Sibe Copyright (C) 2016 Mahdi Dibaiee
|
<program> Copyright (C) <year> <name of author>
|
||||||
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||||
This is free software, and you are welcome to redistribute it
|
This is free software, and you are welcome to redistribute it
|
||||||
under certain conditions; type `show c' for details.
|
under certain conditions; type `show c' for details.
|
||||||
|
@ -1,60 +0,0 @@
|
|||||||
{-# LANGUAGE RecordWildCards #-}
|
|
||||||
{-# LANGUAGE FlexibleContexts #-}
|
|
||||||
{-# LANGUAGE ScopedTypeVariables #-}
|
|
||||||
|
|
||||||
module Main where
|
|
||||||
import Numeric.LinearAlgebra
|
|
||||||
import Numeric.Sibe.Recurrent
|
|
||||||
import Numeric.Sibe.Utils
|
|
||||||
import System.IO
|
|
||||||
import Data.Default.Class
|
|
||||||
import Data.List (genericLength)
|
|
||||||
import qualified Data.ByteString.Lazy.Char8 as BL
|
|
||||||
|
|
||||||
main = do
|
|
||||||
texts <- lines <$> readFile "examples/reddit.csv"
|
|
||||||
let (vocabulary, indexes) = processData texts
|
|
||||||
|
|
||||||
let settings = def { wordD = length vocabulary }
|
|
||||||
r = randomRecurrent 0 settings
|
|
||||||
|
|
||||||
let x0 = reverse . drop 1 . reverse $ indexes !! 0
|
|
||||||
y0 = drop 1 $ indexes !! 0
|
|
||||||
|
|
||||||
let xs = map (reverse . drop 1 . reverse) indexes
|
|
||||||
ys = map (drop 1) indexes
|
|
||||||
|
|
||||||
let tov = fromList . map fromIntegral
|
|
||||||
let vys = map tov ys
|
|
||||||
|
|
||||||
let newr = sgd r (take 1 xs) (take 1 vys) 0.005 1
|
|
||||||
|
|
||||||
let newpredicted = predict newr x0
|
|
||||||
print $ y0
|
|
||||||
print $ newpredicted
|
|
||||||
|
|
||||||
print $ loss (tov y0) (tov newpredicted)
|
|
||||||
|
|
||||||
print "done"
|
|
||||||
|
|
||||||
saveRecurrent "recurrent.trained" (show newr) 512
|
|
||||||
|
|
||||||
saveRecurrent :: FilePath -> String -> Int -> IO ()
|
|
||||||
saveRecurrent path str chunkSize = do
|
|
||||||
let b = BL.pack str
|
|
||||||
withFile path AppendMode (process b)
|
|
||||||
where
|
|
||||||
process :: BL.ByteString -> Handle -> IO ()
|
|
||||||
process b handle = do
|
|
||||||
hSetBuffering handle NoBuffering
|
|
||||||
loop handle b
|
|
||||||
|
|
||||||
loop :: Handle -> BL.ByteString -> IO ()
|
|
||||||
loop handle s
|
|
||||||
| s == BL.empty = return ()
|
|
||||||
| otherwise = do
|
|
||||||
let (current, next) = BL.splitAt (fromIntegral chunkSize) s
|
|
||||||
BL.hPutStr handle current
|
|
||||||
hFlush handle
|
|
||||||
loop handle next
|
|
||||||
|
|
20087
examples/reddit.csv
20087
examples/reddit.csv
File diff suppressed because it is too large
Load Diff
@ -31,7 +31,7 @@ module Main where
|
|||||||
setStdGen (mkStdGen 100)
|
setStdGen (mkStdGen 100)
|
||||||
sws <- lines <$> readFile "examples/stopwords"
|
sws <- lines <$> readFile "examples/stopwords"
|
||||||
|
|
||||||
-- real data, currently faces a memory problem
|
-- real data, takes a lot of time to train
|
||||||
{-ds <- do-}
|
{-ds <- do-}
|
||||||
{-files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
|
{-files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
|
||||||
{-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}
|
{-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}
|
||||||
|
5639
recurrent.trained
5639
recurrent.trained
File diff suppressed because it is too large
Load Diff
43
sibe.cabal
43
sibe.cabal
@ -1,5 +1,5 @@
|
|||||||
name: sibe
|
name: sibe
|
||||||
version: 0.2.0.1
|
version: 0.2.0.5
|
||||||
synopsis: Machine Learning algorithms
|
synopsis: Machine Learning algorithms
|
||||||
description: Haskell Machine Learning
|
description: Haskell Machine Learning
|
||||||
homepage: https://github.com/mdibaiee/sibe
|
homepage: https://github.com/mdibaiee/sibe
|
||||||
@ -15,13 +15,7 @@ cabal-version: >=1.10
|
|||||||
|
|
||||||
library
|
library
|
||||||
hs-source-dirs: src
|
hs-source-dirs: src
|
||||||
exposed-modules: Numeric.Sibe,
|
exposed-modules: Numeric.Sibe, Numeric.Sibe.NaiveBayes, Numeric.Sibe.NLP, Numeric.Sibe.Word2Vec, Numeric.Sibe.Utils
|
||||||
Numeric.Sibe.NaiveBayes,
|
|
||||||
Numeric.Sibe.NLP,
|
|
||||||
Numeric.Sibe.Word2Vec,
|
|
||||||
Numeric.Sibe.Utils,
|
|
||||||
Numeric.Sibe.Recurrent
|
|
||||||
|
|
||||||
build-depends: base >= 4.7 && < 5
|
build-depends: base >= 4.7 && < 5
|
||||||
, hmatrix
|
, hmatrix
|
||||||
, random
|
, random
|
||||||
@ -35,8 +29,8 @@ library
|
|||||||
, vector
|
, vector
|
||||||
, random-shuffle
|
, random-shuffle
|
||||||
, data-default-class
|
, data-default-class
|
||||||
, Chart >= 1.8 && < 2
|
, Chart
|
||||||
, Chart-cairo >= 1.8 && < 2
|
, Chart-cairo
|
||||||
, lens
|
, lens
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
@ -64,21 +58,6 @@ executable example-word2vec
|
|||||||
, random
|
, random
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
executable example-recurrent
|
|
||||||
hs-source-dirs: examples
|
|
||||||
main-is: recurrent.hs
|
|
||||||
ghc-options: -threaded -rtsopts -with-rtsopts=-N -O2
|
|
||||||
build-depends: base
|
|
||||||
, sibe
|
|
||||||
, hmatrix
|
|
||||||
, data-default-class
|
|
||||||
, split
|
|
||||||
, vector
|
|
||||||
, directory
|
|
||||||
, random
|
|
||||||
, bytestring
|
|
||||||
default-language: Haskell2010
|
|
||||||
|
|
||||||
executable example-424
|
executable example-424
|
||||||
hs-source-dirs: examples
|
hs-source-dirs: examples
|
||||||
main-is: 424encoder.hs
|
main-is: 424encoder.hs
|
||||||
@ -97,8 +76,8 @@ executable example-notmnist
|
|||||||
, sibe
|
, sibe
|
||||||
, hmatrix
|
, hmatrix
|
||||||
, directory >= 1.2.5.0
|
, directory >= 1.2.5.0
|
||||||
, JuicyPixels == 3.2.7.2
|
, JuicyPixels < 3.3
|
||||||
, vector == 0.11.0.0
|
, vector
|
||||||
, random
|
, random
|
||||||
, random-shuffle
|
, random-shuffle
|
||||||
, data-default-class
|
, data-default-class
|
||||||
@ -117,16 +96,6 @@ executable example-naivebayes-doc-classifier
|
|||||||
, split
|
, split
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
test-suite sibe-test
|
|
||||||
type: exitcode-stdio-1.0
|
|
||||||
hs-source-dirs: test
|
|
||||||
main-is: Spec.hs
|
|
||||||
build-depends: base
|
|
||||||
, sibe
|
|
||||||
, hmatrix
|
|
||||||
ghc-options: -threaded -rtsopts -with-rtsopts=-N
|
|
||||||
default-language: Haskell2010
|
|
||||||
|
|
||||||
source-repository head
|
source-repository head
|
||||||
type: git
|
type: git
|
||||||
location: https://github.com/mdibaiee/sibe
|
location: https://github.com/mdibaiee/sibe
|
||||||
|
@ -112,8 +112,8 @@ module Numeric.Sibe.NaiveBayes
|
|||||||
-- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
|
-- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
|
||||||
in realToFrac (tct + 1) / realToFrac (cvoc + voc)
|
in realToFrac (tct + 1) / realToFrac (cvoc + voc)
|
||||||
|
|
||||||
{-argmax :: (Ord a) => [a] -> Int-}
|
argmax :: (Ord a) => [a] -> Int
|
||||||
{-argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)-}
|
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
|
||||||
|
|
||||||
mean :: [Double] -> Double
|
mean :: [Double] -> Double
|
||||||
mean x = sum x / genericLength x
|
mean x = sum x / genericLength x
|
||||||
|
@ -1,144 +0,0 @@
|
|||||||
{-# LANGUAGE GADTs #-}
|
|
||||||
{-# LANGUAGE BangPatterns #-}
|
|
||||||
{-# LANGUAGE DataKinds #-}
|
|
||||||
{-# LANGUAGE TypeOperators #-}
|
|
||||||
|
|
||||||
module Numeric.Sibe.Recurrent
|
|
||||||
( Recurrent (..)
|
|
||||||
, randomRecurrent
|
|
||||||
, processData
|
|
||||||
, forward
|
|
||||||
, predict
|
|
||||||
, loss
|
|
||||||
, backprop
|
|
||||||
, sgd
|
|
||||||
) where
|
|
||||||
import Numeric.LinearAlgebra
|
|
||||||
import System.Random
|
|
||||||
import System.Random.Shuffle
|
|
||||||
import Debug.Trace
|
|
||||||
import qualified Data.List as L
|
|
||||||
import Data.Maybe
|
|
||||||
import System.IO
|
|
||||||
import Control.DeepSeq
|
|
||||||
import Control.Monad
|
|
||||||
import qualified Data.Vector.Storable as V
|
|
||||||
import Data.Default.Class
|
|
||||||
|
|
||||||
import qualified Graphics.Rendering.Chart.Easy as Chart
|
|
||||||
import Graphics.Rendering.Chart.Backend.Cairo
|
|
||||||
import Numeric.Sibe.Utils
|
|
||||||
import Debug.Trace
|
|
||||||
|
|
||||||
processData :: [String] -> ([(Int, String)], [[Int]])
|
|
||||||
processData x =
|
|
||||||
let setokens = map (\a -> " <start> " ++ a ++ " <end> ") x
|
|
||||||
tokenized = map tokenize setokens
|
|
||||||
vocabulary = zip [0..] (unique . concat $ tokenized)
|
|
||||||
indexes = map (\a -> fst . fromJust $ L.find ((==a) . snd) vocabulary)
|
|
||||||
in (vocabulary, map indexes tokenized)
|
|
||||||
|
|
||||||
data Recurrent = Recurrent { bpttThreshold :: Int
|
|
||||||
, wordD :: Int
|
|
||||||
, hiddenD :: Int
|
|
||||||
, u :: Matrix Double
|
|
||||||
, v :: Matrix Double
|
|
||||||
, w :: Matrix Double
|
|
||||||
} deriving (Show, Read)
|
|
||||||
instance Default Recurrent where
|
|
||||||
def = Recurrent { bpttThreshold = 3
|
|
||||||
, hiddenD = 100
|
|
||||||
}
|
|
||||||
|
|
||||||
randomRecurrent :: Seed -> Recurrent -> Recurrent
|
|
||||||
randomRecurrent seed r = r { u = randomMatrix (wordD r, hiddenD r) (bounds $ wordD r)
|
|
||||||
, v = randomMatrix (hiddenD r, wordD r) (bounds $ hiddenD r)
|
|
||||||
, w = randomMatrix (hiddenD r, hiddenD r) (bounds $ hiddenD r)
|
|
||||||
}
|
|
||||||
where
|
|
||||||
randomMatrix (wr, wc) (l, u) = uniformSample (seed + wr + wc) wr $ replicate wc (l, u)
|
|
||||||
bounds x = (negate . sqrt $ 1 / fromIntegral x, sqrt $ 1 / fromIntegral x)
|
|
||||||
|
|
||||||
|
|
||||||
forward :: Recurrent -> [Int] -> (Matrix Double, Matrix Double)
|
|
||||||
forward r input =
|
|
||||||
let (h, o) = helper [vector (replicate (hiddenD r) 0)] [] input
|
|
||||||
in (fromRows h, fromRows o)
|
|
||||||
where
|
|
||||||
helper hs os [] = (hs, os)
|
|
||||||
helper (h:hs) os (i:is) =
|
|
||||||
let k = w r #> h
|
|
||||||
newh = V.map tanh $ (u r ! i) + k
|
|
||||||
o = softmax $ newh <# v r
|
|
||||||
in helper (newh:h:hs) (o:os) is
|
|
||||||
|
|
||||||
predict :: Recurrent -> [Int] -> [Int]
|
|
||||||
predict r i =
|
|
||||||
let (_, o) = forward r i
|
|
||||||
in map argmax (toLists o)
|
|
||||||
|
|
||||||
backprop :: Recurrent -> [Int] -> Vector Double -> (Matrix Double, Matrix Double, Matrix Double)
|
|
||||||
backprop r input y =
|
|
||||||
let dU = zero (u r)
|
|
||||||
dV = zero (v r)
|
|
||||||
dW = zero (w r)
|
|
||||||
in bp dU dV dW (zip [0..] input)
|
|
||||||
where
|
|
||||||
(hs, os) = forward r input
|
|
||||||
-- delta
|
|
||||||
dO = fromColumns $ zipWith (\i o -> if i `V.elem` y then o - 1 else o) [0..] (toColumns os)
|
|
||||||
|
|
||||||
bp dU dV dW [] = (dU, dV, dW)
|
|
||||||
bp dU dV dW ((i,x):xs) =
|
|
||||||
let ndV = dV + (hs ! i) `outer` (dO ! i)
|
|
||||||
dT = (v r) #> (dO ! i) -- * (1 - (hs ! i)^2)
|
|
||||||
threshold = bpttThreshold r
|
|
||||||
(ndU, ndW) = tt dU dW dT [max 0 (i-threshold)..i]
|
|
||||||
in bp ndU ndV ndW xs
|
|
||||||
where
|
|
||||||
tt dU dW dT [] = (dU, dW)
|
|
||||||
tt dU dW dT (c:cs) =
|
|
||||||
let ndW = dW + (dT `outer` (hs ! (max 0 $ c - 1)))
|
|
||||||
zdT = vector $ replicate (V.length dT) 0
|
|
||||||
mdT = fromRows $ replicate (max 0 $ c - 1) zdT ++ [dT] ++ replicate (min (rows dU - 1) $ rows dU - c) zdT
|
|
||||||
ndU = dU + mdT
|
|
||||||
ndT = (w r) #> dT
|
|
||||||
in tt ndU ndW ndT cs
|
|
||||||
|
|
||||||
zero m = ((rows m)><(cols m)) $ repeat 0
|
|
||||||
|
|
||||||
{-gradientCheck :: Recurrent -> [Int] -> Vector Double -> Double-}
|
|
||||||
|
|
||||||
sgdStep :: Recurrent -> [Int] -> Vector Double -> Double -> Recurrent
|
|
||||||
sgdStep r input y learningRate =
|
|
||||||
let (dU, dV, dW) = backprop r input y
|
|
||||||
in r { u = (u r) - scale learningRate dU
|
|
||||||
, v = (v r) - scale learningRate dV
|
|
||||||
, w = (w r) - scale learningRate dW
|
|
||||||
}
|
|
||||||
|
|
||||||
sgd :: Recurrent -> [[Int]] -> [Vector Double] -> Double -> Int -> Recurrent
|
|
||||||
sgd r input y learningRate epochs = run [0..epochs] r
|
|
||||||
where
|
|
||||||
run [] r = r
|
|
||||||
run (i:is) r = run is $ train (zip input y) r
|
|
||||||
|
|
||||||
train [] r = r
|
|
||||||
train ((x, y):xs) r = train xs $ sgdStep r x y learningRate
|
|
||||||
|
|
||||||
softmax :: Vector Double -> Vector Double
|
|
||||||
softmax x = cmap (\a -> exp a / s) x
|
|
||||||
where
|
|
||||||
s = V.sum $ exp x
|
|
||||||
|
|
||||||
softmax' :: Vector Double -> Vector Double
|
|
||||||
softmax' = cmap (\a -> sig a * (1 - sig a))
|
|
||||||
where
|
|
||||||
sig x = 1 / max (1 + exp (-x)) 1e-10
|
|
||||||
|
|
||||||
-- cross-entropy
|
|
||||||
loss :: Vector Double -> Vector Double -> Double
|
|
||||||
loss ys os = (-1 / fromIntegral (V.length os)) * V.sum (V.zipWith f os ys)
|
|
||||||
where
|
|
||||||
f a y = y * log (max 1e-10 a)
|
|
||||||
|
|
@ -4,19 +4,10 @@ module Numeric.Sibe.Utils
|
|||||||
, onehot
|
, onehot
|
||||||
, average
|
, average
|
||||||
, pca
|
, pca
|
||||||
, tokenize
|
|
||||||
, frequency
|
|
||||||
, unique
|
|
||||||
, argmax
|
|
||||||
, shape
|
|
||||||
) where
|
) where
|
||||||
import qualified Data.Vector.Storable as V
|
import qualified Data.Vector.Storable as V
|
||||||
import qualified Data.Set as Set
|
import qualified Data.Set as Set
|
||||||
import Numeric.LinearAlgebra
|
import Numeric.LinearAlgebra
|
||||||
import Data.List.Split
|
|
||||||
import Data.Char (isSpace, isNumber, toLower)
|
|
||||||
import Control.Arrow ((&&&))
|
|
||||||
import Data.List
|
|
||||||
|
|
||||||
similarity :: Vector Double -> Vector Double -> Double
|
similarity :: Vector Double -> Vector Double -> Double
|
||||||
similarity a b = (V.sum $ a * b) / (magnitude a * magnitude b)
|
similarity a b = (V.sum $ a * b) / (magnitude a * magnitude b)
|
||||||
@ -33,8 +24,6 @@ module Numeric.Sibe.Utils
|
|||||||
go _ [] = []
|
go _ [] = []
|
||||||
go s (x:xs) = if x `Set.member` s then go s xs
|
go s (x:xs) = if x `Set.member` s then go s xs
|
||||||
else x : go (Set.insert x s) xs
|
else x : go (Set.insert x s) xs
|
||||||
unique :: (Ord a) => [a] -> [a]
|
|
||||||
unique = ordNub
|
|
||||||
|
|
||||||
average :: Vector Double -> Vector Double
|
average :: Vector Double -> Vector Double
|
||||||
average v = cmap (/ (V.sum v)) v
|
average v = cmap (/ (V.sum v)) v
|
||||||
@ -50,27 +39,3 @@ module Numeric.Sibe.Utils
|
|||||||
diagS = diagRect 0 s (rows mat) (cols mat)
|
diagS = diagRect 0 s (rows mat) (cols mat)
|
||||||
|
|
||||||
in u ?? (All, Take d) <> diagS ?? (Take d, Take d)
|
in u ?? (All, Take d) <> diagS ?? (Take d, Take d)
|
||||||
|
|
||||||
tokenize :: String -> [String]
|
|
||||||
tokenize str =
|
|
||||||
let spaced = spacify str
|
|
||||||
ws = words spaced
|
|
||||||
in ws
|
|
||||||
where
|
|
||||||
puncs = ['!', '"', '#', '$', '%', '(', ')', '.', '?', ',', '\'', '/', '-']
|
|
||||||
replace needle replacement =
|
|
||||||
concatMap (\c -> if c == needle then replacement else c)
|
|
||||||
spacify = foldl (\acc c -> if c `elem` puncs then acc ++ [' ', c, ' '] else acc ++ [c]) ""
|
|
||||||
|
|
||||||
frequency :: (Ord a) => [a] -> [(a, Int)]
|
|
||||||
frequency = map (head &&& length) . group . sort
|
|
||||||
|
|
||||||
argmax :: (Foldable t, Num a, Fractional a, Ord a) => t a -> Int
|
|
||||||
argmax v = snd $ foldl mx ((-1/0), 0) v
|
|
||||||
where
|
|
||||||
mx (a, i) b
|
|
||||||
| b > a = (b, i + 1)
|
|
||||||
| otherwise = (a, i)
|
|
||||||
|
|
||||||
shape :: Matrix a -> (Int, Int)
|
|
||||||
shape x = (rows x, cols x)
|
|
||||||
|
@ -1,8 +0,0 @@
|
|||||||
module Main where
|
|
||||||
import System.Exit (exitFailure)
|
|
||||||
|
|
||||||
import Sibe
|
|
||||||
|
|
||||||
main = do
|
|
||||||
putStrLn "Hey"
|
|
||||||
exitFailure
|
|
Loading…
Reference in New Issue
Block a user