Compare commits
10 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
d22d91bd6a | ||
|
83f9ff1c44 | ||
|
fd987ad490 | ||
|
172875e4c2 | ||
|
58d62a06fc | ||
|
854bfe5805 | ||
|
b382d63994 | ||
|
b7b62223d6 | ||
|
94e11e5b7c | ||
|
fa84c7efe9 |
6
LICENSE
6
LICENSE
@ -631,8 +631,8 @@ to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
{one line to give the program's name and a brief idea of what it does.}
|
||||
Copyright (C) {year} {name of author}
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
@ -652,7 +652,7 @@ Also add information on how to contact you by electronic and paper mail.
|
||||
If the program does terminal interaction, make it output a short
|
||||
notice like this when it starts in an interactive mode:
|
||||
|
||||
Sibe Copyright (C) 2016 Mahdi Dibaiee
|
||||
<program> Copyright (C) <year> <name of author>
|
||||
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||
This is free software, and you are welcome to redistribute it
|
||||
under certain conditions; type `show c' for details.
|
||||
|
@ -1,60 +0,0 @@
|
||||
{-# LANGUAGE RecordWildCards #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE ScopedTypeVariables #-}
|
||||
|
||||
module Main where
|
||||
import Numeric.LinearAlgebra
|
||||
import Numeric.Sibe.Recurrent
|
||||
import Numeric.Sibe.Utils
|
||||
import System.IO
|
||||
import Data.Default.Class
|
||||
import Data.List (genericLength)
|
||||
import qualified Data.ByteString.Lazy.Char8 as BL
|
||||
|
||||
main = do
|
||||
texts <- lines <$> readFile "examples/reddit.csv"
|
||||
let (vocabulary, indexes) = processData texts
|
||||
|
||||
let settings = def { wordD = length vocabulary }
|
||||
r = randomRecurrent 0 settings
|
||||
|
||||
let x0 = reverse . drop 1 . reverse $ indexes !! 0
|
||||
y0 = drop 1 $ indexes !! 0
|
||||
|
||||
let xs = map (reverse . drop 1 . reverse) indexes
|
||||
ys = map (drop 1) indexes
|
||||
|
||||
let tov = fromList . map fromIntegral
|
||||
let vys = map tov ys
|
||||
|
||||
let newr = sgd r (take 1 xs) (take 1 vys) 0.005 1
|
||||
|
||||
let newpredicted = predict newr x0
|
||||
print $ y0
|
||||
print $ newpredicted
|
||||
|
||||
print $ loss (tov y0) (tov newpredicted)
|
||||
|
||||
print "done"
|
||||
|
||||
saveRecurrent "recurrent.trained" (show newr) 512
|
||||
|
||||
saveRecurrent :: FilePath -> String -> Int -> IO ()
|
||||
saveRecurrent path str chunkSize = do
|
||||
let b = BL.pack str
|
||||
withFile path AppendMode (process b)
|
||||
where
|
||||
process :: BL.ByteString -> Handle -> IO ()
|
||||
process b handle = do
|
||||
hSetBuffering handle NoBuffering
|
||||
loop handle b
|
||||
|
||||
loop :: Handle -> BL.ByteString -> IO ()
|
||||
loop handle s
|
||||
| s == BL.empty = return ()
|
||||
| otherwise = do
|
||||
let (current, next) = BL.splitAt (fromIntegral chunkSize) s
|
||||
BL.hPutStr handle current
|
||||
hFlush handle
|
||||
loop handle next
|
||||
|
20087
examples/reddit.csv
20087
examples/reddit.csv
File diff suppressed because it is too large
Load Diff
@ -31,7 +31,7 @@ module Main where
|
||||
setStdGen (mkStdGen 100)
|
||||
sws <- lines <$> readFile "examples/stopwords"
|
||||
|
||||
-- real data, currently faces a memory problem
|
||||
-- real data, takes a lot of time to train
|
||||
{-ds <- do-}
|
||||
{-files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
|
||||
{-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}
|
||||
|
5639
recurrent.trained
5639
recurrent.trained
File diff suppressed because it is too large
Load Diff
43
sibe.cabal
43
sibe.cabal
@ -1,5 +1,5 @@
|
||||
name: sibe
|
||||
version: 0.2.0.1
|
||||
version: 0.2.0.5
|
||||
synopsis: Machine Learning algorithms
|
||||
description: Haskell Machine Learning
|
||||
homepage: https://github.com/mdibaiee/sibe
|
||||
@ -15,13 +15,7 @@ cabal-version: >=1.10
|
||||
|
||||
library
|
||||
hs-source-dirs: src
|
||||
exposed-modules: Numeric.Sibe,
|
||||
Numeric.Sibe.NaiveBayes,
|
||||
Numeric.Sibe.NLP,
|
||||
Numeric.Sibe.Word2Vec,
|
||||
Numeric.Sibe.Utils,
|
||||
Numeric.Sibe.Recurrent
|
||||
|
||||
exposed-modules: Numeric.Sibe, Numeric.Sibe.NaiveBayes, Numeric.Sibe.NLP, Numeric.Sibe.Word2Vec, Numeric.Sibe.Utils
|
||||
build-depends: base >= 4.7 && < 5
|
||||
, hmatrix
|
||||
, random
|
||||
@ -35,8 +29,8 @@ library
|
||||
, vector
|
||||
, random-shuffle
|
||||
, data-default-class
|
||||
, Chart >= 1.8 && < 2
|
||||
, Chart-cairo >= 1.8 && < 2
|
||||
, Chart
|
||||
, Chart-cairo
|
||||
, lens
|
||||
default-language: Haskell2010
|
||||
|
||||
@ -64,21 +58,6 @@ executable example-word2vec
|
||||
, random
|
||||
default-language: Haskell2010
|
||||
|
||||
executable example-recurrent
|
||||
hs-source-dirs: examples
|
||||
main-is: recurrent.hs
|
||||
ghc-options: -threaded -rtsopts -with-rtsopts=-N -O2
|
||||
build-depends: base
|
||||
, sibe
|
||||
, hmatrix
|
||||
, data-default-class
|
||||
, split
|
||||
, vector
|
||||
, directory
|
||||
, random
|
||||
, bytestring
|
||||
default-language: Haskell2010
|
||||
|
||||
executable example-424
|
||||
hs-source-dirs: examples
|
||||
main-is: 424encoder.hs
|
||||
@ -97,8 +76,8 @@ executable example-notmnist
|
||||
, sibe
|
||||
, hmatrix
|
||||
, directory >= 1.2.5.0
|
||||
, JuicyPixels == 3.2.7.2
|
||||
, vector == 0.11.0.0
|
||||
, JuicyPixels < 3.3
|
||||
, vector
|
||||
, random
|
||||
, random-shuffle
|
||||
, data-default-class
|
||||
@ -117,16 +96,6 @@ executable example-naivebayes-doc-classifier
|
||||
, split
|
||||
default-language: Haskell2010
|
||||
|
||||
test-suite sibe-test
|
||||
type: exitcode-stdio-1.0
|
||||
hs-source-dirs: test
|
||||
main-is: Spec.hs
|
||||
build-depends: base
|
||||
, sibe
|
||||
, hmatrix
|
||||
ghc-options: -threaded -rtsopts -with-rtsopts=-N
|
||||
default-language: Haskell2010
|
||||
|
||||
source-repository head
|
||||
type: git
|
||||
location: https://github.com/mdibaiee/sibe
|
||||
|
@ -112,8 +112,8 @@ module Numeric.Sibe.NaiveBayes
|
||||
-- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
|
||||
in realToFrac (tct + 1) / realToFrac (cvoc + voc)
|
||||
|
||||
{-argmax :: (Ord a) => [a] -> Int-}
|
||||
{-argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)-}
|
||||
argmax :: (Ord a) => [a] -> Int
|
||||
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
|
||||
|
||||
mean :: [Double] -> Double
|
||||
mean x = sum x / genericLength x
|
||||
|
@ -1,144 +0,0 @@
|
||||
{-# LANGUAGE GADTs #-}
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
{-# LANGUAGE DataKinds #-}
|
||||
{-# LANGUAGE TypeOperators #-}
|
||||
|
||||
module Numeric.Sibe.Recurrent
|
||||
( Recurrent (..)
|
||||
, randomRecurrent
|
||||
, processData
|
||||
, forward
|
||||
, predict
|
||||
, loss
|
||||
, backprop
|
||||
, sgd
|
||||
) where
|
||||
import Numeric.LinearAlgebra
|
||||
import System.Random
|
||||
import System.Random.Shuffle
|
||||
import Debug.Trace
|
||||
import qualified Data.List as L
|
||||
import Data.Maybe
|
||||
import System.IO
|
||||
import Control.DeepSeq
|
||||
import Control.Monad
|
||||
import qualified Data.Vector.Storable as V
|
||||
import Data.Default.Class
|
||||
|
||||
import qualified Graphics.Rendering.Chart.Easy as Chart
|
||||
import Graphics.Rendering.Chart.Backend.Cairo
|
||||
import Numeric.Sibe.Utils
|
||||
import Debug.Trace
|
||||
|
||||
processData :: [String] -> ([(Int, String)], [[Int]])
|
||||
processData x =
|
||||
let setokens = map (\a -> " <start> " ++ a ++ " <end> ") x
|
||||
tokenized = map tokenize setokens
|
||||
vocabulary = zip [0..] (unique . concat $ tokenized)
|
||||
indexes = map (\a -> fst . fromJust $ L.find ((==a) . snd) vocabulary)
|
||||
in (vocabulary, map indexes tokenized)
|
||||
|
||||
data Recurrent = Recurrent { bpttThreshold :: Int
|
||||
, wordD :: Int
|
||||
, hiddenD :: Int
|
||||
, u :: Matrix Double
|
||||
, v :: Matrix Double
|
||||
, w :: Matrix Double
|
||||
} deriving (Show, Read)
|
||||
instance Default Recurrent where
|
||||
def = Recurrent { bpttThreshold = 3
|
||||
, hiddenD = 100
|
||||
}
|
||||
|
||||
randomRecurrent :: Seed -> Recurrent -> Recurrent
|
||||
randomRecurrent seed r = r { u = randomMatrix (wordD r, hiddenD r) (bounds $ wordD r)
|
||||
, v = randomMatrix (hiddenD r, wordD r) (bounds $ hiddenD r)
|
||||
, w = randomMatrix (hiddenD r, hiddenD r) (bounds $ hiddenD r)
|
||||
}
|
||||
where
|
||||
randomMatrix (wr, wc) (l, u) = uniformSample (seed + wr + wc) wr $ replicate wc (l, u)
|
||||
bounds x = (negate . sqrt $ 1 / fromIntegral x, sqrt $ 1 / fromIntegral x)
|
||||
|
||||
|
||||
forward :: Recurrent -> [Int] -> (Matrix Double, Matrix Double)
|
||||
forward r input =
|
||||
let (h, o) = helper [vector (replicate (hiddenD r) 0)] [] input
|
||||
in (fromRows h, fromRows o)
|
||||
where
|
||||
helper hs os [] = (hs, os)
|
||||
helper (h:hs) os (i:is) =
|
||||
let k = w r #> h
|
||||
newh = V.map tanh $ (u r ! i) + k
|
||||
o = softmax $ newh <# v r
|
||||
in helper (newh:h:hs) (o:os) is
|
||||
|
||||
predict :: Recurrent -> [Int] -> [Int]
|
||||
predict r i =
|
||||
let (_, o) = forward r i
|
||||
in map argmax (toLists o)
|
||||
|
||||
backprop :: Recurrent -> [Int] -> Vector Double -> (Matrix Double, Matrix Double, Matrix Double)
|
||||
backprop r input y =
|
||||
let dU = zero (u r)
|
||||
dV = zero (v r)
|
||||
dW = zero (w r)
|
||||
in bp dU dV dW (zip [0..] input)
|
||||
where
|
||||
(hs, os) = forward r input
|
||||
-- delta
|
||||
dO = fromColumns $ zipWith (\i o -> if i `V.elem` y then o - 1 else o) [0..] (toColumns os)
|
||||
|
||||
bp dU dV dW [] = (dU, dV, dW)
|
||||
bp dU dV dW ((i,x):xs) =
|
||||
let ndV = dV + (hs ! i) `outer` (dO ! i)
|
||||
dT = (v r) #> (dO ! i) -- * (1 - (hs ! i)^2)
|
||||
threshold = bpttThreshold r
|
||||
(ndU, ndW) = tt dU dW dT [max 0 (i-threshold)..i]
|
||||
in bp ndU ndV ndW xs
|
||||
where
|
||||
tt dU dW dT [] = (dU, dW)
|
||||
tt dU dW dT (c:cs) =
|
||||
let ndW = dW + (dT `outer` (hs ! (max 0 $ c - 1)))
|
||||
zdT = vector $ replicate (V.length dT) 0
|
||||
mdT = fromRows $ replicate (max 0 $ c - 1) zdT ++ [dT] ++ replicate (min (rows dU - 1) $ rows dU - c) zdT
|
||||
ndU = dU + mdT
|
||||
ndT = (w r) #> dT
|
||||
in tt ndU ndW ndT cs
|
||||
|
||||
zero m = ((rows m)><(cols m)) $ repeat 0
|
||||
|
||||
{-gradientCheck :: Recurrent -> [Int] -> Vector Double -> Double-}
|
||||
|
||||
sgdStep :: Recurrent -> [Int] -> Vector Double -> Double -> Recurrent
|
||||
sgdStep r input y learningRate =
|
||||
let (dU, dV, dW) = backprop r input y
|
||||
in r { u = (u r) - scale learningRate dU
|
||||
, v = (v r) - scale learningRate dV
|
||||
, w = (w r) - scale learningRate dW
|
||||
}
|
||||
|
||||
sgd :: Recurrent -> [[Int]] -> [Vector Double] -> Double -> Int -> Recurrent
|
||||
sgd r input y learningRate epochs = run [0..epochs] r
|
||||
where
|
||||
run [] r = r
|
||||
run (i:is) r = run is $ train (zip input y) r
|
||||
|
||||
train [] r = r
|
||||
train ((x, y):xs) r = train xs $ sgdStep r x y learningRate
|
||||
|
||||
softmax :: Vector Double -> Vector Double
|
||||
softmax x = cmap (\a -> exp a / s) x
|
||||
where
|
||||
s = V.sum $ exp x
|
||||
|
||||
softmax' :: Vector Double -> Vector Double
|
||||
softmax' = cmap (\a -> sig a * (1 - sig a))
|
||||
where
|
||||
sig x = 1 / max (1 + exp (-x)) 1e-10
|
||||
|
||||
-- cross-entropy
|
||||
loss :: Vector Double -> Vector Double -> Double
|
||||
loss ys os = (-1 / fromIntegral (V.length os)) * V.sum (V.zipWith f os ys)
|
||||
where
|
||||
f a y = y * log (max 1e-10 a)
|
||||
|
@ -4,19 +4,10 @@ module Numeric.Sibe.Utils
|
||||
, onehot
|
||||
, average
|
||||
, pca
|
||||
, tokenize
|
||||
, frequency
|
||||
, unique
|
||||
, argmax
|
||||
, shape
|
||||
) where
|
||||
import qualified Data.Vector.Storable as V
|
||||
import qualified Data.Set as Set
|
||||
import Numeric.LinearAlgebra
|
||||
import Data.List.Split
|
||||
import Data.Char (isSpace, isNumber, toLower)
|
||||
import Control.Arrow ((&&&))
|
||||
import Data.List
|
||||
|
||||
similarity :: Vector Double -> Vector Double -> Double
|
||||
similarity a b = (V.sum $ a * b) / (magnitude a * magnitude b)
|
||||
@ -33,8 +24,6 @@ module Numeric.Sibe.Utils
|
||||
go _ [] = []
|
||||
go s (x:xs) = if x `Set.member` s then go s xs
|
||||
else x : go (Set.insert x s) xs
|
||||
unique :: (Ord a) => [a] -> [a]
|
||||
unique = ordNub
|
||||
|
||||
average :: Vector Double -> Vector Double
|
||||
average v = cmap (/ (V.sum v)) v
|
||||
@ -50,27 +39,3 @@ module Numeric.Sibe.Utils
|
||||
diagS = diagRect 0 s (rows mat) (cols mat)
|
||||
|
||||
in u ?? (All, Take d) <> diagS ?? (Take d, Take d)
|
||||
|
||||
tokenize :: String -> [String]
|
||||
tokenize str =
|
||||
let spaced = spacify str
|
||||
ws = words spaced
|
||||
in ws
|
||||
where
|
||||
puncs = ['!', '"', '#', '$', '%', '(', ')', '.', '?', ',', '\'', '/', '-']
|
||||
replace needle replacement =
|
||||
concatMap (\c -> if c == needle then replacement else c)
|
||||
spacify = foldl (\acc c -> if c `elem` puncs then acc ++ [' ', c, ' '] else acc ++ [c]) ""
|
||||
|
||||
frequency :: (Ord a) => [a] -> [(a, Int)]
|
||||
frequency = map (head &&& length) . group . sort
|
||||
|
||||
argmax :: (Foldable t, Num a, Fractional a, Ord a) => t a -> Int
|
||||
argmax v = snd $ foldl mx ((-1/0), 0) v
|
||||
where
|
||||
mx (a, i) b
|
||||
| b > a = (b, i + 1)
|
||||
| otherwise = (a, i)
|
||||
|
||||
shape :: Matrix a -> (Int, Int)
|
||||
shape x = (rows x, cols x)
|
||||
|
@ -1,8 +0,0 @@
|
||||
module Main where
|
||||
import System.Exit (exitFailure)
|
||||
|
||||
import Sibe
|
||||
|
||||
main = do
|
||||
putStrLn "Hey"
|
||||
exitFailure
|
Loading…
Reference in New Issue
Block a user