Compare commits

..

10 Commits
rnn ... master

Author SHA1 Message Date
Mahdi Dibaiee
d22d91bd6a Merge branch 'master' of github.com:mdibaiee/sibe 2017-05-25 12:05:31 +04:30
Mahdi Dibaiee
83f9ff1c44 chore: bump version 2017-05-25 12:04:34 +04:30
Mahdi Dibaiee
fd987ad490 Merge pull request #3 from peti/master
Distribute an *unmodified* GPL-3 license text.
2017-04-01 12:51:17 -04:00
Peter Simons
172875e4c2 Distribute an *unmodified* GPL-3 license text.
If you modify the text of the license in whatever trivial way, then the
resulting license may no longer be called "GPL-3". That is probably not what
you intended.
2017-03-31 16:04:23 +02:00
Mahdi Dibaiee
58d62a06fc chore: bump version 2017-01-08 15:09:32 +03:30
Mahdi Dibaiee
854bfe5805 chore(dependencies): update vector 2017-01-08 15:06:35 +03:30
Mahdi Dibaiee
b382d63994 remove test suite 2016-11-09 11:22:33 +03:30
Mahdi Dibaiee
b7b62223d6 chore: bump version 2016-11-08 22:42:37 +03:30
Mahdi Dibaiee
94e11e5b7c chore: bump version 2016-11-08 22:40:26 +03:30
Mahdi Dibaiee
fa84c7efe9 chore(deps): JuicyPixel < 3.3 2016-11-08 22:39:50 +03:30
10 changed files with 12 additions and 26016 deletions

View File

@ -631,8 +631,8 @@ to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
{one line to give the program's name and a brief idea of what it does.}
Copyright (C) {year} {name of author}
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -652,7 +652,7 @@ Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
Sibe Copyright (C) 2016 Mahdi Dibaiee
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.

View File

@ -1,60 +0,0 @@
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Main where
import Numeric.LinearAlgebra
import Numeric.Sibe.Recurrent
import Numeric.Sibe.Utils
import System.IO
import Data.Default.Class
import Data.List (genericLength)
import qualified Data.ByteString.Lazy.Char8 as BL
main = do
texts <- lines <$> readFile "examples/reddit.csv"
let (vocabulary, indexes) = processData texts
let settings = def { wordD = length vocabulary }
r = randomRecurrent 0 settings
let x0 = reverse . drop 1 . reverse $ indexes !! 0
y0 = drop 1 $ indexes !! 0
let xs = map (reverse . drop 1 . reverse) indexes
ys = map (drop 1) indexes
let tov = fromList . map fromIntegral
let vys = map tov ys
let newr = sgd r (take 1 xs) (take 1 vys) 0.005 1
let newpredicted = predict newr x0
print $ y0
print $ newpredicted
print $ loss (tov y0) (tov newpredicted)
print "done"
saveRecurrent "recurrent.trained" (show newr) 512
saveRecurrent :: FilePath -> String -> Int -> IO ()
saveRecurrent path str chunkSize = do
let b = BL.pack str
withFile path AppendMode (process b)
where
process :: BL.ByteString -> Handle -> IO ()
process b handle = do
hSetBuffering handle NoBuffering
loop handle b
loop :: Handle -> BL.ByteString -> IO ()
loop handle s
| s == BL.empty = return ()
| otherwise = do
let (current, next) = BL.splitAt (fromIntegral chunkSize) s
BL.hPutStr handle current
hFlush handle
loop handle next

File diff suppressed because it is too large Load Diff

View File

@ -31,7 +31,7 @@ module Main where
setStdGen (mkStdGen 100)
sws <- lines <$> readFile "examples/stopwords"
-- real data, currently faces a memory problem
-- real data, takes a lot of time to train
{-ds <- do-}
{-files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
{-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
name: sibe
version: 0.2.0.1
version: 0.2.0.5
synopsis: Machine Learning algorithms
description: Haskell Machine Learning
homepage: https://github.com/mdibaiee/sibe
@ -15,13 +15,7 @@ cabal-version: >=1.10
library
hs-source-dirs: src
exposed-modules: Numeric.Sibe,
Numeric.Sibe.NaiveBayes,
Numeric.Sibe.NLP,
Numeric.Sibe.Word2Vec,
Numeric.Sibe.Utils,
Numeric.Sibe.Recurrent
exposed-modules: Numeric.Sibe, Numeric.Sibe.NaiveBayes, Numeric.Sibe.NLP, Numeric.Sibe.Word2Vec, Numeric.Sibe.Utils
build-depends: base >= 4.7 && < 5
, hmatrix
, random
@ -35,8 +29,8 @@ library
, vector
, random-shuffle
, data-default-class
, Chart >= 1.8 && < 2
, Chart-cairo >= 1.8 && < 2
, Chart
, Chart-cairo
, lens
default-language: Haskell2010
@ -64,21 +58,6 @@ executable example-word2vec
, random
default-language: Haskell2010
executable example-recurrent
hs-source-dirs: examples
main-is: recurrent.hs
ghc-options: -threaded -rtsopts -with-rtsopts=-N -O2
build-depends: base
, sibe
, hmatrix
, data-default-class
, split
, vector
, directory
, random
, bytestring
default-language: Haskell2010
executable example-424
hs-source-dirs: examples
main-is: 424encoder.hs
@ -97,8 +76,8 @@ executable example-notmnist
, sibe
, hmatrix
, directory >= 1.2.5.0
, JuicyPixels == 3.2.7.2
, vector == 0.11.0.0
, JuicyPixels < 3.3
, vector
, random
, random-shuffle
, data-default-class
@ -117,16 +96,6 @@ executable example-naivebayes-doc-classifier
, split
default-language: Haskell2010
test-suite sibe-test
type: exitcode-stdio-1.0
hs-source-dirs: test
main-is: Spec.hs
build-depends: base
, sibe
, hmatrix
ghc-options: -threaded -rtsopts -with-rtsopts=-N
default-language: Haskell2010
source-repository head
type: git
location: https://github.com/mdibaiee/sibe

View File

@ -112,8 +112,8 @@ module Numeric.Sibe.NaiveBayes
-- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
in realToFrac (tct + 1) / realToFrac (cvoc + voc)
{-argmax :: (Ord a) => [a] -> Int-}
{-argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)-}
argmax :: (Ord a) => [a] -> Int
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
mean :: [Double] -> Double
mean x = sum x / genericLength x

View File

@ -1,144 +0,0 @@
{-# LANGUAGE GADTs #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TypeOperators #-}
module Numeric.Sibe.Recurrent
( Recurrent (..)
, randomRecurrent
, processData
, forward
, predict
, loss
, backprop
, sgd
) where
import Numeric.LinearAlgebra
import System.Random
import System.Random.Shuffle
import Debug.Trace
import qualified Data.List as L
import Data.Maybe
import System.IO
import Control.DeepSeq
import Control.Monad
import qualified Data.Vector.Storable as V
import Data.Default.Class
import qualified Graphics.Rendering.Chart.Easy as Chart
import Graphics.Rendering.Chart.Backend.Cairo
import Numeric.Sibe.Utils
import Debug.Trace
processData :: [String] -> ([(Int, String)], [[Int]])
processData x =
let setokens = map (\a -> " <start> " ++ a ++ " <end> ") x
tokenized = map tokenize setokens
vocabulary = zip [0..] (unique . concat $ tokenized)
indexes = map (\a -> fst . fromJust $ L.find ((==a) . snd) vocabulary)
in (vocabulary, map indexes tokenized)
data Recurrent = Recurrent { bpttThreshold :: Int
, wordD :: Int
, hiddenD :: Int
, u :: Matrix Double
, v :: Matrix Double
, w :: Matrix Double
} deriving (Show, Read)
instance Default Recurrent where
def = Recurrent { bpttThreshold = 3
, hiddenD = 100
}
randomRecurrent :: Seed -> Recurrent -> Recurrent
randomRecurrent seed r = r { u = randomMatrix (wordD r, hiddenD r) (bounds $ wordD r)
, v = randomMatrix (hiddenD r, wordD r) (bounds $ hiddenD r)
, w = randomMatrix (hiddenD r, hiddenD r) (bounds $ hiddenD r)
}
where
randomMatrix (wr, wc) (l, u) = uniformSample (seed + wr + wc) wr $ replicate wc (l, u)
bounds x = (negate . sqrt $ 1 / fromIntegral x, sqrt $ 1 / fromIntegral x)
forward :: Recurrent -> [Int] -> (Matrix Double, Matrix Double)
forward r input =
let (h, o) = helper [vector (replicate (hiddenD r) 0)] [] input
in (fromRows h, fromRows o)
where
helper hs os [] = (hs, os)
helper (h:hs) os (i:is) =
let k = w r #> h
newh = V.map tanh $ (u r ! i) + k
o = softmax $ newh <# v r
in helper (newh:h:hs) (o:os) is
predict :: Recurrent -> [Int] -> [Int]
predict r i =
let (_, o) = forward r i
in map argmax (toLists o)
backprop :: Recurrent -> [Int] -> Vector Double -> (Matrix Double, Matrix Double, Matrix Double)
backprop r input y =
let dU = zero (u r)
dV = zero (v r)
dW = zero (w r)
in bp dU dV dW (zip [0..] input)
where
(hs, os) = forward r input
-- delta
dO = fromColumns $ zipWith (\i o -> if i `V.elem` y then o - 1 else o) [0..] (toColumns os)
bp dU dV dW [] = (dU, dV, dW)
bp dU dV dW ((i,x):xs) =
let ndV = dV + (hs ! i) `outer` (dO ! i)
dT = (v r) #> (dO ! i) -- * (1 - (hs ! i)^2)
threshold = bpttThreshold r
(ndU, ndW) = tt dU dW dT [max 0 (i-threshold)..i]
in bp ndU ndV ndW xs
where
tt dU dW dT [] = (dU, dW)
tt dU dW dT (c:cs) =
let ndW = dW + (dT `outer` (hs ! (max 0 $ c - 1)))
zdT = vector $ replicate (V.length dT) 0
mdT = fromRows $ replicate (max 0 $ c - 1) zdT ++ [dT] ++ replicate (min (rows dU - 1) $ rows dU - c) zdT
ndU = dU + mdT
ndT = (w r) #> dT
in tt ndU ndW ndT cs
zero m = ((rows m)><(cols m)) $ repeat 0
{-gradientCheck :: Recurrent -> [Int] -> Vector Double -> Double-}
sgdStep :: Recurrent -> [Int] -> Vector Double -> Double -> Recurrent
sgdStep r input y learningRate =
let (dU, dV, dW) = backprop r input y
in r { u = (u r) - scale learningRate dU
, v = (v r) - scale learningRate dV
, w = (w r) - scale learningRate dW
}
sgd :: Recurrent -> [[Int]] -> [Vector Double] -> Double -> Int -> Recurrent
sgd r input y learningRate epochs = run [0..epochs] r
where
run [] r = r
run (i:is) r = run is $ train (zip input y) r
train [] r = r
train ((x, y):xs) r = train xs $ sgdStep r x y learningRate
softmax :: Vector Double -> Vector Double
softmax x = cmap (\a -> exp a / s) x
where
s = V.sum $ exp x
softmax' :: Vector Double -> Vector Double
softmax' = cmap (\a -> sig a * (1 - sig a))
where
sig x = 1 / max (1 + exp (-x)) 1e-10
-- cross-entropy
loss :: Vector Double -> Vector Double -> Double
loss ys os = (-1 / fromIntegral (V.length os)) * V.sum (V.zipWith f os ys)
where
f a y = y * log (max 1e-10 a)

View File

@ -4,19 +4,10 @@ module Numeric.Sibe.Utils
, onehot
, average
, pca
, tokenize
, frequency
, unique
, argmax
, shape
) where
import qualified Data.Vector.Storable as V
import qualified Data.Set as Set
import Numeric.LinearAlgebra
import Data.List.Split
import Data.Char (isSpace, isNumber, toLower)
import Control.Arrow ((&&&))
import Data.List
similarity :: Vector Double -> Vector Double -> Double
similarity a b = (V.sum $ a * b) / (magnitude a * magnitude b)
@ -33,8 +24,6 @@ module Numeric.Sibe.Utils
go _ [] = []
go s (x:xs) = if x `Set.member` s then go s xs
else x : go (Set.insert x s) xs
unique :: (Ord a) => [a] -> [a]
unique = ordNub
average :: Vector Double -> Vector Double
average v = cmap (/ (V.sum v)) v
@ -50,27 +39,3 @@ module Numeric.Sibe.Utils
diagS = diagRect 0 s (rows mat) (cols mat)
in u ?? (All, Take d) <> diagS ?? (Take d, Take d)
tokenize :: String -> [String]
tokenize str =
let spaced = spacify str
ws = words spaced
in ws
where
puncs = ['!', '"', '#', '$', '%', '(', ')', '.', '?', ',', '\'', '/', '-']
replace needle replacement =
concatMap (\c -> if c == needle then replacement else c)
spacify = foldl (\acc c -> if c `elem` puncs then acc ++ [' ', c, ' '] else acc ++ [c]) ""
frequency :: (Ord a) => [a] -> [(a, Int)]
frequency = map (head &&& length) . group . sort
argmax :: (Foldable t, Num a, Fractional a, Ord a) => t a -> Int
argmax v = snd $ foldl mx ((-1/0), 0) v
where
mx (a, i) b
| b > a = (b, i + 1)
| otherwise = (a, i)
shape :: Matrix a -> (Int, Int)
shape x = (rows x, cols x)

View File

@ -1,8 +0,0 @@
module Main where
import System.Exit (exitFailure)
import Sibe
main = do
putStrLn "Hey"
exitFailure