feat(notmnist): notmnist example using SGD + learning rate decay

Mahdi Dibaiee, 2016-09-10 00:36:15 +04:30
parent ace0a18653
commit b26347e19f
21 changed files with 619 additions and 320 deletions

.gitmodules (vendored)

@@ -1,3 +1,6 @@
[submodule "examples/doc-classifier-data"] [submodule "examples/doc-classifier-data"]
path = examples/doc-classifier-data path = examples/doc-classifier-data
url = git@github.com:mdibaiee/doc-classifier-data url = git@github.com:mdibaiee/doc-classifier-data
[submodule "examples/notMNIST"]
path = examples/notMNIST
url = git@github.com:mdibaiee/notMNIST

examples/424encoder.hs (new file)

@@ -0,0 +1,49 @@
module Main where

import Sibe
import Numeric.LinearAlgebra
import Data.List
import Debug.Trace
import Data.Default.Class

main :: IO ()
main = do
  let alpha = 0.5
      epochs = 1000
      a = (sigmoid, sigmoid')
      rnetwork = randomNetwork 0 (-0.1, 0.1) 4 [(2, a)] (4, a)

      inputs = [vector [1, 0, 0, 0],
                vector [0, 1, 0, 0],
                vector [0, 0, 1, 0],
                vector [0, 0, 0, 1]]
      labels = [vector [1, 0, 0, 0],
                vector [0, 1, 0, 0],
                vector [0, 0, 1, 0],
                vector [0, 0, 0, 1]]

      session = def { network      = rnetwork
                    , learningRate = alpha
                    , epochs       = epochs
                    , training     = zip inputs labels
                    , test         = zip inputs labels
                    } :: Session

      initialCost = crossEntropy session

  newsession <- run gd session

  let results = map (`forward` newsession) inputs
      rounded = map (map round . toList) results
      cost = crossEntropy newsession

  putStrLn "parameters: "
  putStrLn $ "- inputs: " ++ show inputs
  putStrLn $ "- labels: " ++ show labels
  putStrLn $ "- learning rate: " ++ show alpha
  putStrLn $ "- epochs: " ++ show epochs
  putStrLn $ "- initial cost (cross-entropy): " ++ show initialCost
  putStrLn "results: "
  putStrLn $ "- actual result: " ++ show results
  putStrLn $ "- rounded result: " ++ show rounded
  putStrLn $ "- cost (cross-entropy): " ++ show cost

examples/naivebayes-doc-classifier.hs

@@ -1,6 +1,7 @@
 module Main
   where
     -- import Sibe
+    import Sibe.NLP
     import Sibe.NaiveBayes
     import Text.Printf
     import Data.List
@@ -28,14 +29,14 @@ module Main
       documents = cleanDocuments . removeWords sws $ createDocuments classes dataset
       testDocuments = cleanDocuments $ createDocuments classes test

-      nb = train documents intClasses
+      nb = initialize documents intClasses

       -- top-ten
       topClasses = take 10 . reverse $ sortBy (compare `on` (length . snd)) (cd nb)
       filtered = map (\(c, ds) -> (c, take 100 ds)) topClasses
       filteredClasses = map fst filtered
       ttDocs = concatMap snd filtered
-      ttNB = train ttDocs filteredClasses
+      ttNB = initialize ttDocs filteredClasses
       ttTestDocuments = filter ((`elem` filteredClasses) . c) . cleanDocuments $ createDocuments classes test

examples/naivebayes-sentiment-analysis.hs (deleted)

@@ -1,54 +0,0 @@
{-# LANGUAGE BangPatterns #-}

module Main
  where

-- import Sibe
import Sibe.NaiveBayes
import Text.Printf
import Data.List
import Data.Maybe
import Debug.Trace
import Data.List.Split
import System.Directory
import Control.DeepSeq
import System.IO

main = do
  putStr "Reading documents... "
  neg_documents <- createDocuments "examples/sentiment-analysis-data/train/neg/"
  pos_documents <- createDocuments "examples/sentiment-analysis-data/train/pos/"
  test_neg_documents <- createDocuments "examples/sentiment-analysis-data/test/neg/"
  test_pos_documents <- createDocuments "examples/sentiment-analysis-data/test/pos/"
  putStrLn "done"

  let classes = [0..9] -- rating, from 0 to 9 (1 to 10)
      documents = neg_documents ++ pos_documents
      nb = train documents classes

      testDocuments = neg_documents ++ pos_documents
      results = map (\(Document text c) -> (c, run text nb)) testDocuments
      -- results = map (\(Document text c) -> (c, determine text nb intClasses documents)) devTestDocuments

  print results
  -- let showResults (c, r) = putStrLn (show (classes !! c) ++ " ~ " ++ show (classes !! r))
  -- mapM_ showResults results
  --
  -- putStrLn $ "Recall: " ++ show (recall results)
  -- putStrLn $ "Precision: " ++ show (precision results)
  -- putStrLn $ "F Measure: " ++ show (fmeasure results)
  -- putStrLn $ "Accuracy: " ++ show (accuracy results)

createDocuments :: FilePath -> IO [Document]
createDocuments path = do
  files <- drop 2 <$> getDirectoryContents path
  let ratings = map (subtract 1 . read . take 1 . last . splitOn "_") files :: [Int]
  contents <- mapM (forceReadFile . (path ++)) files
  return $ zipWith Document contents ratings

forceReadFile :: FilePath -> IO String
forceReadFile file = do
  handle <- openFile file ReadMode
  content <- hGetContents handle
  content `deepseq` hClose handle
  return content

examples/notMNIST (new submodule)

@@ -0,0 +1 @@
Subproject commit 0dbdfd43ffb8e90a3657ed040fd1fb3d25654b51

examples/notmnist.hs (new file)

@@ -0,0 +1,111 @@
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}

module Main where

import Sibe
import Numeric.LinearAlgebra
import Data.List
import Debug.Trace
import System.IO
import System.Directory
import Codec.Picture
import Codec.Picture.Types
import qualified Data.Vector.Storable as V
import Data.Either
import System.Random
import System.Random.Shuffle
import Data.Default.Class
import qualified Graphics.Rendering.Chart.Easy as Chart
import Graphics.Rendering.Chart.Backend.Cairo

main :: IO ()
main = do
  setStdGen (mkStdGen 100)

  let a = (sigmoid, sigmoid')
      o = (softmax, one) -- softmax output pair, defined but unused below
      rnetwork = randomNetwork 0 (-1, 1) (28*28) [(100, a)] (10, a)

  (inputs, labels) <- dataset

  let trp = length inputs * 70 `div` 100
      tep = length inputs * 30 `div` 100

      -- training data
      trinputs = take trp inputs
      trlabels = take trp labels

      -- test data
      teinputs = take tep . drop trp $ inputs
      telabels = take tep . drop trp $ labels

  let session = def { learningRate = 0.5
                    , batchSize    = 32
                    , epochs       = 35
                    , network      = rnetwork
                    , training     = zip trinputs trlabels
                    , test         = zip teinputs telabels
                    } :: Session

  let initialCost = crossEntropy session

  newsession <- run (sgd . learningRateDecay (1.1, 5e-2)) session

  let el = map (\(e, l, _) -> (e, l)) (chart newsession)
      ea = map (\(e, _, a) -> (e, a)) (chart newsession)

  toFile Chart.def "notmnist.png" $ do
    Chart.layoutlr_title Chart..= "loss over time"
    Chart.plotLeft (Chart.line "loss" [el])
    Chart.plotRight (Chart.line "learningRate" [ea])

  let cost = crossEntropy newsession

  putStrLn "parameters: "
  putStrLn $ "- batch size: " ++ show (batchSize session)
  putStrLn $ "- learning rate: " ++ show (learningRate session)
  putStrLn $ "- epochs: " ++ show (epochs session)
  putStrLn $ "- initial cost (cross-entropy): " ++ show initialCost
  putStrLn "results: "
  putStrLn $ "- accuracy: " ++ show (accuracy newsession)
  putStrLn $ "- cost (cross-entropy): " ++ show cost

dataset :: IO ([Vector Double], [Vector Double])
dataset = do
  let dir = "examples/notMNIST/"

  groups <- filter ((/= '.') . head) <$> listDirectory dir
  inputFiles <- mapM (listDirectory . (dir ++)) groups

  let n = 512 {- minimum (map length inputFiles) -}
      numbers = map (`div` n) [0..n * length groups - 1]
      inputFilesFull = map (\(i, g) -> map ((dir ++ i ++ "/") ++) g) (zip groups inputFiles)

  inputImages <- mapM (mapM readImage . take n) inputFilesFull

  let names = map (take n) inputFilesFull

  let (l, r) = partitionEithers $ concat inputImages
      inputs = map (fromPixels . convertRGB8) r
      labels = map (\i -> V.replicate i 0 `V.snoc` 1 V.++ V.replicate (9 - i) 0) numbers

      pairs = zip inputs labels

  shuffled <- shuffleM pairs
  return (map fst shuffled, map snd shuffled)

  where
    fromPixels :: Image PixelRGB8 -> Vector Double
    fromPixels img@Image { .. } =
      let pairs = [(x, y) | x <- [0..imageWidth - 1], y <- [0..imageHeight - 1]]
      in V.fromList $ map iter pairs
      where
        iter (x, y) =
          let (PixelRGB8 r g b) = convertPixel $ pixelAt img x y
          in if r == 0 && g == 0 && b == 0 then 0 else 1
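
Note: learningRateDecay composes with sgd by plain function composition, since run only needs a step of type Session -> IO Session. With (1.1, 5e-2) the learning rate is divided by 1.1 after every epoch and floored at 0.05, so starting from 0.5 it reaches the floor after roughly 25 epochs. Spelled out (hypothetical name epochStep):

    -- decay first (pure), then one SGD epoch over the shuffled batches
    epochStep :: Session -> IO Session
    epochStep = sgd . learningRateDecay (1.1, 5e-2)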

examples/sentiment-analysis-data (deleted symlink)

@@ -1,5 +0,0 @@
XSym
0040
3666c4cacaf995ebd11ef25aab70de99
../../sibe-repos/sentiment-analysis-data

examples/sin.hs (new file)

@@ -0,0 +1,35 @@
module Main where

import Sibe
import Numeric.LinearAlgebra
import Data.List
import Debug.Trace
import Data.Default.Class

main :: IO ()
main = do
  let alpha = 0.5
      epochs = 1000
      a = (sigmoid, sigmoid')
      lo = (sigmoid, (\_ -> 1)) -- constant derivative, paired with cross-entropy
      -- a = (relu, relu')
      rnetwork = randomNetwork 0 (-1, 1) 1 [(50, a)] (1, lo)

      inputs = map (\deg -> vector [deg]) (reverse [0, 30, 45, 60, 90])
      labels = map (\deg -> vector [sin $ deg * pi / 180]) (reverse [0, 30, 45, 60, 90])

      session = def { network      = rnetwork
                    , learningRate = alpha
                    , epochs       = epochs
                    , training     = zip inputs labels
                    , test         = zip inputs labels
                    } :: Session

      initialCost = crossEntropy session

  newsession <- run gd session

  let results = map (`forward` newsession) inputs
      cost = crossEntropy newsession

  putStrLn "parameters: "
  putStrLn $ "- inputs: " ++ show inputs
  putStrLn $ "- labels: " ++ show labels
  putStrLn $ "- learning rate: " ++ show alpha
  putStrLn $ "- epochs: " ++ show epochs
  putStrLn $ "- initial cost (cross-entropy): " ++ show initialCost
  putStrLn "results: "
  putStrLn $ "- actual result: " ++ show results
  putStrLn $ "- cost (cross-entropy): " ++ show cost

examples/xor.hs

@@ -3,30 +3,37 @@ module Main where
 import Numeric.LinearAlgebra
 import Data.List
 import Debug.Trace
+import Data.Default.Class

 main = do
-  let learning_rate = 0.5
-      (iterations, epochs) = (2, 1000)
-      a = (sigmoid, sigmoid')
-      rnetwork = randomNetwork 0 2 [(8, a)] (1, a) -- two inputs, 8 nodes in a single hidden layer, 1 output
+  let a = (sigmoid, sigmoid')
+      rnetwork = randomNetwork 0 (-1, 1) 2 [(2, a)] (1, a) -- two inputs, 2 nodes in a single hidden layer, 1 output

       inputs = [vector [0, 1], vector [1, 0], vector [1, 1], vector [0, 0]]
       labels = [vector [1], vector [1], vector [0], vector [0]]

-      initial_cost = zipWith crossEntropy (map (`forward` rnetwork) inputs) labels
+      session = def { network      = rnetwork
+                    , learningRate = 0.5
+                    , epochs       = 1000
+                    , training     = zip inputs labels
+                    , test         = zip inputs labels
+                    } :: Session

-      network = session inputs rnetwork labels learning_rate (iterations, epochs)
-      results = map (`forward` network) inputs
+      initialCost = crossEntropy session
+
+  newsession <- run gd session
+
+  let results = map (`forward` newsession) inputs
       rounded = map (map round . toList) results

-      cost = zipWith crossEntropy (map (`forward` network) inputs) labels
+      cost = crossEntropy newsession

   putStrLn "parameters: "
   putStrLn $ "- inputs: " ++ show inputs
   putStrLn $ "- labels: " ++ show labels
-  putStrLn $ "- learning rate: " ++ show learning_rate
-  putStrLn $ "- iterations/epochs: " ++ show (iterations, epochs)
-  putStrLn $ "- initial cost (cross-entropy): " ++ show initial_cost
+  putStrLn $ "- learning rate: " ++ show (learningRate session)
+  putStrLn $ "- epochs: " ++ show (epochs session)
+  putStrLn $ "- initial cost (cross-entropy): " ++ show initialCost
   putStrLn "results: "
   putStrLn $ "- actual result: " ++ show results
   putStrLn $ "- rounded result: " ++ show rounded

notmnist-0.png (new binary file, 33 KiB)

notmnist-1.png (new binary file, 26 KiB)

notmnist-2.png (new binary file, 25 KiB)

notmnist.png (new binary file, 24 KiB)

sgd.png (new binary file, 24 KiB)

sibe.cabal

@@ -15,7 +15,7 @@ cabal-version: >=1.10
 library
   hs-source-dirs:      src
-  exposed-modules:     Sibe, Sibe.NaiveBayes
+  exposed-modules:     Sibe, Sibe.NaiveBayes, Sibe.NLP
   build-depends:       base >= 4.7 && < 5
                      , hmatrix
                      , random
@@ -26,16 +26,21 @@ library
                      , regex-pcre
                      , text
                      , stemmer
+                     , vector
+                     , random-shuffle
+                     , data-default-class
+                     , Chart
+                     , Chart-cairo
   default-language:    Haskell2010

-executable sibe-exe
-  hs-source-dirs:      app
-  main-is:             Main.hs
-  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
-  build-depends:       base
-                     , sibe
-                     , hmatrix
-  default-language:    Haskell2010
+--executable sibe-exe
+  --hs-source-dirs:      app
+  --main-is:             Main.hs
+  --ghc-options:         -threaded -rtsopts -with-rtsopts=-N
+  --build-depends:       base
+                       --, sibe
+                       --, hmatrix
+  --default-language:    Haskell2010

 executable example-xor
   hs-source-dirs:      examples
@@ -44,6 +49,43 @@ executable example-xor
   build-depends:       base
                      , sibe
                      , hmatrix
+                     , data-default-class
+  default-language:    Haskell2010
+
+--executable example-sin
+  --hs-source-dirs:      examples
+  --main-is:             sin.hs
+  --ghc-options:         -threaded -rtsopts -with-rtsopts=-N
+  --build-depends:       base
+                       --, sibe
+                       --, hmatrix
+  --default-language:    Haskell2010
+
+executable example-424
+  hs-source-dirs:      examples
+  main-is:             424encoder.hs
+  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
+  build-depends:       base
+                     , sibe
+                     , hmatrix
+                     , data-default-class
+  default-language:    Haskell2010
+
+executable example-notmnist
+  hs-source-dirs:      examples
+  main-is:             notmnist.hs
+  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
+  build-depends:       base
+                     , sibe
+                     , hmatrix
+                     , directory >= 1.2.5.0
+                     , JuicyPixels == 3.2.7.2
+                     , vector == 0.11.0.0
+                     , random
+                     , random-shuffle
+                     , data-default-class
+                     , Chart
+                     , Chart-cairo
   default-language:    Haskell2010

 executable example-naivebayes-doc-classifier
@@ -57,19 +99,6 @@ executable example-naivebayes-doc-classifier
                      , split
   default-language:    Haskell2010

-executable example-naivebayes-sentiment-analysis
-  hs-source-dirs:      examples
-  main-is:             naivebayes-sentiment-analysis.hs
-  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
-  build-depends:       base
-                     , sibe
-                     , hmatrix
-                     , containers
-                     , split
-                     , directory
-                     , deepseq
-  default-language:    Haskell2010
-
 test-suite sibe-test
   type:                exitcode-stdio-1.0
   hs-source-dirs:      test
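
Note: with these targets in place, building and running the new example is the usual Stack workflow (assuming the notMNIST submodule has been fetched first):

    git submodule update --init
    stack build
    stack exec example-notmnist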

src/Sibe.hs

@@ -15,22 +15,37 @@ module Sibe
     saveNetwork,
     loadNetwork,
     train,
-    session,
-    shuffle,
+    gd,
+    sgd,
+    run,
     sigmoid,
     sigmoid',
+    softmax,
+    softmax',
+    one,
     relu,
     relu',
     crossEntropy,
     genSeed,
-    replaceVector
+    replaceVector,
+    Session(..),
+    accuracy,
+    learningRateDecay
    ) where

   import Numeric.LinearAlgebra
   import System.Random
+  import System.Random.Shuffle
   import Debug.Trace
-  import Data.List (foldl', sortBy)
+  import Data.List (foldl', sortBy, genericLength, permutations)
   import System.IO
   import Control.DeepSeq
+  import Control.Monad
+  import qualified Data.Vector.Storable as V
+  import Data.Default.Class
+  import qualified Graphics.Rendering.Chart.Easy as Chart
+  import Graphics.Rendering.Chart.Backend.Cairo

   type LearningRate = Double
   type Input = Vector Double
@@ -48,8 +63,33 @@ module Sibe
   data Network = O Layer
                | Layer :- Network
                deriving (Show)

   infixr 5 :-

+  data Session = Session { network      :: Network
+                         , training     :: [(Vector Double, Vector Double)]
+                         , test         :: [(Vector Double, Vector Double)]
+                         , learningRate :: Double
+                         , epochs       :: Int
+                         , epoch        :: Int
+                         , batchSize    :: Int
+                         , chart        :: [(Int, Double, Double)]
+                         , momentum     :: Double
+                         }
+
+  emptyNetwork = randomNetwork 0 (0, 0) 0 [] (0, (id, id))
+
+  instance Default Session where
+    def = Session { network      = error "You have not specified a network parameter"
+                  , training     = error "You have not specified training data"
+                  , test         = error "You have not specified test data"
+                  , learningRate = 0.5
+                  , epochs       = 35
+                  , epoch        = 0
+                  , batchSize    = 0
+                  , chart        = []
+                  , momentum     = 0
+                  }
+
   saveNetwork :: Network -> String -> IO ()
   saveNetwork network file =
     writeFile file ((show . reverse) (gen network []))
@@ -73,22 +113,24 @@ module Sibe
   runLayer :: Input -> Layer -> Output
   runLayer input (Layer !biases !weights _) = input <# weights + biases

-  forward :: Input -> Network -> Output
-  forward input (O l@(Layer _ _ (fn, _))) = fn $ runLayer input l
-  forward input (l@(Layer _ _ (fn, _)) :- n) = forward ((fst . activation $ l) $ runLayer input l) n
+  forward :: Input -> Session -> Output
+  forward input session = compute input (network session)
+    where
+      compute input (O l@(Layer _ _ (fn, _))) = fn $ runLayer input l
+      compute input (l@(Layer _ _ (fn, _)) :- n) = compute ((fst . activation $ l) $ runLayer input l) n

-  randomLayer :: Seed -> (Int, Int) -> Activation -> Layer
-  randomLayer seed (wr, wc) =
-    let weights = uniformSample seed wr $ replicate wc (-1, 1)
-        biases = randomVector seed Uniform wc * 2 - 1
+  randomLayer :: Seed -> (Int, Int) -> (Double, Double) -> Activation -> Layer
+  randomLayer seed (wr, wc) (l, u) =
+    let weights = uniformSample seed wr $ replicate wc (l, u)
+        biases = randomVector seed Uniform wc * realToFrac (u - l) + realToFrac l
     in Layer biases weights

-  randomNetwork :: Seed -> Int -> [(Int, Activation)] -> (Int, Activation) -> Network
-  randomNetwork seed input [] (output, a) =
-    O $ randomLayer seed (input, output) a
-  randomNetwork seed input ((h, a):hs) output =
-    randomLayer seed (input, h) a :-
-    randomNetwork (seed + 1) h hs output
+  randomNetwork :: Seed -> (Double, Double) -> Int -> [(Int, Activation)] -> (Int, Activation) -> Network
+  randomNetwork seed bound input [] (output, a) =
+    O $ randomLayer seed (input, output) bound a
+  randomNetwork seed bound input ((h, a):hs) output =
+    randomLayer seed (input, h) bound a :-
+    randomNetwork (seed + 1) bound h hs output

   sigmoid :: Vector Double -> Vector Double
   sigmoid x = 1 / max (1 + exp (-x)) 1e-10
@@ -96,18 +138,37 @@ module Sibe
   sigmoid' :: Vector Double -> Vector Double
   sigmoid' x = sigmoid x * (1 - sigmoid x)

+  softmax :: Vector Double -> Vector Double
+  softmax x = cmap (\a -> exp a / s) x
+    where
+      s = V.sum $ exp x
+
+  -- constantly 1; a stand-in derivative for cross-entropy output layers
+  one :: a -> Double
+  one x = 1
+
+  softmax' :: Vector Double -> Vector Double
+  softmax' x = softmax x * (1 - softmax x)
+
   relu :: Vector Double -> Vector Double
-  relu x = log (max (1 + exp x) 1e-10)
+  relu = cmap (max 0)

   relu' :: Vector Double -> Vector Double
-  relu' = sigmoid
+  relu' = cmap dev
+    where dev x
+            | x < 0 = 0
+            | otherwise = 1

-  crossEntropy :: Output -> Output -> Double
-  crossEntropy output target =
-    let pairs = zip (toList output) (toList target)
-        n = fromIntegral (length pairs)
-    in (-1 / n) * sum (map f pairs)
+  -- mean cross-entropy of the network's output over the test set
+  crossEntropy :: Session -> Double
+  crossEntropy session =
+    let inputs = map fst (test session)
+        labels = map (toList . snd) (test session)
+        outputs = map (toList . (`forward` session)) inputs
+        pairs = zip outputs labels
+        n = genericLength pairs
+    in sum (map set pairs) / n
     where
+      set (os, ls) = (-1 / genericLength os) * sum (zipWith (curry f) os ls)
       f (a, y) = y * log (max 1e-10 a) + (1 - y) * log (max (1 - a) 1e-10)

   train :: Input
@@ -138,35 +199,137 @@ module Sibe
           o = fn y
           (n', delta) = run o n

-          de = delta * fn' y -- quadratic cost
+          de = delta * fn' y

-          biases' = biases - scale alpha de
-          weights' = weights - scale alpha (input `outer` de)
+          biases' = biases - cmap (*alpha) de
+          weights' = weights - cmap (*alpha) (input `outer` de)
           layer = Layer biases' weights' (fn, fn')

           pass = weights #> de
           -- pass = weights #> de
         in (layer :- n', pass)

-  session :: [Input] -> Network -> [Output] -> Double -> (Int, Int) -> Network
-  session inputs network labels alpha (iterations, epochs) =
-    let n = length inputs
-        indexes = shuffle n (map (`mod` n) [0..n * epochs])
-    in foldl' iter network indexes
-    where
-      iter net i =
-        let n = length inputs
-            index = i `mod` n
-            input = inputs !! index
-            label = labels !! index
-        in foldl' (\net _ -> train input net label alpha) net [0..iterations]
-
-  shuffle :: Seed -> [a] -> [a]
-  shuffle seed list =
-    let ords = map ord $ take (length list) (randomRs (0, 1) (mkStdGen seed) :: [Int])
-    in map snd $ sortBy (\x y -> fst x) (zip ords list)
-    where ord x | x == 0 = LT
-                | x == 1 = GT
+  {-trainMomentum :: Input
+                  -> Network
+                  -> Output -- target
+                  -> Double -- learning rate
+                  -> (Double, Double) -- momentum
+                  -> Network -- network's output
+  trainMomentum input network target alpha (m, v) = fst $ run input network
+    where
+      run :: Input -> Network -> (Network, Vector Double)
+      run input (O l@(Layer biases weights (fn, fn'))) =
+        let y = runLayer input l
+            o = fn y
+            delta = o - target
+            de = delta * fn' y
+            v =
+            -- de = delta -- cross entropy cost
+            biases' = biases - scale alpha de
+            weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
+            layer = Layer biases' weights' (fn, fn') -- updated layer
+            pass = weights #> de
+            -- pass = weights #> de
+        in (O layer, pass)
+      run input (l@(Layer biases weights (fn, fn')) :- n) =
+        let y = runLayer input l
+            o = fn y
+            (n', delta) = run o n
+            de = delta * fn' y
+            biases' = biases - cmap (*alpha) de
+            weights' = weights - cmap (*alpha) (input `outer` de)
+            layer = Layer biases' weights' (fn, fn')
+            pass = weights #> de
+            -- pass = weights #> de
+        in (layer :- n', pass)-}
+
+  -- one full pass of plain gradient descent over the shuffled training set
+  gd :: Session -> IO Session
+  gd session = do
+    seed <- newStdGen
+
+    let pairs = training session
+        alpha = learningRate session
+        net   = network session
+
+    shuffled <- shuffleM pairs
+
+    let newnet = foldl' (\n (input, label) -> train input n label alpha) net shuffled
+
+    return session { network = newnet
+                   , epoch   = epoch session + 1
+                   }
+
+  -- one epoch of mini-batch stochastic gradient descent
+  sgd :: Session -> IO Session
+  sgd session = do
+    seed <- newStdGen
+
+    let pairs = training session
+        bsize = batchSize session
+        alpha = learningRate session
+        net   = network session
+
+        n = length pairs
+        iterations = n `div` bsize - 1
+
+    shuffled <- shuffleM pairs
+
+    let iter net i =
+          let batch = take bsize . drop (i * bsize) $ shuffled
+          in foldl' (\n (input, label) -> train input n label alpha) net batch
+
+    let newnet = foldl' iter net [0..iterations]
+        cost = crossEntropy (session { network = newnet })
+
+    let el = map (\(e, l, _) -> (e, l)) (chart session)
+        ea = map (\(e, _, a) -> (e, a)) (chart session)
+
+    putStrLn $ show (epoch session) ++ " => " ++ show cost ++ " @ " ++ show (learningRate session)
+
+    toFile Chart.def "sgd.png" $ do
+      Chart.layoutlr_title Chart..= "loss over time"
+      Chart.plotLeft (Chart.line "loss" [el])
+      Chart.plotRight (Chart.line "learningRate" [ea])
+
+    return session { network = newnet
+                   , epoch   = epoch session + 1
+                   , chart   = (epoch session, cost, learningRate session):chart session
+                   }
+
+  -- fraction of test samples whose rounded output matches the label
+  accuracy :: Session -> Double
+  accuracy session =
+    let inputs  = map fst (test session)
+        labels  = map snd (test session)
+        results = map (`forward` session) inputs
+        rounded = map (map round . toList) results
+        equals  = zipWith (==) rounded (map (map round . toList) labels)
+    in genericLength (filter (== True) equals) / genericLength inputs
+
+  -- divide the learning rate by step each epoch, never going below m
+  learningRateDecay :: (Double, Double) -> Session -> Session
+  learningRateDecay (step, m) session =
+    session { learningRate = max m $ learningRate session / step }
+
+  -- iterate a step function (e.g. gd or sgd) for the configured number of epochs
+  run :: (Session -> IO Session)
+      -> Session -> IO Session
+  run fn session = foldM (\s _ -> fn s) session [1..epochs session]
+
+  factorial :: Int -> Int
+  factorial 0 = 1
+  factorial x = x * factorial (x - 1)

   genSeed :: IO Seed
   genSeed = do
@@ -176,12 +339,7 @@ module Sibe
   replaceVector :: Vector Double -> Int -> Double -> Vector Double
   replaceVector vec index value =
     let list = toList vec
-    in fromList $ rrow index list
-    where
-      rrow index [] = []
-      rrow index (x:xs)
-        | index == index = value:xs
-        | otherwise = x : rrow (index + 1) xs
+    in fromList $ take index list ++ value : drop (index + 1) list

   clip :: Double -> (Double, Double) -> Double
   clip x (l, u) = min u (max l x)
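
Note: softmax as defined above exponentiates raw activations, which overflows for large inputs. A numerically safer variant (a sketch, not the library's definition) subtracts the maximum first; the probabilities are unchanged because the shift cancels in the ratio:

    softmaxStable :: Vector Double -> Vector Double
    softmaxStable x = cmap (/ s) e
      where
        e = cmap (\a -> exp (a - V.maximum x)) x
        s = V.sum e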

src/Sibe/NLP.hs (new file)

@@ -0,0 +1,129 @@
module Sibe.NLP
  (Class,
   Document(..),
   ordNub,
   accuracy,
   recall,
   precision,
   fmeasure,
   cleanText,
   cleanDocuments,
   removeWords,
   removeStopwords,
   ngram,
   ngramText,
  )
  where

import Data.List
import Debug.Trace
import qualified Data.Set as Set
import Data.List.Split
import Data.Maybe
import Control.Arrow ((&&&))
import Text.Regex.PCRE
import Data.Char (isSpace, isNumber, toLower)
import NLP.Stemmer

type Class = Int

data Document = Document { text :: String
                         , c    :: Class
                         } deriving (Eq, Show, Read)

cleanText :: String -> String
cleanText string =
  let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
      spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
      stemmed = unwords $ map (stem Porter) (words spacify)
      nonumber = filter (not . isNumber) stemmed
      lower = map toLower nonumber
  in (unwords . words) lower -- remove unnecessary spaces
  where
    trim = f . f
      where
        f = reverse . dropWhile isSpace
    replace needle replacement =
      map (\c -> if c == needle then replacement else c)

cleanDocuments :: [Document] -> [Document]
cleanDocuments documents =
  let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
  in cleaned

removeWords :: [String] -> [Document] -> [Document]
removeWords ws documents =
  map (\(Document text c) -> Document (rm ws text) c) documents
  where
    rm list text =
      unwords $ filter (`notElem` list) (words text)

removeStopwords :: Int -> [Document] -> [Document]
removeStopwords i documents =
  let wc = wordCounts (concatDocs documents)
      wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
      stopwords = map fst (take i wlist)
  in removeWords stopwords documents
  where
    vocabulary x = ordNub (words x)
    countWordInDoc d w = genericLength (filter (== w) d)
    wordCounts x =
      let voc = vocabulary x
      in zip voc $ map (countWordInDoc (words x)) voc
    concatDocs = concatMap (\(Document text _) -> text ++ " ")

ordNub :: (Ord a) => [a] -> [a]
ordNub = go Set.empty
  where
    go _ [] = []
    go s (x:xs) = if x `Set.member` s then go s xs
                  else x : go (Set.insert x s) xs

accuracy :: [(Int, (Int, Double))] -> Double
accuracy results =
  let pairs = map (\(a, b) -> (a, fst b)) results
      correct = filter (uncurry (==)) pairs
  in genericLength correct / genericLength results

recall :: [(Int, (Int, Double))] -> Double
recall results =
  let classes = ordNub (map fst results)
      s = sum (map rec classes) / genericLength classes
  in s
  where
    rec a =
      let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
          y = genericLength $ filter (\(c, (r, _)) -> c == a) results
      in t / y

precision :: [(Int, (Int, Double))] -> Double
precision results =
  let classes = ordNub (map fst results)
      s = sum (map prec classes) / genericLength classes
  in s
  where
    prec a =
      let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
          y = genericLength $ filter (\(c, (r, _)) -> r == a) results
      in if y == 0
           then 0
           else t / y

fmeasure :: [(Int, (Int, Double))] -> Double
fmeasure results =
  let r = recall results
      p = precision results
  in (2 * p * r) / (p + r)

ngram :: Int -> [Document] -> [Document]
ngram n documents =
  map (\(Document text c) -> Document (ngramText n text) c) documents

ngramText :: Int -> String -> String
ngramText n text =
  let ws = words text
      pairs = zip [0..] ws
      grams = map (\(i, w) -> intercalate "_" $ w : (take (n - 1) . drop (i + 1)) ws) pairs
  in unwords ("<b>_" : grams)
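
As a quick illustration of ngramText for n = 2:

    -- ngramText 2 "the quick brown fox"
    --   == "<b>_ the_quick quick_brown brown_fox fox"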

src/Sibe/NaiveBayes.hs

@@ -1,7 +1,7 @@
 module Sibe.NaiveBayes
   (Document(..),
    NB(..),
-   train,
+   initialize,
    run,
    session,
    ordNub,
@@ -19,21 +19,13 @@ module Sibe.NaiveBayes
    removeStopwords,
   )
   where
+  import Sibe.NLP
   import Data.List
   import Debug.Trace
   import qualified Data.Set as Set
   import Data.List.Split
   import Data.Maybe
   import Control.Arrow ((&&&))
-  import Text.Regex.PCRE
-  import Data.Char (isSpace, isNumber, toLower)
-  import NLP.Stemmer
-
-  type Class = Int;
-  data Document = Document { text :: String
-                           , c :: Class
-                           } deriving (Eq, Show, Read)

   data NB = NB { documents :: [Document]
                , classes :: [(Class, Double)]
@@ -44,8 +36,8 @@ module Sibe.NaiveBayes
                , cgram :: [(Class, [(String, Int)])]
                } deriving (Eq, Show, Read)

-  train :: [Document] -> [Class] -> NB
-  train documents classes =
+  initialize :: [Document] -> [Class] -> NB
+  initialize documents classes =
     let megadoc = concatDocs documents
         vocabulary = genericLength ((ordNub . words) megadoc)
         -- (class, prior probability)
@@ -83,17 +75,6 @@ module Sibe.NaiveBayes
         classWordsCounts x = wordsCount (classWords x) (classVocabulary x)
         classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x)

-  ngram :: Int -> [Document] -> [Document]
-  ngram n documents =
-    map (\(Document text c) -> Document (ngramText n text) c) documents
-
-  ngramText :: Int -> String -> String
-  ngramText n text =
-    let ws = words text
-        pairs = zip [0..] ws
-        grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
-    in unwords ("<b>_":grams)
-
   session :: [Document] -> NB -> [(Class, (Class, Double))]
   session docs nb =
     let results = map (\(Document text c) -> (c, run text nb)) docs
@@ -143,91 +124,5 @@ module Sibe.NaiveBayes
         variance = sum (map ((^2) . subtract avg) x) / (genericLength x - 1)
     in sqrt variance

-  cleanText :: String -> String
-  cleanText string =
-    let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
-        spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
-        stemmed = unwords $ map (stem Porter) (words spacify)
-        nonumber = filter (not . isNumber) stemmed
-        lower = map toLower nonumber
-    in (unwords . words) lower -- remove unnecessary spaces
-    where
-      trim = f . f
-        where
-          f = reverse . dropWhile isSpace
-      replace needle replacement =
-        map (\c -> if c == needle then replacement else c)
-
-  cleanDocuments :: [Document] -> [Document]
-  cleanDocuments documents =
-    let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
-    in cleaned
-
-  removeWords :: [String] -> [Document] -> [Document]
-  removeWords ws documents =
-    map (\(Document text c) -> Document (rm ws text) c) documents
-    where
-      rm list text =
-        unwords $ filter (`notElem` list) (words text)
-
-  removeStopwords :: Int -> [Document] -> [Document]
-  removeStopwords i documents =
-    let wc = wordCounts (concatDocs documents)
-        wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
-        stopwords = map fst (take i wlist)
-    in removeWords stopwords documents
-    where
-      vocabulary x = ordNub (words x)
-      countWordInDoc d w = genericLength (filter (==w) d)
-      wordCounts x =
-        let voc = vocabulary x
-        in zip voc $ map (countWordInDoc (words x)) voc
-      concatDocs = concatMap (\(Document text _) -> text ++ " ")
-
   l :: (Show a) => a -> a
   l a = trace (show a) a

-  ordNub :: (Ord a) => [a] -> [a]
-  ordNub = go Set.empty
-    where
-      go _ [] = []
-      go s (x:xs) = if x `Set.member` s then go s xs
-                    else x : go (Set.insert x s) xs
-
-  accuracy :: [(Int, (Int, Double))] -> Double
-  accuracy results =
-    let pairs = map (\(a, b) -> (a, fst b)) results
-        correct = filter (uncurry (==)) pairs
-    in genericLength correct / genericLength results
-
-  recall :: [(Int, (Int, Double))] -> Double
-  recall results =
-    let classes = ordNub (map fst results)
-        s = sum (map rec classes) / genericLength classes
-    in s
-    where
-      rec a =
-        let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
-            y = genericLength $ filter (\(c, (r, _)) -> c == a) results
-        in t / y
-
-  precision :: [(Int, (Int, Double))] -> Double
-  precision results =
-    let classes = ordNub (map fst results)
-        s = sum (map prec classes) / genericLength classes
-    in s
-    where
-      prec a =
-        let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
-            y = genericLength $ filter (\(c, (r, _)) -> r == a) results
-        in if y == 0
-             then 0
-             else t / y
-
-  fmeasure :: [(Int, (Int, Double))] -> Double
-  fmeasure results =
-    let r = recall results
-        p = precision results
-    in (2 * p * r) / (p + r)
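
Note: the train -> initialize rename presumably avoids a clash with the neural-network train exported by Sibe when both modules are imported together; call sites change accordingly (see examples/naivebayes-doc-classifier.hs above):

    -- before: nb = train documents intClasses
    -- after:  nb = initialize documents intClasses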

stack.yaml

@@ -1,40 +1,5 @@
-# This file was automatically generated by 'stack init'
-#
-# Some commonly used options have been documented as comments in this file.
-# For advanced use and comprehensive documentation of the format, please see:
-# http://docs.haskellstack.org/en/stable/yaml_configuration/
-
-# Resolver to choose a 'specific' stackage snapshot or a compiler version.
-# A snapshot resolver dictates the compiler version and the set of packages
-# to be used for project dependencies. For example:
-#
-# resolver: lts-3.5
-# resolver: nightly-2015-09-21
-# resolver: ghc-7.10.2
-# resolver: ghcjs-0.1.0_ghc-7.10.2
-# resolver:
-#  name: custom-snapshot
-#  location: "./custom-snapshot.yaml"
 resolver: lts-6.7
-
-# User packages to be built.
-# Various formats can be used as shown in the example below.
-#
-# packages:
-# - some-directory
-# - https://example.com/foo/bar/baz-0.0.2.tar.gz
-# - location:
-#    git: https://github.com/commercialhaskell/stack.git
-#    commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a
-# - location: https://github.com/commercialhaskell/stack/commit/e7b331f14bcffb8367cd58fbfc8b40ec7642100a
-#   extra-dep: true
-#  subdirs:
-#  - auto-update
-#  - wai
-#
-# A package marked 'extra-dep: true' will only be built if demanded by a
-# non-dependency (i.e. a user package), and its test suites and benchmarks
-# will not be run. This is useful for tweaking upstream packages.
 packages:
 - location:
     git: git@github.com:albertoruiz/hmatrix.git
@@ -42,36 +7,11 @@ packages:
   subdirs:
   - packages/base
 - .
-- http://hackage.haskell.org/package/containers-0.5.7.1/containers-0.5.7.1.tar.gz
-- http://hackage.haskell.org/package/text-1.2.2.1/text-1.2.2.1.tar.gz
-- http://hackage.haskell.org/package/stemmer-0.5.2/stemmer-0.5.2.tar.gz

-# Dependency packages to be pulled from upstream that are not in the resolver
-# (e.g., acme-missiles-0.3)
-extra-deps: []
+extra-deps:
+- directory-1.2.7.0
+- text-1.2.2.1
+- stemmer-0.5.2
+- containers-0.5.7.1
+- Chart-1.8
+- Chart-cairo-1.8

-# Override default flag values for local packages and extra-deps
-flags: {}
-
-# Extra package databases containing global packages
-extra-package-dbs: []
-
-# Control whether we use the GHC we find on the path
-# system-ghc: true
-#
-# Require a specific version of stack, using version ranges
-# require-stack-version: -any # Default
-# require-stack-version: ">=1.1"
-#
-# Override the architecture used by stack, especially useful on Windows
-# arch: i386
-# arch: x86_64
-#
-# Extra directories used by stack for building
-# extra-include-dirs: [/path/to/dir]
-# extra-lib-dirs: [/path/to/dir]
-#
-# Allow a newer minor version of GHC than the snapshot specifies
-# compiler-check: newer-minor
+system-ghc: false