feat(Numeric): move all modules to Numeric

Mahdi Dibaiee
2016-10-17 01:54:35 +03:30
parent 506b180498
commit ed6d2b3021
16 changed files with 20 additions and 65 deletions

src/Numeric/Sibe.hs Normal file (362 additions)
@@ -0,0 +1,362 @@
{-# LANGUAGE GADTs #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TypeOperators #-}
module Numeric.Sibe
(Network(..),
Layer(..),
Input,
Output,
Activation,
forward,
forward',
runLayer,
runLayer',
randomLayer,
randomNetwork,
buildNetwork,
saveNetwork,
loadNetwork,
train,
gd,
sgd,
run,
sigmoid,
sigmoid',
softmax,
softmax',
relu,
relu',
crossEntropy,
crossEntropy',
genSeed,
replaceVector,
Session(..),
accuracy,
learningRateDecay,
ignoreBiases,
one
) where
import Numeric.LinearAlgebra
import System.Random
import System.Random.Shuffle
import Debug.Trace
import Data.List (foldl', sortBy, genericLength, permutations)
import System.IO
import Control.DeepSeq
import Control.Monad
import qualified Data.Vector.Storable as V
import Data.Default.Class
import System.Exit
import qualified Graphics.Rendering.Chart.Easy as Chart
import Graphics.Rendering.Chart.Backend.Cairo
type LearningRate = Double
type Input = Vector Double
type Output = Vector Double
type Activation = (Vector Double -> Vector Double, Vector Double -> Vector Double)
data Layer = Layer { biases :: !(Vector Double)
, nodes :: !(Matrix Double)
, activation :: Activation
}
instance Show Layer where
show (Layer biases nodes _) = "(" ++ show biases ++ "," ++ show nodes ++ ")"
data Network = O Layer
| Layer :- Network
instance Show Network where
show (Layer biases nodes _ :- n) =
(show . length $ toLists nodes) ++ "x" ++ (show . length . head . toLists $ nodes) ++ " " ++ (show . length . toList $ biases) ++ " :- " ++ show n
show (O (Layer biases nodes _)) =
(show . length $ toLists nodes) ++ "x" ++ (show . length . head . toLists $ nodes) ++ " " ++ (show . length . toList $ biases)
infixr 5 :-
data Session = Session { network :: Network
, training :: [(Vector Double, Vector Double)]
, test :: [(Vector Double, Vector Double)]
, learningRate :: Double
, epochs :: Int
, epoch :: Int
, batchSize :: Int
, chart :: [(Int, Double, Double)]
, drawChart :: Bool
, chartName :: String
, momentum :: Double
, debug :: Bool
} deriving (Show)
emptyNetwork = randomNetwork 0 (0, 0) 0 [] (0, (id, id))
instance Default Session where
def = Session { network = seq (die "You have not specified a network parameter") emptyNetwork
, training = seq (die "You have not specified training data") []
, test = seq (die "You have not specified test data") []
, learningRate = 0.5
, epochs = 35
, epoch = 0
, batchSize = 0
, chart = []
, drawChart = False
, chartName = "chart.png"
, momentum = 0
, debug = False
}
saveNetwork :: Network -> String -> IO ()
saveNetwork network file =
writeFile file ((show . reverse) (gen network []))
where
gen (O (Layer biases nodes _)) list = (biases, nodes) : list
gen (Layer biases nodes _ :- n) list = gen n $ (biases, nodes) : list
loadNetwork :: [Activation] -> String -> IO Network
loadNetwork activations file = do
handle <- openFile file ReadMode
content <- hGetContents handle
let list = read content :: [(Vector Double, Matrix Double)]
network = gen list activations
content `deepseq` hClose handle
return network
where
gen [(biases, nodes)] [a] = O (Layer biases nodes a)
gen ((biases, nodes):hs) (a:as) = Layer biases nodes a :- gen hs as
runLayer :: Input -> Layer -> Output
runLayer input (Layer !biases !weights _) = input <# weights + biases
runLayer' :: Input -> Layer -> Output
runLayer' input (Layer !biases !weights _) = input <# weights
forward :: Input -> Session -> Output
forward input session = compute input (network session)
where
compute input (O l@(Layer _ _ (fn, _))) = fn $ runLayer input l
compute input (l@(Layer _ _ (fn, _)) :- n) = compute ((fst . activation $ l) $ runLayer input l) n
forward' :: Input -> Session -> Output
forward' input session = compute input (network session)
where
compute input (O l@(Layer _ _ (fn, _))) = fn $ runLayer' input l
compute input (l@(Layer _ _ (fn, _)) :- n) = compute ((fst . activation $ l) $ runLayer' input l) n
randomLayer :: Seed -> (Int, Int) -> (Double, Double) -> Activation -> Layer
randomLayer seed (wr, wc) (l, u) =
let weights = uniformSample seed wr $ replicate wc (l, u)
biases = randomVector seed Uniform wc * realToFrac u - realToFrac l
in Layer biases weights
randomNetwork :: Seed -> (Double, Double) -> Int -> [(Int, Activation)] -> (Int, Activation) -> Network
randomNetwork seed bound input [] (output, a) =
O $ randomLayer seed (input, output) bound a
randomNetwork seed bound input ((h, a):hs) output =
randomLayer seed (input, h) bound a :-
randomNetwork (seed + 1) bound h hs output
buildNetwork :: Seed -> (Double, Double) -> Int -> [(Int, Int, Activation)] -> (Int, Int, Activation) -> Network
buildNetwork seed bound input [] (outputRows, outputColumns, a) =
O $ randomLayer seed (input, outputColumns) bound a
buildNetwork seed bound input ((rows, columns, a):hs) output =
randomLayer seed (input, columns) bound a :-
buildNetwork (seed + 1) bound columns hs output
sigmoid :: Vector Double -> Vector Double
sigmoid x = 1 / max (1 + exp (-x)) 1e-10
sigmoid' :: Vector Double -> Vector Double
sigmoid' x = sigmoid x * (1 - sigmoid x)
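-- e.g. sigmoid (vector [0, 2]) is roughly vector [0.5, 0.88], and
-- sigmoid' (vector [0]) == vector [0.25], since s'(x) = s(x) * (1 - s(x))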
softmax :: Vector Double -> Vector Double
softmax x = cmap (\a -> exp a / s) x
where
s = V.sum $ exp x
softmax' :: Vector Double -> Vector Double
softmax' = cmap (\a -> sig a * (1 - sig a))
where
sig x = 1 / max (1 + exp (-x)) 1e-10
-- used for negative sampling
{-sampledSoftmax :: Vector Double -> Vector Double-}
{-sampledSoftmax x = cmap (\a -> exp a / s) x-}
{-where-}
{-s = V.sum . exp $ x-}
relu :: Vector Double -> Vector Double
relu = cmap (max 0.1)
relu' :: Vector Double -> Vector Double
relu' = cmap dev
where dev x
| x < 0 = 0
| otherwise = 1
crossEntropy :: Session -> Double
crossEntropy session =
let inputs = map fst (test session)
labels = map (toList . snd) (test session)
outputs = map (toList . (`forward` session)) inputs
pairs = zip outputs labels
n = genericLength pairs
in sum (map set pairs) / n
where
set (os, ls) = (-1 / genericLength os) * sum (zipWith f os ls)
f a y = y * log (max 1e-10 a)
crossEntropy' :: Vector Double -> Vector Double
crossEntropy' x = 1 / fromIntegral (V.length x)
one :: Vector Double -> Vector Double
one v = vector $ replicate (V.length v) 1
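-- `one` is an all-ones vector of the same length, e.g.
-- one (vector [3, 1, 4]) == vector [1, 1, 1]; it is paired with `id`
-- as the derivative of a linear activation (see Word2Vec below)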
train :: Input
-> Network
-> Output -- target
-> Double -- learning rate
-> Network -- network's output
train input network target alpha = fst $ run input network
where
run :: Input -> Network -> (Network, Vector Double)
run input (O l@(Layer biases weights (fn, fn'))) =
let y = runLayer input l
o = fn y
delta = o - target
de = delta * fn' y
biases' = biases - scale alpha de
weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
layer = Layer biases' weights' (fn, fn') -- updated layer
pass = weights #> de
in (O layer, pass)
run input (l@(Layer biases weights (fn, fn')) :- n) =
let y = runLayer input l
o = fn y
(n', delta) = run o n
de = delta * fn' y
biases' = biases - scale alpha de
weights' = weights - scale alpha (input `outer` de)
layer = Layer biases' weights' (fn, fn')
pass = weights #> de
in (layer :- n', pass)
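-- in summary, `train` performs one backpropagation step: at the output layer
-- delta = o - target, the local gradient is de = delta * fn' y, biases and
-- weights move against that gradient scaled by alpha, and
-- pass = weights #> de carries the error back to the previous layer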
gd :: Session -> IO Session
gd session = do
seed <- newStdGen
let pairs = training session
alpha = learningRate session
net = network session
let n = length pairs
shuffled <- shuffleM pairs
let newnet = foldl' (\n (input, label) -> train input n label alpha) net pairs
cost = crossEntropy (session { network = newnet })
let el = map (\(e, l, _) -> (e, l)) (chart session)
ea = map (\(e, _, a) -> (e, a)) (chart session)
when (drawChart session) $ do
toFile Chart.def (chartName session) $ do
Chart.layoutlr_title Chart..= "loss over time"
Chart.plotLeft (Chart.line "loss" [el])
Chart.plotRight (Chart.line "learningRate" [ea])
return session { network = newnet
, epoch = epoch session + 1
, chart = (epoch session, cost, learningRate session):chart session
}
sgd :: Session -> IO Session
sgd session = do
seed <- newStdGen
let pairs = training session
bsize = batchSize session
alpha = learningRate session
net = network session
let n = length pairs
iterations = n `div` bsize - 1
shuffled <- shuffleM pairs
let iter net i =
let n = length pairs
batch = take bsize . drop (i * bsize) $ shuffled
batchInputs = map fst batch
batchLabels = map snd batch
batchPair = zip batchInputs batchLabels
in foldl' (\n (input, label) -> train input n label alpha) net batchPair
let newnet = foldl' iter net [0..iterations]
cost = crossEntropy (session { network = newnet })
let el = map (\(e, l, _) -> (e, l)) (chart session)
ea = map (\(e, _, a) -> (e, a)) (chart session)
when (drawChart session) $ do
toFile Chart.def (chartName session) $ do
Chart.layoutlr_title Chart..= "loss over time"
Chart.plotLeft (Chart.line "loss" [el])
Chart.plotRight (Chart.line "learningRate" [ea])
return session { network = newnet
, epoch = epoch session + 1
, chart = (epoch session, cost, learningRate session):chart session
}
accuracy :: Session -> Double
accuracy session =
let inputs = map fst (test session)
labels = map snd (test session)
results = map (`forward` session) inputs
rounded = map (map round . toList) results
equals = zipWith (==) rounded (map (map round . toList) labels)
in genericLength (filter (== True) equals) / genericLength inputs
learningRateDecay :: (Double, Double) -> Session -> Session
learningRateDecay (step, m) session =
session { learningRate = max m $ learningRate session / step }
ignoreBiases :: Session -> Session
ignoreBiases session =
session { network = rmbias (network session) }
where
rmbias (O (Layer biases nodes a)) = O $ Layer (biases * 0) nodes a
rmbias ((Layer biases nodes a) :- n) = Layer (biases * 0) nodes a :- rmbias n
run :: (Session -> IO Session)
-> Session -> IO Session
run fn session = foldM (\s i -> fn s) session [0..epochs session]
factorial :: Int -> Int
factorial 0 = 1
factorial x = x * factorial (x - 1)
genSeed :: IO Seed
genSeed = do
(seed, _) <- random <$> newStdGen :: IO (Int, StdGen)
return seed
replaceVector :: Vector Double -> Int -> Double -> Vector Double
replaceVector vec index value =
let list = toList vec
in fromList $ take index list ++ value : drop (index + 1) list
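-- e.g. replaceVector (vector [1, 2, 3]) 1 9 == vector [1, 9, 3]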
clip :: Double -> (Double, Double) -> Double
clip x (l, u) = min u (max l x)
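As a rough sketch of how these pieces fit together (the tiny XOR dataset and the hidden-layer size are made up for illustration), a Session can be assembled from def and trained with run and sgd:

import Numeric.Sibe
import Numeric.LinearAlgebra (vector)
import Data.Default.Class (def)

main :: IO ()
main = do
  let xor = [ (vector [0, 0], vector [0])
            , (vector [0, 1], vector [1])
            , (vector [1, 0], vector [1])
            , (vector [1, 1], vector [0])
            ]
      net = randomNetwork 0 (-1, 1) 2 [(8, (sigmoid, sigmoid'))] (1, (sigmoid, sigmoid'))
      s   = def { network = net
                , training = xor
                , test = xor
                , learningRate = 0.5
                , epochs = 500
                , batchSize = 4
                }
  trained <- run sgd s
  print (forward (vector [1, 0]) trained)  -- output of the trained network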

src/Numeric/Sibe/NLP.hs Normal file (122 additions)
@@ -0,0 +1,122 @@
module Numeric.Sibe.NLP
(Class,
Document(..),
accuracy,
recall,
precision,
fmeasure,
cleanText,
cleanDocuments,
removeWords,
removeStopwords,
ngram,
ngramText,
)
where
import Numeric.Sibe.Utils
import Data.List
import Debug.Trace
import Data.List.Split
import Data.Maybe
import Control.Arrow ((&&&))
import Text.Regex.PCRE
import Data.Char (isSpace, isNumber, toLower)
import NLP.Stemmer
import qualified Data.Set as Set
type Class = Int
data Document = Document { text :: String
, c :: Class
} deriving (Eq, Show, Read)
cleanText :: String -> String
cleanText string =
let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?']) (trim string)
spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
stemmed = unwords $ map (stem Porter) (words spacify)
nonumber = filter (not . isNumber) stemmed
lower = map toLower nonumber
in (unwords . words) lower -- remove unnecessary spaces
where
trim = f . f
where
f = reverse . dropWhile isSpace
replace needle replacement =
map (\c -> if c == needle then replacement else c)
cleanDocuments :: [Document] -> [Document]
cleanDocuments documents =
let cleaned = map (\(Document text c) -> Document (cleanText text) c) documents
in cleaned
removeWords :: [String] -> [Document] -> [Document]
removeWords ws documents =
map (\(Document text c) -> Document (rm ws text) c) documents
where
rm list text =
unwords $ filter (`notElem` list) (words text)
removeStopwords :: Int -> [Document] -> [Document]
removeStopwords i documents =
let wc = wordCounts (concatDocs documents)
wlist = sortBy (\(_, a) (_, b) -> b `compare` a) wc
stopwords = map fst (take i wlist)
in removeWords stopwords documents
where
vocabulary x = ordNub (words x)
countWordInDoc d w = genericLength (filter (==w) d)
wordCounts x =
let voc = vocabulary x
in zip voc $ map (countWordInDoc (words x)) voc
concatDocs = concatMap (\(Document text _) -> text ++ " ")
accuracy :: [(Int, (Int, Double))] -> Double
accuracy results =
let pairs = map (\(a, b) -> (a, fst b)) results
correct = filter (uncurry (==)) pairs
in genericLength correct / genericLength results
recall :: [(Int, (Int, Double))] -> Double
recall results =
let classes = ordNub (map fst results)
s = sum (map rec classes) / genericLength classes
in s
where
rec a =
let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
y = genericLength $ filter (\(c, (r, _)) -> c == a) results
in t / y
precision :: [(Int, (Int, Double))] -> Double
precision results =
let classes = ordNub (map fst results)
s = sum (map prec classes) / genericLength classes
in s
where
prec a =
let t = genericLength $ filter (\(c, (r, _)) -> c == r && c == a) results
y = genericLength $ filter (\(c, (r, _)) -> r == a) results
in
if y == 0
then 0
else t / y
fmeasure :: [(Int, (Int, Double))] -> Double
fmeasure results =
let r = recall results
p = precision results
in (2 * p * r) / (p + r)
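-- a small worked example of the metrics above, for
-- results = [(1, (1, 0.9)), (0, (1, 0.4)), (0, (0, 0.8))]:
--   accuracy  = 2/3               (two of three predictions match the true class)
--   recall    = (1 + 0.5) / 2  = 0.75   (per-class recall, averaged over classes 1 and 0)
--   precision = (0.5 + 1) / 2  = 0.75   (per-class precision, averaged the same way)
--   fmeasure  = 2 * 0.75 * 0.75 / (0.75 + 0.75) = 0.75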
ngram :: Int -> [Document] -> [Document]
ngram n documents =
map (\(Document text c) -> Document (ngramText n text) c) documents
ngramText :: Int -> String -> String
ngramText n text =
let ws = words text
pairs = zip [0..] ws
grams = map (\(i, w) -> concat . intersperse "_" $ w:((take (n - 1) . drop (i+1)) ws)) pairs
in unwords ("<b>_":grams)
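-- e.g. ngramText 2 "the quick brown fox"
--   == "<b>_ the_quick quick_brown brown_fox fox"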

src/Numeric/Sibe/NaiveBayes.hs Normal file (128 additions)
@@ -0,0 +1,128 @@
module Numeric.Sibe.NaiveBayes
(Document(..),
NB(..),
initialize,
run,
session,
accuracy,
precision,
recall,
fmeasure,
mean,
stdev,
cleanText,
cleanDocuments,
ngram,
ngramText,
removeWords,
removeStopwords,
)
where
import Numeric.Sibe.Utils
import Numeric.Sibe.NLP
import Data.List
import Debug.Trace
import qualified Data.Set as Set
import Data.List.Split
import Data.Maybe
import Control.Arrow ((&&&))
data NB = NB { documents :: [Document]
, classes :: [(Class, Double)]
, vocabulary :: Int
, megadoc :: String
, cd :: [(Class, [Document])]
, cw :: [(Class, [(String, Int)])]
, cgram :: [(Class, [(String, Int)])]
} deriving (Eq, Show, Read)
initialize :: [Document] -> [Class] -> NB
initialize documents classes =
let megadoc = concatDocs documents
vocabulary = genericLength ((ordNub . words) megadoc)
-- (class, prior probability)
cls = zip classes (map classPrior classes)
-- (class, [document])
cd = zip classes (map classDocs classes)
-- (class, [(word, count)])
cw = zip classes $ map classWordsCounts classes
cgram = zip classes $ map classNGramCounts classes
in NB { documents = documents
, classes = cls
, vocabulary = vocabulary
, megadoc = megadoc
, cd = cd
, cw = cw
, cgram = cgram
}
where
concatDocs = concatMap (\(Document text _) -> text ++ " ")
classDocs x = filter ((==x) . c) documents
classMegadoc = concatMap (\(Document text _) -> text ++ " ") . classDocs
classWords = words . classMegadoc
classNGram = concatMap (\(Document text _) -> text ++ " ") . ngram 2 . classDocs
classNGramWords = words . classNGram
classVocabulary = ordNub . classWords
classPrior x = genericLength (classDocs x) / genericLength documents
countWordInDoc d w = genericLength (filter (==w) d)
wordsCount ws voc =
zip voc $ map (countWordInDoc ws) voc
classWordsCounts x = wordsCount (classWords x) (classVocabulary x)
classNGramCounts x = wordsCount (classNGramWords x) (ordNub $ classNGramWords x)
session :: [Document] -> NB -> [(Class, (Class, Double))]
session docs nb =
let results = map (\(Document text c) -> (c, run text nb)) docs
in results
run :: String -> NB -> (Class, Double)
run txt (NB documents classes vocabulary megadoc cd cw cgram) =
let scores = map (score . fst) classes
index = argmax scores
m = maximum scores
in (fst (classes !! index), m)
where
score c =
let prior = snd (fromJust $ find ((==c) . fst) classes)
-- below is the formula according to Multinomial Naive Bayes, but a uniform
-- prior probability seems to work better on imbalanced training datasets;
-- instead, we help rare classes get higher scores using
-- alpha = (1 - prior * ALPHA), with ALPHA = 1 here
-- in prior * product (map (prob c) (words txt))
alpha = 1 - prior
in alpha * product (map (prob c) (words txt))
prob c w =
let fcw = fromJust $ find ((==c) . fst) cw
fcg = fromJust $ find ((==c) . fst) cgram
tctM = find ((== w) . fst) (snd fcw)
tct = if isJust tctM then (snd . fromJust) tctM else 0
cvoc = sum $ map snd (snd fcw)
voc = vocabulary
gram = find ((==w) . last . splitOn "_" . fst) (snd fcg)
pg = if isJust gram then (snd . fromJust) gram else 0
-- in realToFrac (tct * pg + 1) / realToFrac (cvoc + voc) -- uncomment to enable ngrams
in realToFrac (tct + 1) / realToFrac (cvoc + voc)
argmax :: (Ord a) => [a] -> Int
argmax x = fst $ maximumBy (\(_, a) (_, b) -> a `compare` b) (zip [0..] x)
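-- e.g. argmax [0.1, 0.7, 0.2] == 1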
mean :: [Double] -> Double
mean x = sum x / genericLength x
stdev :: [Double] -> Double
stdev x =
let avg = mean x
variance = sum (map ((^2) . subtract avg) x) / (genericLength x - 1)
in sqrt variance
l :: (Show a) => a -> a
l a = trace (show a) a
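A minimal sketch of the Naive Bayes workflow (the two-document corpus is made up for illustration): clean the documents, initialize the classifier with the list of classes, then classify a string with run or score a whole test set with session:

import Numeric.Sibe.NaiveBayes

main :: IO ()
main = do
  let trainDocs = cleanDocuments
        [ Document "free money, claim your prize now" 1
        , Document "meeting rescheduled to next monday" 0
        ]
      testDocs = cleanDocuments [ Document "claim your free prize" 1 ]
      nb = initialize trainDocs [0, 1]
  print (run "claim free prize" nb)        -- (predicted class, score)
  print (accuracy (session testDocs nb))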

src/Numeric/Sibe/Utils.hs Normal file (41 additions)
@@ -0,0 +1,41 @@
module Numeric.Sibe.Utils
( similarity
, ordNub
, onehot
, average
, pca
) where
import qualified Data.Vector.Storable as V
import qualified Data.Set as Set
import Numeric.LinearAlgebra
similarity :: Vector Double -> Vector Double -> Double
similarity a b = (V.sum $ a * b) / (magnitude a * magnitude b)
where
magnitude :: Vector Double -> Double
magnitude v = sqrt $ V.sum (cmap (^2) v)
onehot :: Int -> Int -> Vector Double
onehot len i = vector $ replicate i 0 ++ [1] ++ replicate (len - i - 1) 0
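-- e.g. onehot 4 1 == vector [0, 1, 0, 0], and with `similarity` above:
-- similarity (onehot 3 0) (onehot 3 0) == 1.0
-- similarity (onehot 3 0) (onehot 3 1) == 0.0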
ordNub :: (Ord a) => [a] -> [a]
ordNub = go Set.empty
where
go _ [] = []
go s (x:xs) = if x `Set.member` s then go s xs
else x : go (Set.insert x s) xs
average :: Vector Double -> Vector Double
average v = cmap (/ (V.sum v)) v
pca :: Matrix Double -> Int -> Matrix Double
pca m d =
let rs = toRows m
means = map (\v -> V.sum v / fromIntegral (V.length v)) rs
meanReduced = map (\(a, b) -> V.map (+ (negate b)) a) $ zip rs means
mat = fromRows meanReduced
(u, s, v) = svd mat
diagS = diagRect 0 s (rows mat) (cols mat)
in u ?? (All, Take d) <> diagS ?? (Take d, Take d)

src/Numeric/Sibe/Word2Vec.hs Normal file (129 additions)
@@ -0,0 +1,129 @@
{-# LANGUAGE ScopedTypeVariables #-}
module Numeric.Sibe.Word2Vec
( word2vec
, Word2Vec (..)
, W2VMethod (..)
) where
import Numeric.Sibe
import Numeric.Sibe.Utils
import Debug.Trace
import Data.Char
import Data.Maybe
import Data.List
import Numeric.LinearAlgebra as H hiding (find)
import qualified Data.Vector.Storable as V
import Data.Default.Class
import Data.Function (on)
import Control.Monad
import System.Random
import Graphics.Rendering.Chart as Chart
import Graphics.Rendering.Chart.Backend.Cairo
import Control.Lens
data W2VMethod = SkipGram | CBOW
data Word2Vec = Word2Vec { docs :: [String]
, window :: Int
, dimensions :: Int
, method :: W2VMethod
, w2vChartName :: String
, w2vDrawChart :: Bool
}
instance Default Word2Vec where
def = Word2Vec { docs = []
, window = 2
, w2vChartName = "w2v.png"
, w2vDrawChart = False
}
word2vec w2v session = do
seed <- newStdGen
let s = session { training = trainingData
, network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, crossEntropy'))
}
when (debug s) $ do
putStr "vocabulary size: "
print v
putStr "trainingData length: "
print . length $ trainingData
-- biases are not used in skipgram/cbow
newses <- run (sgd . ignoreBiases) s
-- export the hidden layer
let (hidden@(Layer biases nodes _) :- _) = network newses
-- run words through the hidden layer alone to get the word vector
let computedVocVec = map (\(w, v) -> (w, runLayer' v hidden)) vocvec
when (w2vDrawChart w2v) $ do
let m = fromRows . map snd $ computedVocVec
twoDimensions = pca m 2
textData = zipWith (\s l -> (V.head l, V.last l, s)) (map fst computedVocVec) (toRows twoDimensions)
chart = toRenderable layout
where
textP = plot_annotation_values .~ textData
$ def
layout = layout_title .~ "word vectors"
$ layout_plots .~ [toPlot textP]
$ def
renderableToFile def (w2vChartName w2v) chart
return ()
return (computedVocVec, vocvec)
where
-- clean documents
ds = map cleanText (docs w2v)
-- words of each document
wd = map (words . (++ " ") . (map toLower)) ds
-- all words together, used to generate the vocabulary
ws = words (concatMap ((++ " ") . map toLower) ds)
vocabulary = ordNub ws
v = length vocabulary
-- generate one-hot vectors for each word of vocabulary
vocvec = zip vocabulary $ map (onehot v) [0..v - 1]
-- training data: generate input and output pairs for each word and the words in its window
trainingData = concatMap (\wds -> concatMap (iter wds) $ zip [0..] wds) wd
where
iter wds (i, w) =
let v = snd . fromJust . find ((==w) . fst) $ vocvec
before = take (window w2v) . drop (i - window w2v) $ wds
after = take (window w2v) . drop (i + 1) $ wds
ns
| i == 0 = after
| i == length vocvec - 1 = before
| otherwise = before ++ after
vectorized = map (\w -> snd . fromJust $ find ((== w) . fst) vocvec) ns
new = foldl1 (+) vectorized
in
if length wds <= 1
then []
else
case method w2v of
SkipGram -> [(v, average new)]
CBOW -> [(average new, v)]
_ -> error "unsupported word2vec method"
cleanText :: String -> String
cleanText string =
let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?', '\'']) (trim string)
spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
nonumber = filter (not . isNumber) spacify
lower = map toLower nonumber
in (unwords . words) lower -- remove unnecessary spaces
where
trim = f . f
where
f = reverse . dropWhile isSpace
replace needle replacement =
map (\c -> if c == needle then replacement else c)
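
A minimal sketch of calling word2vec (the two-sentence corpus is made up for illustration); the Session only needs the fields word2vec actually reads, since it replaces the network and training data itself:

import Numeric.Sibe
import Numeric.Sibe.Word2Vec
import Data.Default.Class (def)

main :: IO ()
main = do
  let corpus = [ "the cat sat on the mat"
               , "the dog sat on the log"
               ]
      w2v = def { docs = corpus
                , window = 2
                , dimensions = 10
                , method = SkipGram
                }
      s = def { training = []       -- replaced inside word2vec
              , test = []
              , learningRate = 0.1
              , epochs = 100
              , batchSize = 4
              }
  (wordVecs, _) <- word2vec w2v s   -- (word, vector) pairs from the hidden layer
  mapM_ print wordVecs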