diff --git a/README.md b/README.md
index 6a87d73..dafd963 100644
--- a/README.md
+++ b/README.md
@@ -60,5 +60,8 @@ stack exec example-naivebayes-doc-classifier -- --verbose --top-ten
 ### notMNIST
 
-notMNIST dataset, cross-entropy loss, learning rate decay and sgd ([`notmnist.hs`](https://github.com/mdibaiee/sibe/blob/master/examples/notmnist.hs)):
+notMNIST dataset, sigmoid hidden layer, cross-entropy loss, learning rate decay and sgd ([`notmnist.hs`](https://github.com/mdibaiee/sibe/blob/master/examples/notmnist.hs)):
 ![notMNIST](https://github.com/mdibaiee/sibe/blob/master/notmnist.png?raw=true)
+
+notMNIST dataset, relu hidden layer, cross-entropy loss, learning rate decay and sgd ([`notmnist.hs`](https://github.com/mdibaiee/sibe/blob/master/examples/notmnist.hs)):
+![notMNIST relu](https://github.com/mdibaiee/sibe/blob/master/notmnist-relu.png?raw=true)
 
diff --git a/examples/notmnist.hs b/examples/notmnist.hs
index 2628f16..22701db 100644
--- a/examples/notmnist.hs
+++ b/examples/notmnist.hs
@@ -43,7 +43,7 @@ module Main where
       let session = def { learningRate = 0.5
                         , batchSize = 32
-                        , epochs = 10
+                        , epochs = 9
                         , network = rnetwork
                         , training = zip trinputs trlabels
                         , test = zip teinputs telabels
diff --git a/examples/word2vec.hs b/examples/word2vec.hs
new file mode 100644
index 0000000..99c92b5
--- /dev/null
+++ b/examples/word2vec.hs
@@ -0,0 +1,34 @@
+{-# LANGUAGE RecordWildCards #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+
+module Main where
+  import Sibe
+  import Sibe.Word2Vec
+  import Sibe.Utils
+  import Data.Default.Class
+  import qualified Data.Vector.Storable as V
+  import Data.List (sortBy)
+  import Data.Function (on)
+  import Numeric.LinearAlgebra
+  import System.IO
+  import Data.List.Split
+
+  main :: IO ()
+  main = do
+    -- alternative corpus: the first 100 documents of the Reuters data set
+    {-ds <- do
+      content <- readFile "examples/doc-classifier-data/data-reuters"
+      let splitted = splitOn (replicate 10 '-' ++ "\n") content
+          d = concatMap (tail . lines) (take 100 splitted)
+      return d-}
+    let ds = ["I like deep learning", "I like NLP", "I enjoy flying"]
+
+    let session = def { learningRate = 0.8
+                      , batchSize = 10
+                      , epochs = 1000
+                      } :: Session
+        w2v = def { docs = ds } :: Word2Vec
+
+    r <- word2vec w2v session
+    print (network r) -- the new Show instance prints layer shapes
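The cabal stanza below registers this example as `example-word2vec`, so it should run with `stack exec example-word2vec`, matching the README's other examples. As a hedged sketch (mine, not part of the commit), the learned vectors could be ranked by cosine similarity the way the commented-out block in `src/Sibe/Word2Vec.hs` further down does; `nearest` and `wordVecs` are hypothetical names, with `similarity` coming from the new `Sibe.Utils`:

```haskell
import Data.Function (on)
import Data.List (sortBy)
import Numeric.LinearAlgebra (Vector)
import Sibe.Utils (similarity)

-- vocabulary ranked by cosine similarity to a query word, most similar
-- first (the query itself ranks first when it is in the list)
nearest :: [(String, Vector Double)] -> String -> [String]
nearest wordVecs w = case lookup w wordVecs of
  Nothing -> []
  Just v  -> map fst (sortBy (flip compare `on` (similarity v . snd)) wordVecs)
```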
diff --git a/notmnist-relu.png b/notmnist-relu.png
new file mode 100644
index 0000000..d60c0a3
Binary files /dev/null and b/notmnist-relu.png differ
diff --git a/notmnist.png b/notmnist.png
index ff73904..24d24b7 100644
Binary files a/notmnist.png and b/notmnist.png differ
diff --git a/sgd.png b/sgd.png
index 1f69105..0cba4b5 100644
Binary files a/sgd.png and b/sgd.png differ
diff --git a/sibe.cabal b/sibe.cabal
index 876c55b..b6b4218 100644
--- a/sibe.cabal
+++ b/sibe.cabal
@@ -15,7 +15,7 @@ cabal-version:       >=1.10
 library
   hs-source-dirs:      src
-  exposed-modules:     Sibe, Sibe.NaiveBayes, Sibe.NLP
+  exposed-modules:     Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils
   build-depends:       base >= 4.7 && < 5
                      , hmatrix
                      , random
@@ -52,6 +52,18 @@ executable example-xor
                      , data-default-class
   default-language:    Haskell2010
 
+executable example-word2vec
+  hs-source-dirs:      examples
+  main-is:             word2vec.hs
+  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
+  build-depends:       base
+                     , sibe
+                     , hmatrix
+                     , data-default-class
+                     , split
+                     , vector
+  default-language:    Haskell2010
+
 --executable example-sin
 --  hs-source-dirs:      examples
 --  main-is:             sin.hs
diff --git a/src/Sibe.hs b/src/Sibe.hs
index 521ba79..371b9ae 100644
--- a/src/Sibe.hs
+++ b/src/Sibe.hs
@@ -10,8 +10,10 @@ module Sibe
     Output,
     Activation,
     forward,
+    runLayer,
     randomLayer,
     randomNetwork,
+    buildNetwork,
     saveNetwork,
     loadNetwork,
     train,
@@ -30,7 +32,9 @@ module Sibe
     replaceVector,
     Session(..),
     accuracy,
-    learningRateDecay
+    learningRateDecay,
+    ignoreBiases,
+    one
   ) where
     import Numeric.LinearAlgebra
     import System.Random
@@ -62,7 +66,12 @@ module Sibe
 
     data Network = O Layer
                  | Layer :- Network
-                 deriving (Show)
+
+    instance Show Network where
+      show (Layer biases nodes _ :- n) =
+        (show . length $ toLists nodes) ++ "x" ++ (show . length . head . toLists $ nodes) ++ " " ++ (show . length . toList $ biases) ++ " :- " ++ show n
+      show (O (Layer biases nodes _)) =
+        (show . length $ toLists nodes) ++ "x" ++ (show . length . head . toLists $ nodes) ++ " " ++ (show . length . toList $ biases)
 
     infixr 5 :-
 
@@ -75,7 +84,7 @@ module Sibe
       , batchSize :: Int
       , chart :: [(Int, Double, Double)]
       , momentum :: Double
-      }
+      } deriving (Show)
 
     emptyNetwork = randomNetwork 0 (0, 0) 0 [] (0, (id, id))
     instance Default Session where
@@ -132,6 +141,13 @@ module Sibe
       randomLayer seed (input, h) bound a :-
       randomNetwork (seed + 1) bound h hs output
 
+    buildNetwork :: Seed -> (Double, Double) -> Int -> [(Int, Int, Activation)] -> (Int, Int, Activation) -> Network
+    buildNetwork seed bound input [] (outputRows, outputColumns, a) =
+      O $ randomLayer seed (input, outputColumns) bound a
+    buildNetwork seed bound input ((rows, columns, a):hs) output =
+      randomLayer seed (input, columns) bound a :-
+      buildNetwork (seed + 1) bound columns hs output
+
     sigmoid :: Vector Double -> Vector Double
     sigmoid x = 1 / max (1 + exp (-x)) 1e-10
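A hedged usage sketch for `buildNetwork` (my example, not the commit's): each hidden-layer tuple is `(rows, columns, activation)`, and since every layer's input size is threaded from the previous layer's `columns`, the `rows` field and the output spec's first component appear to go unconsulted:

```haskell
import Sibe

-- seed 0, weights drawn uniformly from (-1, 1): a 2-8-1 network with a
-- sigmoid hidden layer and a sigmoid output layer
net :: Network
net = buildNetwork 0 (-1, 1) 2 [(2, 8, (sigmoid, sigmoid'))] (8, 1, (sigmoid, sigmoid'))
```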
@@ -172,6 +188,10 @@ module Sibe
     crossEntropy' :: Vector Double -> Vector Double
     crossEntropy' x = 1 / fromIntegral (V.length x)
 
+    -- all-ones vector, usable as the derivative of a linear activation
+    one :: Vector Double -> Vector Double
+    one v = vector $ replicate (V.length v) 1
+
     train :: Input
           -> Network
           -> Output -- target
@@ -210,46 +229,6 @@ module Sibe
             -- pass = weights #> de
         in (layer :- n', pass)
 
-    {-trainMomentum :: Input
-                  -> Network
-                  -> Output -- target
-                  -> Double -- learning rate
-                  -> (Double, Double) -- momentum
-                  -> Network -- network's output
-    trainMomentum input network target alpha (m, v) = fst $ run input network
-      where
-        run :: Input -> Network -> (Network, Vector Double)
-        run input (O l@(Layer biases weights (fn, fn'))) =
-          let y = runLayer input l
-              o = fn y
-              delta = o - target
-              de = delta * fn' y
-              v =
-              -- de = delta -- cross entropy cost
-
-              biases' = biases - scale alpha de
-              weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
-              layer = Layer biases' weights' (fn, fn') -- updated layer
-
-              pass = weights #> de
-              -- pass = weights #> de
-
-          in (O layer, pass)
-        run input (l@(Layer biases weights (fn, fn')) :- n) =
-          let y = runLayer input l
-              o = fn y
-              (n', delta) = run o n
-
-              de = delta * fn' y
-
-              biases' = biases - cmap (*alpha) de
-              weights' = weights - cmap (*alpha) (input `outer` de)
-              layer = Layer biases' weights' (fn, fn')
-
-              pass = weights #> de
-              -- pass = weights #> de
-          in (layer :- n', pass)-}
-
     gd :: Session -> IO Session
     gd session = do
       seed <- newStdGen
@@ -324,6 +303,15 @@ module Sibe
       learningRateDecay (step, m) session =
        session { learningRate = max m $ learningRate session / step }
 
+    ignoreBiases :: Session -> Session
+    ignoreBiases session =
+      session { network = rmbias (network session) }
+      where
+        -- Layer stores the bias vector first and the weight matrix second
+        -- (cf. the Show instance above); zero the biases, keep the weights
+        rmbias (O (Layer biases nodes a)) = O $ Layer (biases * 0) nodes a
+        rmbias ((Layer biases nodes a) :- n) = Layer (biases * 0) nodes a :- rmbias n
+
     run :: (Session -> IO Session) -> Session -> IO Session
     run fn session = foldM (\s i -> fn s) session [0..epochs session]
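These pieces compose per epoch: `run` folds its step function over `[0..epochs]`, so a training pipeline is ordinary function composition. This is exactly how `Sibe.Word2Vec` below drives training; a minimal sketch (`trainWith` is a hypothetical name):

```haskell
import Sibe

-- each epoch: zero the biases, divide the learning rate by 1.1 (never
-- letting it fall below 0.1), then take one gradient-descent pass
trainWith :: Session -> IO Session
trainWith = run (gd . learningRateDecay (1.1, 0.1) . ignoreBiases)
```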
diff --git a/src/Sibe/NLP.hs b/src/Sibe/NLP.hs
index 29e63c5..3369a7c 100644
--- a/src/Sibe/NLP.hs
+++ b/src/Sibe/NLP.hs
@@ -1,7 +1,6 @@
 module Sibe.NLP
   (Class,
    Document(..),
-   ordNub,
    accuracy,
    recall,
    precision,
@@ -14,15 +13,16 @@ module Sibe.NLP
    ngramText,
   ) where
 
+  import Sibe.Utils
   import Data.List
   import Debug.Trace
-  import qualified Data.Set as Set
   import Data.List.Split
   import Data.Maybe
   import Control.Arrow ((&&&))
   import Text.Regex.PCRE
   import Data.Char (isSpace, isNumber, toLower)
   import NLP.Stemmer
+  import qualified Data.Set as Set
 
   type Class = Int;
 
@@ -73,13 +73,6 @@ module Sibe.NLP
 
     concatDocs = concatMap (\(Document text _) -> text ++ " ")
 
-  ordNub :: (Ord a) => [a] -> [a]
-  ordNub = go Set.empty
-    where
-      go _ [] = []
-      go s (x:xs) = if x `Set.member` s then go s xs
-                    else x : go (Set.insert x s) xs
-
   accuracy :: [(Int, (Int, Double))] -> Double
   accuracy results =
     let pairs = map (\(a, b) -> (a, fst b)) results
diff --git a/src/Sibe/NaiveBayes.hs b/src/Sibe/NaiveBayes.hs
index 85657c4..c9b24d1 100644
--- a/src/Sibe/NaiveBayes.hs
+++ b/src/Sibe/NaiveBayes.hs
@@ -4,7 +4,6 @@ module Sibe.NaiveBayes
     initialize,
     run,
     session,
-    ordNub,
     accuracy,
     precision,
     recall,
@@ -19,6 +18,7 @@ module Sibe.NaiveBayes
     removeStopwords,
   ) where
 
+  import Sibe.Utils
   import Sibe.NLP
   import Data.List
   import Debug.Trace
diff --git a/src/Sibe/Utils.hs b/src/Sibe/Utils.hs
new file mode 100644
index 0000000..ea3fd24
--- /dev/null
+++ b/src/Sibe/Utils.hs
@@ -0,0 +1,27 @@
+module Sibe.Utils
+  (similarity,
+   ordNub,
+   onehot
+  ) where
+  import qualified Data.Vector.Storable as V
+  import qualified Data.Set as Set
+  import Numeric.LinearAlgebra
+
+  -- cosine similarity of two vectors
+  similarity :: Vector Double -> Vector Double -> Double
+  similarity a b = (V.sum $ a * b) / (magnitude a * magnitude b)
+    where
+      magnitude :: Vector Double -> Double
+      magnitude v = sqrt $ V.sum (cmap (^2) v)
+
+  -- one-hot vector of length len with a 1 at index i
+  onehot :: Int -> Int -> Vector Double
+  onehot len i = vector $ replicate i 0 ++ [1] ++ replicate (len - i - 1) 0
+
+  -- order-preserving nub, deduplicating through a Set
+  ordNub :: (Ord a) => [a] -> [a]
+  ordNub = go Set.empty
+    where
+      go _ [] = []
+      go s (x:xs) = if x `Set.member` s then go s xs
+                    else x : go (Set.insert x s) xs
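A quick hedged check of the two new helpers (my example, not part of the commit):

```haskell
import Numeric.LinearAlgebra (vector)
import Sibe.Utils (onehot, similarity)

main :: IO ()
main = do
  print (onehot 4 1)                                   -- 0.0, 1.0, 0.0, 0.0
  -- parallel vectors score 1.0: (2+8+18) / (sqrt 14 * sqrt 56) = 28 / 28
  print (similarity (vector [1, 2, 3]) (vector [2, 4, 6]))
```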
diff --git a/src/Sibe/Word2Vec.hs b/src/Sibe/Word2Vec.hs
new file mode 100644
index 0000000..d43876a
--- /dev/null
+++ b/src/Sibe/Word2Vec.hs
@@ -0,0 +1,103 @@
+module Sibe.Word2Vec
+  (word2vec,
+   Word2Vec (..)
+  ) where
+  import Sibe
+  import Sibe.NLP
+  import Sibe.Utils
+  import Debug.Trace
+  import Data.Char
+  import Data.Maybe
+  import Data.List
+  import Numeric.LinearAlgebra hiding (find)
+  import qualified Data.Vector.Storable as V
+  import Data.Default.Class
+  import Data.Function (on)
+
+  data Word2Vec = Word2Vec { docs :: [String]
+                           , window :: Int
+                           }
+  instance Default Word2Vec where
+    def = Word2Vec { docs = []
+                   , window = 2
+                   }
+
+  word2vec :: Word2Vec -> Session -> IO Session
+  word2vec w2v session = do
+    let s = session { training = trainingData
+                    , network = buildNetwork 0 (-1, 1) v [(v, 25, (id, one))] (20, v, (softmax, crossEntropy'))
+                    }
+    print trainingData
+    newses <- run (gd . learningRateDecay (1.1, 0.1) . ignoreBiases) s
+
+    -- rows of the hidden layer's weight matrix are the learned embeddings
+    let (Layer _ nodes _ :- _) = network newses
+    {-let computedVocVec = map (\(w, v) -> (w, forward v newses)) vocvec-}
+    let computedVocVec = map (\(w, v) -> (w, (fromRows [v]) <> nodes)) vocvec
+    print computedVocVec
+
+    {-mapM_ (\(w, v) -> do
+      putStr $ w ++ ": "
+      let similarities = map (similarity v . snd) computedVocVec
+      let sorted = sortBy (compare `on` similarity v . snd) computedVocVec
+      {-print $ zip (map fst sorted) similarities-}
+      print . take 2 . drop 1 . reverse $ map fst sorted
+      ) computedVocVec-}
+
+    return newses
+    where
+      ws = words (concatMap ((++ " ") . map toLower) (docs w2v))
+      vocabulary = ordNub ws
+      v = length vocabulary
+
+      -- co-occurrence counts of each word with its corpus neighbours
+      cooccurence = foldl' iter [] (zip [0..] ws)
+        where
+          iter acc (i, w) =
+            let a = findIndex ((== w) . fst) acc
+                before = take (window w2v) . drop (i - window w2v) $ ws
+                after = take (window w2v) . drop (i + 1) $ ws
+                ns = if i == 0 then after else before ++ after
+            in
+              if isJust a then
+                let idx = fromJust a
+                    new = foldl (\acc n -> add acc n) (snd $ acc !! idx) ns
+                in take idx acc ++ [(w, new)] ++ drop (idx + 1) acc
+              else
+                acc ++ [(w, map (\n -> (n, 1)) ns)]
+
+      add [] n = [(n, 1)]
+      add ((hw, hc):hs) n
+        | n == hw = (hw, hc + 1):hs
+        | otherwise = (hw, hc):add hs n
+
+      vocvec = zip vocabulary $ map (onehot v) [0..v - 1]
+      {-trainingData = map iter cooccurence
+        where
+          iter (w, targets) =
+            let ts = map (\(w, c) -> c * (snd . fromJust $ find ((== w) . fst) vocvec)) targets
+                folded = foldl (+) (vector $ replicate v 0) ts
+                input = snd . fromJust $ find ((== w) . fst) vocvec
+            in (input, folded)-}
+      -- each centre word's one-hot vector paired with the sum of its
+      -- neighbours' one-hot vectors; note the neighbours come from the
+      -- deduplicated vocabulary sequence, not the raw corpus
+      trainingData = map iter $ zip [window w2v..length vocvec - window w2v] (drop (window w2v) vocvec)
+        where
+          iter (i, (w, v)) =
+            let before = take (window w2v) . drop (i - window w2v) $ vocvec
+                after = take (window w2v) . drop (i + 1) $ vocvec
+                ns = map snd $ before ++ after
+                new = foldl1 (+) ns
+            in (v, new)
+
+      wordfrequency = foldl' iter [] ws
+        where
+          iter acc w =
+            let i = findIndex ((== w) . fst) acc
+            in
+              if isJust i then
+                let idx = fromJust i
+                in take idx acc ++ [(w, snd (acc !! idx) + 1)] ++ drop (idx + 1) acc
+              else
+                acc ++ [(w, 1)]
diff --git a/src/Sibe/Word2Vec.hs.backup b/src/Sibe/Word2Vec.hs.backup
new file mode 100644
index 0000000..fa7a3d2
--- /dev/null
+++ b/src/Sibe/Word2Vec.hs.backup
@@ -0,0 +1,44 @@
+module Sibe.Word2Vec
+  (word2vec,
+   mapTuple
+  ) where
+  import Sibe
+  import Sibe.NLP
+  import Debug.Trace
+  import Data.Char
+  import Data.Maybe
+  import Data.List
+  import Numeric.LinearAlgebra hiding (find)
+  import qualified Data.Vector.Storable as V
+
+  word2vec docs session = do
+    let cooccurence = concat $ map co docs
+        a = (sigmoid, sigmoid')
+        o = (softmax, crossEntropy')
+        window = 2
+        s = session { training = cooccurence
+                    , test = cooccurence
+                    , network = buildNetwork 0 (-1, 1) n [(n, 300, (id, id))] (300, n, (softmax, crossEntropy'))
+                    }
+    print $ network s
+    newses <- run gd s
+    return (newses, cooccurence, vocabulary, vocvec)
+    where
+      n = length vocabulary
+      vocabulary = ordNub . words . map toLower . concatMap (++ " ") $ docs
+      vocvec = zip vocabulary $ map tovec [0..n]
+      tovec i = replicate i 0 ++ [1] ++ replicate (n - i - 1) 0
+      co d =
+        let p = pairs d
+        in map (\(a, [b, c]) -> (f a, V.concat [f b, f c])) p
+        where
+          f w = vector . snd . fromJust $ find ((== w) . fst) vocvec
+          pairs d = concatMap iter [0..length ws]
+            where
+              ws = words $ map toLower d
+              iter i
+                | i > 0 && i < length ws - 1 = [(ws !! i, [ws !! (i - 1), ws !! (i + 1)])]
+                | otherwise = []
+
+  mapTuple :: (a -> b) -> (a, a) -> (b, b)
+  mapTuple f (a, b) = (f a, f b)
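For intuition, a hedged walk-through (mine) of the pairs `trainingData` builds for the example corpus with the default window of 2. Lower-casing and `ordNub` give the vocabulary i, like, deep, learning, nlp, enjoy, flying, and each centre word is paired with the sum of its neighbours' one-hot vectors, skip-gram flavoured in that the centre word predicts a bag of context words:

```haskell
-- centre word "deep" (vocabulary index 2), window 2:
--   input   = onehot 7 2                                -- 0,0,1,0,0,0,0
--   context = ["i", "like"] ++ ["learning", "nlp"]
--   target  = onehot 7 0 + onehot 7 1 + onehot 7 3 + onehot 7 4
--                                                       -- 1,1,0,1,1,0,0
```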