rm(sin): remove sin example

fix(ignoreBiases): was ignoring nodes, lol fix(w2v): better logging and implementation
2016-09-16 13:31:23 +04:30
parent c0083f5c05
commit d4ac90bbd5
6 changed files with 1119 additions and 146 deletions
--- a/examples/sin.hs
+++ b/examples/sin.hs
@@ -1,35 +0,0 @@
-module Main where
-  import Sibe
-  import Numeric.LinearAlgebra
-  import Data.List
-  import Debug.Trace
-
-  main = do
-    let alpha = 0.5
-        epochs = 1000
-        a = (sigmoid, sigmoid')
-        lo = (sigmoid, (\_ -> 1)) -- cross entropy
-        -- a = (relu, relu')
-        rnetwork = randomNetwork 0 (-1, 1) 1 [(50, a)] (1, lo)
-
-        inputs = map (\a -> vector [a]) (reverse [0, 30, 45, 60, 90])
-        labels = map (\deg -> vector $ [sin $ deg * pi/180]) (reverse [0, 30, 45, 60, 90])
-
-        initial_cost = zipWith crossEntropy (map (`forward` rnetwork) inputs) labels
-
-    network <- run session inputs rnetwork labels alpha epochs
-
-    let results = map (`forward` network) inputs
-        rounded = map (map round . toList) results
-
-        cost = zipWith crossEntropy (map (`forward` network) inputs) labels
-
-    putStrLn "parameters: "
-    putStrLn $ "- inputs: " ++ show inputs
-    putStrLn $ "- labels: " ++ show labels
-    putStrLn $ "- learning rate: " ++ show alpha
-    putStrLn $ "- epochs: " ++ show epochs
-    {-putStrLn $ "- initial cost (cross-entropy): " ++ show initial_cost-}
-    putStrLn "results: "
-    putStrLn $ "- actual result: " ++ show results
-    {-putStrLn $ "- cost (cross-entropy): " ++ show cost-}
--- a/examples/word2vec.hs
+++ b/examples/word2vec.hs
@@ -15,20 +15,43 @@ module Main where
  import Data.List.Split

  main = do
-    {-ds <- do
-        content <- readFile "examples/doc-classifier-data/data-reuters"
-        let splitted = splitOn (replicate 10 '-' ++ "\n") content
-            d = concatMap (tail . lines) (take 100 splitted)
-        return d-}
-    let ds = ["I like deep learning", "I like NLP", "I enjoy flying"]
+    sws <- lines <$> readFile "examples/stopwords"
+    {-ds <- do-}
+        {-content <- readFile "examples/doc-classifier-data/data-reuters"-}
+        {-let splitted = splitOn (replicate 10 '-' ++ "\n") content-}
+            {-d = concatMap (tail . lines) (take 100 splitted)-}
+        {-return $ removeWords sws d-}
+    --let ds = ["I like deep learning", "I like NLP", "I enjoy flying"]
+    let ds = ["the king loves the queen", "the queen loves the king",
+              "the dwarf hates the king", "the queen hates the dwarf",
+              "the dwarf poisons the king", "the dwarf poisons the queen"]

-    let session = def { learningRate = 0.8
-                      , batchSize = 10
-                      , epochs = 1000
+    let session = def { learningRate = 0.1
+                      , batchSize = 16
+                      , epochs = 100
                      } :: Session
-        w2v = def { docs = ds }:: Word2Vec
+        w2v = def { docs = ds
+                  , dimensions = 50
+                  , method = SkipGram
+                  , window = 3
+                  } :: Word2Vec


-    r <- word2vec w2v session
-    {-print r-}
+    (computed, vocvec) <- word2vec w2v session
+    
+    mapM_ (\(w, v) -> do
+                    putStr $ w ++ ": "
+                    let similarities = map (similarity v . snd) computed
+                    let sorted = sortBy (compare `on` similarity v . snd) computed
+                    print . take 2 . drop 1 . reverse $ map fst sorted
+          ) computed
+
    return ()
+
+  removeWords :: [String] -> [String] -> [String]
+  removeWords ws documents =
+    map (rm ws) documents
+    where
+        rm list text =
+          unwords $ filter (`notElem` list) (words text)
+
--- a/1012
+++ b/1012
--- a/sibe.cabal
+++ b/sibe.cabal
@@ -33,15 +33,6 @@ library
                     , Chart-cairo
  default-language:    Haskell2010

--executable sibe-exe
-  --hs-source-dirs:      app
-  --main-is:             Main.hs
-  --ghc-options:         -threaded -rtsopts -with-rtsopts=-N
-  --build-depends:       base
-                     --, sibe
-                     --, hmatrix
-  --default-language:    Haskell2010
-
 executable example-xor
  hs-source-dirs:      examples
  main-is:             xor.hs
@@ -64,15 +55,6 @@ executable example-word2vec
                     , vector
  default-language:    Haskell2010

--executable example-sin
-  --hs-source-dirs:      examples
-  --main-is:             sin.hs
-  --ghc-options:         -threaded -rtsopts -with-rtsopts=-N
-  --build-depends:       base
-                     --, sibe
-                     --, hmatrix
-  --default-language:    Haskell2010
-
 executable example-424
  hs-source-dirs:      examples
  main-is:             424encoder.hs
--- a/src/Sibe.hs
+++ b/src/Sibe.hs
@@ -10,7 +10,9 @@ module Sibe
     Output,
     Activation,
     forward,
+     forward',
     runLayer,
+     runLayer',
     randomLayer,
     randomNetwork,
     buildNetwork,
@@ -84,7 +86,6 @@ module Sibe
                              , batchSize    :: Int
                              , chart        :: [(Int, Double, Double)]
                              , momentum     :: Double
-                              , biases       :: Bool
                              } deriving (Show)

      emptyNetwork = randomNetwork 0 (0, 0) 0 [] (0, (id, id))
@@ -98,7 +99,6 @@ module Sibe
                      , batchSize    = 0
                      , chart        = []
                      , momentum     = 0
-                      , biases       = True
                      }

      saveNetwork :: Network -> String -> IO ()
@@ -133,6 +133,12 @@ module Sibe
          compute input (O l@(Layer _ _ (fn, _))) = fn $ runLayer input l
          compute input (l@(Layer _ _ (fn, _)) :- n) = compute ((fst . activation $ l) $ runLayer input l) n

+      forward' :: Input -> Session -> Output
+      forward' input session = compute input (network session)
+        where
+          compute input (O l@(Layer _ _ (fn, _))) = fn $ runLayer' input l
+          compute input (l@(Layer _ _ (fn, _)) :- n) = compute ((fst . activation $ l) $ runLayer' input l) n
+
      randomLayer :: Seed -> (Int, Int) -> (Double, Double) -> Activation -> Layer
      randomLayer seed (wr, wc) (l, u) =
        let weights = uniformSample seed wr $ replicate wc (l, u)
@@ -209,14 +215,12 @@ module Sibe
                o = fn y
                delta = o - target 
                de = delta * fn' y
-                -- de = delta / fromIntegral (V.length o) -- cross entropy cost

                biases'  = biases  - scale alpha de
                weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
                layer    = Layer biases' weights' (fn, fn') -- updated layer

                pass = weights #> de
-                -- pass = weights #> de

            in (O layer, pass)
          run input (l@(Layer biases weights (fn, fn')) :- n) =
@@ -226,12 +230,11 @@ module Sibe

                de = delta * fn' y

-                biases'  = biases  - cmap (*alpha) de
-                weights' = weights - cmap (*alpha) (input `outer` de)
+                biases'  = biases  - scale alpha de
+                weights' = weights - scale alpha (input `outer` de)
                layer = Layer biases' weights' (fn, fn')

                pass = weights #> de
-                -- pass = weights #> de
            in (layer :- n', pass)

      gd :: Session -> IO Session
@@ -280,8 +283,6 @@ module Sibe
        let el = map (\(e, l, _) -> (e, l)) (chart session)
            ea = map (\(e, _, a) -> (e, a)) (chart session)

-        putStrLn $ (show $ epoch session) ++ " => " ++ (show cost) ++ " @ " ++ (show $ learningRate session)
-
        toFile Chart.def "sgd.png" $ do
          Chart.layoutlr_title Chart..= "loss over time"
          Chart.plotLeft (Chart.line "loss" [el])
@@ -312,8 +313,8 @@ module Sibe
      ignoreBiases session =
        session { network = rmbias (network session) }
        where
-          rmbias (O (Layer nodes biases a)) = O $ Layer nodes (biases * 0) a
-          rmbias ((Layer nodes biases a) :- n) = Layer nodes (biases * 0) a :- rmbias n
+          rmbias (O (Layer biases nodes a)) = O $ Layer (biases * 0) nodes a
+          rmbias ((Layer biases nodes a) :- n) = Layer (biases * 0) nodes a :- rmbias n

      run :: (Session -> IO Session)
          ->  Session -> IO Session
--- a/src/Sibe/Word2Vec.hs
+++ b/src/Sibe/Word2Vec.hs
@@ -1,9 +1,9 @@
 module Sibe.Word2Vec
-  (word2vec,
-   Word2Vec (..)
+  ( word2vec
+  , Word2Vec (..)
+  , W2VMethod (..)
  ) where
    import Sibe
-    import Sibe.NLP
    import Sibe.Utils
    import Debug.Trace
    import Data.Char
@@ -14,8 +14,11 @@ module Sibe.Word2Vec
    import Data.Default.Class
    import Data.Function (on)

+    data W2VMethod = SkipGram | CBOW
    data Word2Vec = Word2Vec { docs :: [String]
                             , window :: Int
+                             , dimensions :: Int
+                             , method :: W2VMethod
                             }
    instance Default Word2Vec where
      def = Word2Vec { docs = []
@@ -23,83 +26,70 @@ module Sibe.Word2Vec
                     }

    word2vec w2v session = do
-      return trainingData
      let s = session { training = trainingData
-                      , network = buildNetwork 0 (-1, 1) v [(v, 25, (id, one))] (20, v, (softmax, crossEntropy'))
-                      , biases = False
+                      , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, one))
                      }
-      print trainingData
-      newses <- run (gd . learningRateDecay (1.1, 0.1)) s

+      putStr "vocabulary size: "
+      print v
+
+      putStr "trainingData length: "
+      print . length $ trainingData
+
+      -- biases are not used in skipgram/cbow
+      newses <- run (sgd . ignoreBiases) s
+
+
+      -- export the hidden layer
      let (hidden@(Layer biases nodes _) :- _) = network newses
-      {-let computedVocVec = map (\(w, v) -> (w, forward v newses)) vocvec-}
-      print biases
-      let computedVocVec = map (\(w, v) -> (w, v <# nodes)) vocvec
-      {-print computedVocVec-}
+      -- run words through the hidden layer alone to get the word vector
+      let computedVocVec = map (\(w, v) -> (w, runLayer' v hidden)) vocvec

-      {-mapM_ (\(w, v) -> do
-                      putStr $ w ++ ": "
-                      let similarities = map (similarity v . snd) computedVocVec
-                      let sorted = sortBy (compare `on` similarity v . snd) computedVocVec
-                      {-print $ zip (map fst sorted) similarities-}
-                      print . take 2 . drop 1 . reverse $ map fst sorted
-            ) computedVocVec-}
-
-      return newses
+      return (computedVocVec, vocvec)
      where
-        ws = words (concatMap ((++ " <start> ") . map toLower) (docs w2v))
+        -- clean documents
+        ds = map cleanText (docs w2v)
+
+        -- words of each document
+        wd = map (words . (++ " ") . (map toLower)) ds
+
+        -- all words together, used to generate the vocabulary
+        ws = words (concatMap ((++ " ") . map toLower) ds)
        vocabulary = ordNub ws
        v = length vocabulary

-        cooccurence = foldl' iter [] (zip [0..] ws)
-          where
-            iter acc (i, w) =
-              let a = findIndex ((== w) . fst) acc
-                  before = take (window w2v) . drop (i - window w2v) $ ws
-                  after = take (window w2v) . drop (i + 1) $ ws
-                  ns = if i == 0 then after else before ++ after
-              in
-                if isJust a then
-                  let idx = fromJust a
-                      new = foldl (\acc n -> add acc n) (snd $ acc !! idx) ns
-                  in take idx acc ++ [(w, new)] ++ drop (idx + 1) acc
-                else
-                  acc ++ [(w, map (\n -> (n, 1)) ns)]
-
-            add [] n = [(n, 1)]
-            add ((hw, hc):hs) n
-              | n == hw = (hw, hc + 1):hs
-              | otherwise = (hw, hc):add hs n
-
+        -- generate one-hot vectors for each word of vocabulary
        vocvec = zip vocabulary $ map (onehot v) [0..v - 1]
-        {-trainingData = map iter cooccurence
-          where
-            iter (w, targets) =
-              let ts = map (\(w, c) -> c * (snd . fromJust $ find ((== w) . fst) vocvec)) targets
-                  folded = foldl (+) (vector $ replicate v 0) ts
-                  input = snd . fromJust $ find ((== w) . fst) vocvec
-              in (input, folded)-}
-        trainingData = map iter $ zip [window w2v..length vocvec - window w2v] vocvec
-          where
-            iter (i, (w, v)) =
-              let before = take (window w2v) . drop (i - window w2v) $ vocvec
-                  after = take (window w2v) . drop (i + 1) $ vocvec
-                  ns = map snd $ before ++ after
-                  new = foldl1 (+) ns
-              in (v, new)

-            add [] n = [(n, 1)]
-            add ((hw, hc):hs) n
-              | n == hw = (hw, hc + 1):hs
-              | otherwise = (hw, hc):add hs n
-
-        wordfrequency = foldl' iter [] ws
+        -- training data: generate input and output pairs for each word and the words in it's window
+        trainingData = concatMap (\wds -> concatMap (iter wds) $ zip [0..] wds) wd
          where
-            iter acc w =
-              let i = findIndex ((== w) . fst) acc
+            iter wds (i, w) =
+              let v = snd . fromJust . find ((==w) . fst) $ vocvec
+                  before = take (window w2v) . drop (i - window w2v) $ wds
+                  after = take (window w2v) . drop (i + 1) $ wds
+                  ns 
+                    | i == 0 = after
+                    | i == length vocvec - 1 = before
+                    | otherwise = before ++ after
+                  vectorized = map (\w -> snd . fromJust $ find ((== w) . fst) vocvec) ns
+                  new = foldl1 (+) vectorized
              in
-                if isJust i then
-                  let idx = fromJust i
-                  in take idx acc ++ [(w, snd (acc !! idx) + 1)] ++ drop (idx + 1) acc
-                else
-                  acc ++ [(w, 1)]
+                case method w2v of
+                  SkipGram -> zip (repeat v) vectorized
+                  CBOW     -> zip vectorized (repeat v)
+                  _        -> error "unsupported word2vec method"
+
+    cleanText :: String -> String
+    cleanText string =
+      let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?', '\'']) (trim string)
+          spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
+          nonumber = filter (not . isNumber) spacify
+          lower = map toLower nonumber
+      in (unwords . words) lower -- remove unnecessary spaces
+      where
+        trim = f . f
+          where
+            f = reverse . dropWhile isSpace
+        replace needle replacement =
+          map (\c -> if c == needle then replacement else c)