rm(sin): remove sin example
fix(ignoreBiases): was ignoring nodes, lol fix(w2v): better logging and implementation
This commit is contained in:
		@@ -1,35 +0,0 @@
 | 
				
			|||||||
module Main where
 | 
					 | 
				
			||||||
  import Sibe
 | 
					 | 
				
			||||||
  import Numeric.LinearAlgebra
 | 
					 | 
				
			||||||
  import Data.List
 | 
					 | 
				
			||||||
  import Debug.Trace
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  main = do
 | 
					 | 
				
			||||||
    let alpha = 0.5
 | 
					 | 
				
			||||||
        epochs = 1000
 | 
					 | 
				
			||||||
        a = (sigmoid, sigmoid')
 | 
					 | 
				
			||||||
        lo = (sigmoid, (\_ -> 1)) -- cross entropy
 | 
					 | 
				
			||||||
        -- a = (relu, relu')
 | 
					 | 
				
			||||||
        rnetwork = randomNetwork 0 (-1, 1) 1 [(50, a)] (1, lo)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        inputs = map (\a -> vector [a]) (reverse [0, 30, 45, 60, 90])
 | 
					 | 
				
			||||||
        labels = map (\deg -> vector $ [sin $ deg * pi/180]) (reverse [0, 30, 45, 60, 90])
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        initial_cost = zipWith crossEntropy (map (`forward` rnetwork) inputs) labels
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    network <- run session inputs rnetwork labels alpha epochs
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    let results = map (`forward` network) inputs
 | 
					 | 
				
			||||||
        rounded = map (map round . toList) results
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        cost = zipWith crossEntropy (map (`forward` network) inputs) labels
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    putStrLn "parameters: "
 | 
					 | 
				
			||||||
    putStrLn $ "- inputs: " ++ show inputs
 | 
					 | 
				
			||||||
    putStrLn $ "- labels: " ++ show labels
 | 
					 | 
				
			||||||
    putStrLn $ "- learning rate: " ++ show alpha
 | 
					 | 
				
			||||||
    putStrLn $ "- epochs: " ++ show epochs
 | 
					 | 
				
			||||||
    {-putStrLn $ "- initial cost (cross-entropy): " ++ show initial_cost-}
 | 
					 | 
				
			||||||
    putStrLn "results: "
 | 
					 | 
				
			||||||
    putStrLn $ "- actual result: " ++ show results
 | 
					 | 
				
			||||||
    {-putStrLn $ "- cost (cross-entropy): " ++ show cost-}
 | 
					 | 
				
			||||||
@@ -15,20 +15,43 @@ module Main where
 | 
				
			|||||||
  import Data.List.Split
 | 
					  import Data.List.Split
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  main = do
 | 
					  main = do
 | 
				
			||||||
    {-ds <- do
 | 
					    sws <- lines <$> readFile "examples/stopwords"
 | 
				
			||||||
        content <- readFile "examples/doc-classifier-data/data-reuters"
 | 
					    {-ds <- do-}
 | 
				
			||||||
        let splitted = splitOn (replicate 10 '-' ++ "\n") content
 | 
					        {-content <- readFile "examples/doc-classifier-data/data-reuters"-}
 | 
				
			||||||
            d = concatMap (tail . lines) (take 100 splitted)
 | 
					        {-let splitted = splitOn (replicate 10 '-' ++ "\n") content-}
 | 
				
			||||||
        return d-}
 | 
					            {-d = concatMap (tail . lines) (take 100 splitted)-}
 | 
				
			||||||
    let ds = ["I like deep learning", "I like NLP", "I enjoy flying"]
 | 
					        {-return $ removeWords sws d-}
 | 
				
			||||||
 | 
					    --let ds = ["I like deep learning", "I like NLP", "I enjoy flying"]
 | 
				
			||||||
 | 
					    let ds = ["the king loves the queen", "the queen loves the king",
 | 
				
			||||||
 | 
					              "the dwarf hates the king", "the queen hates the dwarf",
 | 
				
			||||||
 | 
					              "the dwarf poisons the king", "the dwarf poisons the queen"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    let session = def { learningRate = 0.8
 | 
					    let session = def { learningRate = 0.1
 | 
				
			||||||
                      , batchSize = 10
 | 
					                      , batchSize = 16
 | 
				
			||||||
                      , epochs = 1000
 | 
					                      , epochs = 100
 | 
				
			||||||
                      } :: Session
 | 
					                      } :: Session
 | 
				
			||||||
        w2v = def { docs = ds }:: Word2Vec
 | 
					        w2v = def { docs = ds
 | 
				
			||||||
 | 
					                  , dimensions = 50
 | 
				
			||||||
 | 
					                  , method = SkipGram
 | 
				
			||||||
 | 
					                  , window = 3
 | 
				
			||||||
 | 
					                  } :: Word2Vec
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    r <- word2vec w2v session
 | 
					    (computed, vocvec) <- word2vec w2v session
 | 
				
			||||||
    {-print r-}
 | 
					    
 | 
				
			||||||
 | 
					    mapM_ (\(w, v) -> do
 | 
				
			||||||
 | 
					                    putStr $ w ++ ": "
 | 
				
			||||||
 | 
					                    let similarities = map (similarity v . snd) computed
 | 
				
			||||||
 | 
					                    let sorted = sortBy (compare `on` similarity v . snd) computed
 | 
				
			||||||
 | 
					                    print . take 2 . drop 1 . reverse $ map fst sorted
 | 
				
			||||||
 | 
					          ) computed
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return ()
 | 
					    return ()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  removeWords :: [String] -> [String] -> [String]
 | 
				
			||||||
 | 
					  removeWords ws documents =
 | 
				
			||||||
 | 
					    map (rm ws) documents
 | 
				
			||||||
 | 
					    where
 | 
				
			||||||
 | 
					        rm list text =
 | 
				
			||||||
 | 
					          unwords $ filter (`notElem` list) (words text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										18
									
								
								sibe.cabal
									
									
									
									
									
								
							
							
						
						
									
										18
									
								
								sibe.cabal
									
									
									
									
									
								
							@@ -33,15 +33,6 @@ library
 | 
				
			|||||||
                     , Chart-cairo
 | 
					                     , Chart-cairo
 | 
				
			||||||
  default-language:    Haskell2010
 | 
					  default-language:    Haskell2010
 | 
				
			||||||
 | 
					
 | 
				
			||||||
--executable sibe-exe
 | 
					 | 
				
			||||||
  --hs-source-dirs:      app
 | 
					 | 
				
			||||||
  --main-is:             Main.hs
 | 
					 | 
				
			||||||
  --ghc-options:         -threaded -rtsopts -with-rtsopts=-N
 | 
					 | 
				
			||||||
  --build-depends:       base
 | 
					 | 
				
			||||||
                     --, sibe
 | 
					 | 
				
			||||||
                     --, hmatrix
 | 
					 | 
				
			||||||
  --default-language:    Haskell2010
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
executable example-xor
 | 
					executable example-xor
 | 
				
			||||||
  hs-source-dirs:      examples
 | 
					  hs-source-dirs:      examples
 | 
				
			||||||
  main-is:             xor.hs
 | 
					  main-is:             xor.hs
 | 
				
			||||||
@@ -64,15 +55,6 @@ executable example-word2vec
 | 
				
			|||||||
                     , vector
 | 
					                     , vector
 | 
				
			||||||
  default-language:    Haskell2010
 | 
					  default-language:    Haskell2010
 | 
				
			||||||
 | 
					
 | 
				
			||||||
--executable example-sin
 | 
					 | 
				
			||||||
  --hs-source-dirs:      examples
 | 
					 | 
				
			||||||
  --main-is:             sin.hs
 | 
					 | 
				
			||||||
  --ghc-options:         -threaded -rtsopts -with-rtsopts=-N
 | 
					 | 
				
			||||||
  --build-depends:       base
 | 
					 | 
				
			||||||
                     --, sibe
 | 
					 | 
				
			||||||
                     --, hmatrix
 | 
					 | 
				
			||||||
  --default-language:    Haskell2010
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
executable example-424
 | 
					executable example-424
 | 
				
			||||||
  hs-source-dirs:      examples
 | 
					  hs-source-dirs:      examples
 | 
				
			||||||
  main-is:             424encoder.hs
 | 
					  main-is:             424encoder.hs
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										23
									
								
								src/Sibe.hs
									
									
									
									
									
								
							
							
						
						
									
										23
									
								
								src/Sibe.hs
									
									
									
									
									
								
							@@ -10,7 +10,9 @@ module Sibe
 | 
				
			|||||||
     Output,
 | 
					     Output,
 | 
				
			||||||
     Activation,
 | 
					     Activation,
 | 
				
			||||||
     forward,
 | 
					     forward,
 | 
				
			||||||
 | 
					     forward',
 | 
				
			||||||
     runLayer,
 | 
					     runLayer,
 | 
				
			||||||
 | 
					     runLayer',
 | 
				
			||||||
     randomLayer,
 | 
					     randomLayer,
 | 
				
			||||||
     randomNetwork,
 | 
					     randomNetwork,
 | 
				
			||||||
     buildNetwork,
 | 
					     buildNetwork,
 | 
				
			||||||
@@ -84,7 +86,6 @@ module Sibe
 | 
				
			|||||||
                              , batchSize    :: Int
 | 
					                              , batchSize    :: Int
 | 
				
			||||||
                              , chart        :: [(Int, Double, Double)]
 | 
					                              , chart        :: [(Int, Double, Double)]
 | 
				
			||||||
                              , momentum     :: Double
 | 
					                              , momentum     :: Double
 | 
				
			||||||
                              , biases       :: Bool
 | 
					 | 
				
			||||||
                              } deriving (Show)
 | 
					                              } deriving (Show)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      emptyNetwork = randomNetwork 0 (0, 0) 0 [] (0, (id, id))
 | 
					      emptyNetwork = randomNetwork 0 (0, 0) 0 [] (0, (id, id))
 | 
				
			||||||
@@ -98,7 +99,6 @@ module Sibe
 | 
				
			|||||||
                      , batchSize    = 0
 | 
					                      , batchSize    = 0
 | 
				
			||||||
                      , chart        = []
 | 
					                      , chart        = []
 | 
				
			||||||
                      , momentum     = 0
 | 
					                      , momentum     = 0
 | 
				
			||||||
                      , biases       = True
 | 
					 | 
				
			||||||
                      }
 | 
					                      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      saveNetwork :: Network -> String -> IO ()
 | 
					      saveNetwork :: Network -> String -> IO ()
 | 
				
			||||||
@@ -133,6 +133,12 @@ module Sibe
 | 
				
			|||||||
          compute input (O l@(Layer _ _ (fn, _))) = fn $ runLayer input l
 | 
					          compute input (O l@(Layer _ _ (fn, _))) = fn $ runLayer input l
 | 
				
			||||||
          compute input (l@(Layer _ _ (fn, _)) :- n) = compute ((fst . activation $ l) $ runLayer input l) n
 | 
					          compute input (l@(Layer _ _ (fn, _)) :- n) = compute ((fst . activation $ l) $ runLayer input l) n
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      forward' :: Input -> Session -> Output
 | 
				
			||||||
 | 
					      forward' input session = compute input (network session)
 | 
				
			||||||
 | 
					        where
 | 
				
			||||||
 | 
					          compute input (O l@(Layer _ _ (fn, _))) = fn $ runLayer' input l
 | 
				
			||||||
 | 
					          compute input (l@(Layer _ _ (fn, _)) :- n) = compute ((fst . activation $ l) $ runLayer' input l) n
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      randomLayer :: Seed -> (Int, Int) -> (Double, Double) -> Activation -> Layer
 | 
					      randomLayer :: Seed -> (Int, Int) -> (Double, Double) -> Activation -> Layer
 | 
				
			||||||
      randomLayer seed (wr, wc) (l, u) =
 | 
					      randomLayer seed (wr, wc) (l, u) =
 | 
				
			||||||
        let weights = uniformSample seed wr $ replicate wc (l, u)
 | 
					        let weights = uniformSample seed wr $ replicate wc (l, u)
 | 
				
			||||||
@@ -209,14 +215,12 @@ module Sibe
 | 
				
			|||||||
                o = fn y
 | 
					                o = fn y
 | 
				
			||||||
                delta = o - target 
 | 
					                delta = o - target 
 | 
				
			||||||
                de = delta * fn' y
 | 
					                de = delta * fn' y
 | 
				
			||||||
                -- de = delta / fromIntegral (V.length o) -- cross entropy cost
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
                biases'  = biases  - scale alpha de
 | 
					                biases'  = biases  - scale alpha de
 | 
				
			||||||
                weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
 | 
					                weights' = weights - scale alpha (input `outer` de) -- small inputs learn slowly
 | 
				
			||||||
                layer    = Layer biases' weights' (fn, fn') -- updated layer
 | 
					                layer    = Layer biases' weights' (fn, fn') -- updated layer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                pass = weights #> de
 | 
					                pass = weights #> de
 | 
				
			||||||
                -- pass = weights #> de
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
            in (O layer, pass)
 | 
					            in (O layer, pass)
 | 
				
			||||||
          run input (l@(Layer biases weights (fn, fn')) :- n) =
 | 
					          run input (l@(Layer biases weights (fn, fn')) :- n) =
 | 
				
			||||||
@@ -226,12 +230,11 @@ module Sibe
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
                de = delta * fn' y
 | 
					                de = delta * fn' y
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                biases'  = biases  - cmap (*alpha) de
 | 
					                biases'  = biases  - scale alpha de
 | 
				
			||||||
                weights' = weights - cmap (*alpha) (input `outer` de)
 | 
					                weights' = weights - scale alpha (input `outer` de)
 | 
				
			||||||
                layer = Layer biases' weights' (fn, fn')
 | 
					                layer = Layer biases' weights' (fn, fn')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                pass = weights #> de
 | 
					                pass = weights #> de
 | 
				
			||||||
                -- pass = weights #> de
 | 
					 | 
				
			||||||
            in (layer :- n', pass)
 | 
					            in (layer :- n', pass)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      gd :: Session -> IO Session
 | 
					      gd :: Session -> IO Session
 | 
				
			||||||
@@ -280,8 +283,6 @@ module Sibe
 | 
				
			|||||||
        let el = map (\(e, l, _) -> (e, l)) (chart session)
 | 
					        let el = map (\(e, l, _) -> (e, l)) (chart session)
 | 
				
			||||||
            ea = map (\(e, _, a) -> (e, a)) (chart session)
 | 
					            ea = map (\(e, _, a) -> (e, a)) (chart session)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        putStrLn $ (show $ epoch session) ++ " => " ++ (show cost) ++ " @ " ++ (show $ learningRate session)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        toFile Chart.def "sgd.png" $ do
 | 
					        toFile Chart.def "sgd.png" $ do
 | 
				
			||||||
          Chart.layoutlr_title Chart..= "loss over time"
 | 
					          Chart.layoutlr_title Chart..= "loss over time"
 | 
				
			||||||
          Chart.plotLeft (Chart.line "loss" [el])
 | 
					          Chart.plotLeft (Chart.line "loss" [el])
 | 
				
			||||||
@@ -312,8 +313,8 @@ module Sibe
 | 
				
			|||||||
      ignoreBiases session =
 | 
					      ignoreBiases session =
 | 
				
			||||||
        session { network = rmbias (network session) }
 | 
					        session { network = rmbias (network session) }
 | 
				
			||||||
        where
 | 
					        where
 | 
				
			||||||
          rmbias (O (Layer nodes biases a)) = O $ Layer nodes (biases * 0) a
 | 
					          rmbias (O (Layer biases nodes a)) = O $ Layer (biases * 0) nodes a
 | 
				
			||||||
          rmbias ((Layer nodes biases a) :- n) = Layer nodes (biases * 0) a :- rmbias n
 | 
					          rmbias ((Layer biases nodes a) :- n) = Layer (biases * 0) nodes a :- rmbias n
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      run :: (Session -> IO Session)
 | 
					      run :: (Session -> IO Session)
 | 
				
			||||||
          ->  Session -> IO Session
 | 
					          ->  Session -> IO Session
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,9 +1,9 @@
 | 
				
			|||||||
module Sibe.Word2Vec
 | 
					module Sibe.Word2Vec
 | 
				
			||||||
  (word2vec,
 | 
					  ( word2vec
 | 
				
			||||||
   Word2Vec (..)
 | 
					  , Word2Vec (..)
 | 
				
			||||||
 | 
					  , W2VMethod (..)
 | 
				
			||||||
  ) where
 | 
					  ) where
 | 
				
			||||||
    import Sibe
 | 
					    import Sibe
 | 
				
			||||||
    import Sibe.NLP
 | 
					 | 
				
			||||||
    import Sibe.Utils
 | 
					    import Sibe.Utils
 | 
				
			||||||
    import Debug.Trace
 | 
					    import Debug.Trace
 | 
				
			||||||
    import Data.Char
 | 
					    import Data.Char
 | 
				
			||||||
@@ -14,8 +14,11 @@ module Sibe.Word2Vec
 | 
				
			|||||||
    import Data.Default.Class
 | 
					    import Data.Default.Class
 | 
				
			||||||
    import Data.Function (on)
 | 
					    import Data.Function (on)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    data W2VMethod = SkipGram | CBOW
 | 
				
			||||||
    data Word2Vec = Word2Vec { docs :: [String]
 | 
					    data Word2Vec = Word2Vec { docs :: [String]
 | 
				
			||||||
                             , window :: Int
 | 
					                             , window :: Int
 | 
				
			||||||
 | 
					                             , dimensions :: Int
 | 
				
			||||||
 | 
					                             , method :: W2VMethod
 | 
				
			||||||
                             }
 | 
					                             }
 | 
				
			||||||
    instance Default Word2Vec where
 | 
					    instance Default Word2Vec where
 | 
				
			||||||
      def = Word2Vec { docs = []
 | 
					      def = Word2Vec { docs = []
 | 
				
			||||||
@@ -23,83 +26,70 @@ module Sibe.Word2Vec
 | 
				
			|||||||
                     }
 | 
					                     }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    word2vec w2v session = do
 | 
					    word2vec w2v session = do
 | 
				
			||||||
      return trainingData
 | 
					 | 
				
			||||||
      let s = session { training = trainingData
 | 
					      let s = session { training = trainingData
 | 
				
			||||||
                      , network = buildNetwork 0 (-1, 1) v [(v, 25, (id, one))] (20, v, (softmax, crossEntropy'))
 | 
					                      , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, one))
 | 
				
			||||||
                      , biases = False
 | 
					 | 
				
			||||||
                      }
 | 
					                      }
 | 
				
			||||||
      print trainingData
 | 
					 | 
				
			||||||
      newses <- run (gd . learningRateDecay (1.1, 0.1)) s
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      putStr "vocabulary size: "
 | 
				
			||||||
 | 
					      print v
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      putStr "trainingData length: "
 | 
				
			||||||
 | 
					      print . length $ trainingData
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      -- biases are not used in skipgram/cbow
 | 
				
			||||||
 | 
					      newses <- run (sgd . ignoreBiases) s
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      -- export the hidden layer
 | 
				
			||||||
      let (hidden@(Layer biases nodes _) :- _) = network newses
 | 
					      let (hidden@(Layer biases nodes _) :- _) = network newses
 | 
				
			||||||
      {-let computedVocVec = map (\(w, v) -> (w, forward v newses)) vocvec-}
 | 
					      -- run words through the hidden layer alone to get the word vector
 | 
				
			||||||
      print biases
 | 
					      let computedVocVec = map (\(w, v) -> (w, runLayer' v hidden)) vocvec
 | 
				
			||||||
      let computedVocVec = map (\(w, v) -> (w, v <# nodes)) vocvec
 | 
					 | 
				
			||||||
      {-print computedVocVec-}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
      {-mapM_ (\(w, v) -> do
 | 
					      return (computedVocVec, vocvec)
 | 
				
			||||||
                      putStr $ w ++ ": "
 | 
					 | 
				
			||||||
                      let similarities = map (similarity v . snd) computedVocVec
 | 
					 | 
				
			||||||
                      let sorted = sortBy (compare `on` similarity v . snd) computedVocVec
 | 
					 | 
				
			||||||
                      {-print $ zip (map fst sorted) similarities-}
 | 
					 | 
				
			||||||
                      print . take 2 . drop 1 . reverse $ map fst sorted
 | 
					 | 
				
			||||||
            ) computedVocVec-}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      return newses
 | 
					 | 
				
			||||||
      where
 | 
					      where
 | 
				
			||||||
        ws = words (concatMap ((++ " <start> ") . map toLower) (docs w2v))
 | 
					        -- clean documents
 | 
				
			||||||
 | 
					        ds = map cleanText (docs w2v)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        -- words of each document
 | 
				
			||||||
 | 
					        wd = map (words . (++ " ") . (map toLower)) ds
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        -- all words together, used to generate the vocabulary
 | 
				
			||||||
 | 
					        ws = words (concatMap ((++ " ") . map toLower) ds)
 | 
				
			||||||
        vocabulary = ordNub ws
 | 
					        vocabulary = ordNub ws
 | 
				
			||||||
        v = length vocabulary
 | 
					        v = length vocabulary
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        cooccurence = foldl' iter [] (zip [0..] ws)
 | 
					        -- generate one-hot vectors for each word of vocabulary
 | 
				
			||||||
          where
 | 
					 | 
				
			||||||
            iter acc (i, w) =
 | 
					 | 
				
			||||||
              let a = findIndex ((== w) . fst) acc
 | 
					 | 
				
			||||||
                  before = take (window w2v) . drop (i - window w2v) $ ws
 | 
					 | 
				
			||||||
                  after = take (window w2v) . drop (i + 1) $ ws
 | 
					 | 
				
			||||||
                  ns = if i == 0 then after else before ++ after
 | 
					 | 
				
			||||||
              in
 | 
					 | 
				
			||||||
                if isJust a then
 | 
					 | 
				
			||||||
                  let idx = fromJust a
 | 
					 | 
				
			||||||
                      new = foldl (\acc n -> add acc n) (snd $ acc !! idx) ns
 | 
					 | 
				
			||||||
                  in take idx acc ++ [(w, new)] ++ drop (idx + 1) acc
 | 
					 | 
				
			||||||
                else
 | 
					 | 
				
			||||||
                  acc ++ [(w, map (\n -> (n, 1)) ns)]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            add [] n = [(n, 1)]
 | 
					 | 
				
			||||||
            add ((hw, hc):hs) n
 | 
					 | 
				
			||||||
              | n == hw = (hw, hc + 1):hs
 | 
					 | 
				
			||||||
              | otherwise = (hw, hc):add hs n
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        vocvec = zip vocabulary $ map (onehot v) [0..v - 1]
 | 
					        vocvec = zip vocabulary $ map (onehot v) [0..v - 1]
 | 
				
			||||||
        {-trainingData = map iter cooccurence
 | 
					 | 
				
			||||||
          where
 | 
					 | 
				
			||||||
            iter (w, targets) =
 | 
					 | 
				
			||||||
              let ts = map (\(w, c) -> c * (snd . fromJust $ find ((== w) . fst) vocvec)) targets
 | 
					 | 
				
			||||||
                  folded = foldl (+) (vector $ replicate v 0) ts
 | 
					 | 
				
			||||||
                  input = snd . fromJust $ find ((== w) . fst) vocvec
 | 
					 | 
				
			||||||
              in (input, folded)-}
 | 
					 | 
				
			||||||
        trainingData = map iter $ zip [window w2v..length vocvec - window w2v] vocvec
 | 
					 | 
				
			||||||
          where
 | 
					 | 
				
			||||||
            iter (i, (w, v)) =
 | 
					 | 
				
			||||||
              let before = take (window w2v) . drop (i - window w2v) $ vocvec
 | 
					 | 
				
			||||||
                  after = take (window w2v) . drop (i + 1) $ vocvec
 | 
					 | 
				
			||||||
                  ns = map snd $ before ++ after
 | 
					 | 
				
			||||||
                  new = foldl1 (+) ns
 | 
					 | 
				
			||||||
              in (v, new)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
            add [] n = [(n, 1)]
 | 
					        -- training data: generate input and output pairs for each word and the words in it's window
 | 
				
			||||||
            add ((hw, hc):hs) n
 | 
					        trainingData = concatMap (\wds -> concatMap (iter wds) $ zip [0..] wds) wd
 | 
				
			||||||
              | n == hw = (hw, hc + 1):hs
 | 
					 | 
				
			||||||
              | otherwise = (hw, hc):add hs n
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        wordfrequency = foldl' iter [] ws
 | 
					 | 
				
			||||||
          where
 | 
					          where
 | 
				
			||||||
            iter acc w =
 | 
					            iter wds (i, w) =
 | 
				
			||||||
              let i = findIndex ((== w) . fst) acc
 | 
					              let v = snd . fromJust . find ((==w) . fst) $ vocvec
 | 
				
			||||||
 | 
					                  before = take (window w2v) . drop (i - window w2v) $ wds
 | 
				
			||||||
 | 
					                  after = take (window w2v) . drop (i + 1) $ wds
 | 
				
			||||||
 | 
					                  ns 
 | 
				
			||||||
 | 
					                    | i == 0 = after
 | 
				
			||||||
 | 
					                    | i == length vocvec - 1 = before
 | 
				
			||||||
 | 
					                    | otherwise = before ++ after
 | 
				
			||||||
 | 
					                  vectorized = map (\w -> snd . fromJust $ find ((== w) . fst) vocvec) ns
 | 
				
			||||||
 | 
					                  new = foldl1 (+) vectorized
 | 
				
			||||||
              in
 | 
					              in
 | 
				
			||||||
                if isJust i then
 | 
					                case method w2v of
 | 
				
			||||||
                  let idx = fromJust i
 | 
					                  SkipGram -> zip (repeat v) vectorized
 | 
				
			||||||
                  in take idx acc ++ [(w, snd (acc !! idx) + 1)] ++ drop (idx + 1) acc
 | 
					                  CBOW     -> zip vectorized (repeat v)
 | 
				
			||||||
                else
 | 
					                  _        -> error "unsupported word2vec method"
 | 
				
			||||||
                  acc ++ [(w, 1)]
 | 
					
 | 
				
			||||||
 | 
					    cleanText :: String -> String
 | 
				
			||||||
 | 
					    cleanText string =
 | 
				
			||||||
 | 
					      let puncs = filter (`notElem` ['!', '"', '#', '$', '%', '(', ')', '.', '?', '\'']) (trim string)
 | 
				
			||||||
 | 
					          spacify = foldl (\acc x -> replace x ' ' acc) puncs [',', '/', '-', '\n', '\r']
 | 
				
			||||||
 | 
					          nonumber = filter (not . isNumber) spacify
 | 
				
			||||||
 | 
					          lower = map toLower nonumber
 | 
				
			||||||
 | 
					      in (unwords . words) lower -- remove unnecessary spaces
 | 
				
			||||||
 | 
					      where
 | 
				
			||||||
 | 
					        trim = f . f
 | 
				
			||||||
 | 
					          where
 | 
				
			||||||
 | 
					            f = reverse . dropWhile isSpace
 | 
				
			||||||
 | 
					        replace needle replacement =
 | 
				
			||||||
 | 
					          map (\c -> if c == needle then replacement else c)
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user