feat(blogs-corpus): new corpus for word2vec
This commit is contained in:
		
							
								
								
									
										3
									
								
								.gitmodules
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitmodules
									
									
									
									
										vendored
									
									
								
							@@ -4,3 +4,6 @@
 | 
				
			|||||||
[submodule "examples/notMNIST"]
 | 
					[submodule "examples/notMNIST"]
 | 
				
			||||||
	path = examples/notMNIST
 | 
						path = examples/notMNIST
 | 
				
			||||||
	url = git@github.com:mdibaiee/notMNIST
 | 
						url = git@github.com:mdibaiee/notMNIST
 | 
				
			||||||
 | 
					[submodule "examples/blogs-corpus"]
 | 
				
			||||||
 | 
						path = examples/blogs-corpus
 | 
				
			||||||
 | 
						url = git@github.com:mdibaiee/blogs-corpus
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										1
									
								
								examples/blogs-corpus
									
									
									
									
									
										Submodule
									
								
							
							
								
								
								
								
								
							
						
						
									
										1
									
								
								examples/blogs-corpus
									
									
									
									
									
										Submodule
									
								
							 Submodule examples/blogs-corpus added at f242d6e602
									
								
							@@ -12,27 +12,45 @@ module Main where
 | 
				
			|||||||
  import Data.Function (on)
 | 
					  import Data.Function (on)
 | 
				
			||||||
  import Numeric.LinearAlgebra
 | 
					  import Numeric.LinearAlgebra
 | 
				
			||||||
  import System.IO
 | 
					  import System.IO
 | 
				
			||||||
 | 
					  import System.Directory
 | 
				
			||||||
  import Data.List.Split
 | 
					  import Data.List.Split
 | 
				
			||||||
 | 
					  import Control.Exception (evaluate)
 | 
				
			||||||
 | 
					  import Debug.Trace
 | 
				
			||||||
 | 
					  import Data.Char
 | 
				
			||||||
 | 
					  import System.Random
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  rf :: FilePath -> IO String
 | 
				
			||||||
 | 
					  rf p = do
 | 
				
			||||||
 | 
					    hs <- openFile p ReadMode
 | 
				
			||||||
 | 
					    hSetEncoding hs latin1
 | 
				
			||||||
 | 
					    content <- evaluate =<< hGetContents hs
 | 
				
			||||||
 | 
					    length content `seq` hClose hs
 | 
				
			||||||
 | 
					    return content
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  main = do
 | 
					  main = do
 | 
				
			||||||
 | 
					    setStdGen (mkStdGen 100)
 | 
				
			||||||
    sws <- lines <$> readFile "examples/stopwords"
 | 
					    sws <- lines <$> readFile "examples/stopwords"
 | 
				
			||||||
    {-ds <- do
 | 
					
 | 
				
			||||||
        content <- readFile "examples/doc-classifier-data/data-reuters"
 | 
					    -- real data, takes a lot of time to train
 | 
				
			||||||
        let splitted = splitOn (replicate 10 '-' ++ "\n") content
 | 
					    {-ds <- do-}
 | 
				
			||||||
            d = concatMap (tail . lines) (take 100 splitted)
 | 
					        {-files <- filter ((/= "xml") . take 3 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
 | 
				
			||||||
        return $ removeWords sws d-}
 | 
					        {-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}
 | 
				
			||||||
    --let ds = ["I like deep learning", "I like NLP", "I enjoy flying"]
 | 
					
 | 
				
			||||||
 | 
					        {-let texts = map (unwords . splitOn " ") contents-}
 | 
				
			||||||
 | 
					        {-let tags = ["<Blog>", "</Blog>", "<date>", "</date>", "<post>", "</post>", " "]-}
 | 
				
			||||||
 | 
					        {-return $ map cleanText $ removeWords (sws ++ tags) texts-}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    let ds = ["the king loves the queen", "the queen loves the king",
 | 
					    let ds = ["the king loves the queen", "the queen loves the king",
 | 
				
			||||||
              "the dwarf hates the king", "the queen hates the dwarf",
 | 
					              "the dwarf hates the king", "the queen hates the dwarf",
 | 
				
			||||||
              "the dwarf poisons the king", "the dwarf poisons the queen"]
 | 
					              "the dwarf poisons the king", "the dwarf poisons the queen"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    let session = def { learningRate = 5e-2
 | 
					    let session = def { learningRate = 1e-1
 | 
				
			||||||
                      , batchSize = 1
 | 
					                      , batchSize = 1
 | 
				
			||||||
                      , epochs = 100
 | 
					                      , epochs = 200
 | 
				
			||||||
                      , debug = True
 | 
					                      , debug = True
 | 
				
			||||||
                      } :: Session
 | 
					                      } :: Session
 | 
				
			||||||
        w2v = def { docs = ds
 | 
					        w2v = def { docs = ds
 | 
				
			||||||
                  , dimensions = 50
 | 
					                  , dimensions = 25
 | 
				
			||||||
                  , method = SkipGram
 | 
					                  , method = SkipGram
 | 
				
			||||||
                  , window = 2
 | 
					                  , window = 2
 | 
				
			||||||
                  } :: Word2Vec
 | 
					                  } :: Word2Vec
 | 
				
			||||||
@@ -49,10 +67,25 @@ module Main where
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    return ()
 | 
					    return ()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  cleanText :: String -> String
 | 
				
			||||||
 | 
					  cleanText string = 
 | 
				
			||||||
 | 
					    let notag = unwords $ filter ((/= "<date>") . take 6) (words string)
 | 
				
			||||||
 | 
					        ws = unwords $ filter (`notElem` ["urlLink"]) (words notag)
 | 
				
			||||||
 | 
					        spacify = foldl (\acc x -> replace x ' ' acc) (trim ws) [',', '/', '-', '\n', '\r', '?', '.', '(', ')', '%', '$', '"', ';', ':', '!', '\'']
 | 
				
			||||||
 | 
					        nonumber = filter (not . isNumber) spacify
 | 
				
			||||||
 | 
					        lower = map toLower nonumber
 | 
				
			||||||
 | 
					    in unwords . words $ lower
 | 
				
			||||||
 | 
					    where
 | 
				
			||||||
 | 
					      trim = f . f
 | 
				
			||||||
 | 
					        where
 | 
				
			||||||
 | 
					          f = reverse . dropWhile isSpace
 | 
				
			||||||
 | 
					      replace needle replacement =
 | 
				
			||||||
 | 
					        map (\c -> if c == needle then replacement else c)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  removeWords :: [String] -> [String] -> [String]
 | 
					  removeWords :: [String] -> [String] -> [String]
 | 
				
			||||||
  removeWords ws documents =
 | 
					  removeWords ws documents =
 | 
				
			||||||
    map (rm ws) documents
 | 
					    map rm documents
 | 
				
			||||||
    where
 | 
					    where
 | 
				
			||||||
        rm list text =
 | 
					        rm text = 
 | 
				
			||||||
          unwords $ filter (`notElem` list) (words text)
 | 
					          unwords $ filter (`notElem` ws) (words text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -16,6 +16,7 @@ cabal-version:       >=1.10
 | 
				
			|||||||
library
 | 
					library
 | 
				
			||||||
  hs-source-dirs:      src
 | 
					  hs-source-dirs:      src
 | 
				
			||||||
  exposed-modules:     Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils
 | 
					  exposed-modules:     Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils
 | 
				
			||||||
 | 
					  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
 | 
				
			||||||
  build-depends:       base >= 4.7 && < 5
 | 
					  build-depends:       base >= 4.7 && < 5
 | 
				
			||||||
                     , hmatrix
 | 
					                     , hmatrix
 | 
				
			||||||
                     , random
 | 
					                     , random
 | 
				
			||||||
@@ -53,6 +54,8 @@ executable example-word2vec
 | 
				
			|||||||
                     , data-default-class
 | 
					                     , data-default-class
 | 
				
			||||||
                     , split
 | 
					                     , split
 | 
				
			||||||
                     , vector
 | 
					                     , vector
 | 
				
			||||||
 | 
					                     , directory
 | 
				
			||||||
 | 
					                     , random
 | 
				
			||||||
  default-language:    Haskell2010
 | 
					  default-language:    Haskell2010
 | 
				
			||||||
 | 
					
 | 
				
			||||||
executable example-424
 | 
					executable example-424
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -26,6 +26,7 @@ module Sibe
 | 
				
			|||||||
     sigmoid',
 | 
					     sigmoid',
 | 
				
			||||||
     softmax,
 | 
					     softmax,
 | 
				
			||||||
     softmax',
 | 
					     softmax',
 | 
				
			||||||
 | 
					     sampledSoftmax,
 | 
				
			||||||
     relu,
 | 
					     relu,
 | 
				
			||||||
     relu',
 | 
					     relu',
 | 
				
			||||||
     crossEntropy,
 | 
					     crossEntropy,
 | 
				
			||||||
@@ -181,6 +182,12 @@ module Sibe
 | 
				
			|||||||
        where
 | 
					        where
 | 
				
			||||||
          sig x = 1 / max (1 + exp (-x)) 1e-10
 | 
					          sig x = 1 / max (1 + exp (-x)) 1e-10
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      -- used for negative sampling
 | 
				
			||||||
 | 
					      sampledSoftmax :: Int -> Vector Double -> Vector Double
 | 
				
			||||||
 | 
					      sampledSoftmax n x = cmap (\a -> exp a / s) x
 | 
				
			||||||
 | 
					        where
 | 
				
			||||||
 | 
					          s = V.sum . exp $ V.take n x
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      relu :: Vector Double -> Vector Double
 | 
					      relu :: Vector Double -> Vector Double
 | 
				
			||||||
      relu = cmap (max 0.1)
 | 
					      relu = cmap (max 0.1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -14,6 +14,7 @@ module Sibe.Word2Vec
 | 
				
			|||||||
    import Data.Default.Class
 | 
					    import Data.Default.Class
 | 
				
			||||||
    import Data.Function (on)
 | 
					    import Data.Function (on)
 | 
				
			||||||
    import Control.Monad
 | 
					    import Control.Monad
 | 
				
			||||||
 | 
					    import System.Random
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    data W2VMethod = SkipGram | CBOW
 | 
					    data W2VMethod = SkipGram | CBOW
 | 
				
			||||||
    data Word2Vec = Word2Vec { docs :: [String]
 | 
					    data Word2Vec = Word2Vec { docs :: [String]
 | 
				
			||||||
@@ -27,8 +28,9 @@ module Sibe.Word2Vec
 | 
				
			|||||||
                     }
 | 
					                     }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    word2vec w2v session = do
 | 
					    word2vec w2v session = do
 | 
				
			||||||
 | 
					      seed <- newStdGen
 | 
				
			||||||
      let s = session { training = trainingData
 | 
					      let s = session { training = trainingData
 | 
				
			||||||
                      , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, one))
 | 
					                      , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, crossEntropy'))
 | 
				
			||||||
                      }
 | 
					                      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      when (debug s) $ do
 | 
					      when (debug s) $ do
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user