feat(blogs-corpus): new corpus for word2vec
This commit is contained in:
		
							
								
								
									
										3
									
								
								.gitmodules
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitmodules
									
									
									
									
										vendored
									
									
								
							@@ -4,3 +4,6 @@
 | 
			
		||||
[submodule "examples/notMNIST"]
 | 
			
		||||
	path = examples/notMNIST
 | 
			
		||||
	url = git@github.com:mdibaiee/notMNIST
 | 
			
		||||
[submodule "examples/blogs-corpus"]
 | 
			
		||||
	path = examples/blogs-corpus
 | 
			
		||||
	url = git@github.com:mdibaiee/blogs-corpus
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										1
									
								
								examples/blogs-corpus
									
									
									
									
									
										Submodule
									
								
							
							
								
								
								
								
								
							
						
						
									
										1
									
								
								examples/blogs-corpus
									
									
									
									
									
										Submodule
									
								
							 Submodule examples/blogs-corpus added at f242d6e602
									
								
							@@ -12,27 +12,45 @@ module Main where
 | 
			
		||||
  import Data.Function (on)
 | 
			
		||||
  import Numeric.LinearAlgebra
 | 
			
		||||
  import System.IO
 | 
			
		||||
  import System.Directory
 | 
			
		||||
  import Data.List.Split
 | 
			
		||||
  import Control.Exception (evaluate)
 | 
			
		||||
  import Debug.Trace
 | 
			
		||||
  import Data.Char
 | 
			
		||||
  import System.Random
 | 
			
		||||
 | 
			
		||||
  rf :: FilePath -> IO String
 | 
			
		||||
  rf p = do
 | 
			
		||||
    hs <- openFile p ReadMode
 | 
			
		||||
    hSetEncoding hs latin1
 | 
			
		||||
    content <- evaluate =<< hGetContents hs
 | 
			
		||||
    length content `seq` hClose hs
 | 
			
		||||
    return content
 | 
			
		||||
 | 
			
		||||
  main = do
 | 
			
		||||
    setStdGen (mkStdGen 100)
 | 
			
		||||
    sws <- lines <$> readFile "examples/stopwords"
 | 
			
		||||
    {-ds <- do
 | 
			
		||||
        content <- readFile "examples/doc-classifier-data/data-reuters"
 | 
			
		||||
        let splitted = splitOn (replicate 10 '-' ++ "\n") content
 | 
			
		||||
            d = concatMap (tail . lines) (take 100 splitted)
 | 
			
		||||
        return $ removeWords sws d-}
 | 
			
		||||
    --let ds = ["I like deep learning", "I like NLP", "I enjoy flying"]
 | 
			
		||||
 | 
			
		||||
    -- real data, takes a lot of time to train
 | 
			
		||||
    {-ds <- do-}
 | 
			
		||||
        {-files <- filter ((/= "xml") . take 3 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
 | 
			
		||||
        {-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}
 | 
			
		||||
 | 
			
		||||
        {-let texts = map (unwords . splitOn " ") contents-}
 | 
			
		||||
        {-let tags = ["<Blog>", "</Blog>", "<date>", "</date>", "<post>", "</post>", " "]-}
 | 
			
		||||
        {-return $ map cleanText $ removeWords (sws ++ tags) texts-}
 | 
			
		||||
 | 
			
		||||
    let ds = ["the king loves the queen", "the queen loves the king",
 | 
			
		||||
              "the dwarf hates the king", "the queen hates the dwarf",
 | 
			
		||||
              "the dwarf poisons the king", "the dwarf poisons the queen"]
 | 
			
		||||
 | 
			
		||||
    let session = def { learningRate = 5e-2
 | 
			
		||||
    let session = def { learningRate = 1e-1
 | 
			
		||||
                      , batchSize = 1
 | 
			
		||||
                      , epochs = 100
 | 
			
		||||
                      , epochs = 200
 | 
			
		||||
                      , debug = True
 | 
			
		||||
                      } :: Session
 | 
			
		||||
        w2v = def { docs = ds
 | 
			
		||||
                  , dimensions = 50
 | 
			
		||||
                  , dimensions = 25
 | 
			
		||||
                  , method = SkipGram
 | 
			
		||||
                  , window = 2
 | 
			
		||||
                  } :: Word2Vec
 | 
			
		||||
@@ -49,10 +67,25 @@ module Main where
 | 
			
		||||
 | 
			
		||||
    return ()
 | 
			
		||||
 | 
			
		||||
  cleanText :: String -> String
 | 
			
		||||
  cleanText string = 
 | 
			
		||||
    let notag = unwords $ filter ((/= "<date>") . take 6) (words string)
 | 
			
		||||
        ws = unwords $ filter (`notElem` ["urlLink"]) (words notag)
 | 
			
		||||
        spacify = foldl (\acc x -> replace x ' ' acc) (trim ws) [',', '/', '-', '\n', '\r', '?', '.', '(', ')', '%', '$', '"', ';', ':', '!', '\'']
 | 
			
		||||
        nonumber = filter (not . isNumber) spacify
 | 
			
		||||
        lower = map toLower nonumber
 | 
			
		||||
    in unwords . words $ lower
 | 
			
		||||
    where
 | 
			
		||||
      trim = f . f
 | 
			
		||||
        where
 | 
			
		||||
          f = reverse . dropWhile isSpace
 | 
			
		||||
      replace needle replacement =
 | 
			
		||||
        map (\c -> if c == needle then replacement else c)
 | 
			
		||||
 | 
			
		||||
  removeWords :: [String] -> [String] -> [String]
 | 
			
		||||
  removeWords ws documents =
 | 
			
		||||
    map (rm ws) documents
 | 
			
		||||
    map rm documents
 | 
			
		||||
    where
 | 
			
		||||
        rm list text =
 | 
			
		||||
          unwords $ filter (`notElem` list) (words text)
 | 
			
		||||
        rm text = 
 | 
			
		||||
          unwords $ filter (`notElem` ws) (words text)
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -16,6 +16,7 @@ cabal-version:       >=1.10
 | 
			
		||||
library
 | 
			
		||||
  hs-source-dirs:      src
 | 
			
		||||
  exposed-modules:     Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils
 | 
			
		||||
  ghc-options:         -threaded -rtsopts -with-rtsopts=-N
 | 
			
		||||
  build-depends:       base >= 4.7 && < 5
 | 
			
		||||
                     , hmatrix
 | 
			
		||||
                     , random
 | 
			
		||||
@@ -53,6 +54,8 @@ executable example-word2vec
 | 
			
		||||
                     , data-default-class
 | 
			
		||||
                     , split
 | 
			
		||||
                     , vector
 | 
			
		||||
                     , directory
 | 
			
		||||
                     , random
 | 
			
		||||
  default-language:    Haskell2010
 | 
			
		||||
 | 
			
		||||
executable example-424
 | 
			
		||||
 
 | 
			
		||||
@@ -26,6 +26,7 @@ module Sibe
 | 
			
		||||
     sigmoid',
 | 
			
		||||
     softmax,
 | 
			
		||||
     softmax',
 | 
			
		||||
     sampledSoftmax,
 | 
			
		||||
     relu,
 | 
			
		||||
     relu',
 | 
			
		||||
     crossEntropy,
 | 
			
		||||
@@ -181,6 +182,12 @@ module Sibe
 | 
			
		||||
        where
 | 
			
		||||
          sig x = 1 / max (1 + exp (-x)) 1e-10
 | 
			
		||||
 | 
			
		||||
      -- used for negative sampling
 | 
			
		||||
      sampledSoftmax :: Int -> Vector Double -> Vector Double
 | 
			
		||||
      sampledSoftmax n x = cmap (\a -> exp a / s) x
 | 
			
		||||
        where
 | 
			
		||||
          s = V.sum . exp $ V.take n x
 | 
			
		||||
 | 
			
		||||
      relu :: Vector Double -> Vector Double
 | 
			
		||||
      relu = cmap (max 0.1)
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -14,6 +14,7 @@ module Sibe.Word2Vec
 | 
			
		||||
    import Data.Default.Class
 | 
			
		||||
    import Data.Function (on)
 | 
			
		||||
    import Control.Monad
 | 
			
		||||
    import System.Random
 | 
			
		||||
 | 
			
		||||
    data W2VMethod = SkipGram | CBOW
 | 
			
		||||
    data Word2Vec = Word2Vec { docs :: [String]
 | 
			
		||||
@@ -27,8 +28,9 @@ module Sibe.Word2Vec
 | 
			
		||||
                     }
 | 
			
		||||
 | 
			
		||||
    word2vec w2v session = do
 | 
			
		||||
      seed <- newStdGen
 | 
			
		||||
      let s = session { training = trainingData
 | 
			
		||||
                      , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, one))
 | 
			
		||||
                      , network = randomNetwork 0 (-1, 1) v [(dimensions w2v, (id, one))] (v, (softmax, crossEntropy'))
 | 
			
		||||
                      }
 | 
			
		||||
 | 
			
		||||
      when (debug s) $ do
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user