-- sibe/examples/word2vec.hs

{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}

module Main where
  import Numeric.Sibe
  import Numeric.Sibe.Word2Vec
  import Numeric.Sibe.Utils
  import Data.Default.Class
  import qualified Data.Vector.Storable as V
  import Data.List (sortBy)
  import Data.Function (on)
  import Numeric.LinearAlgebra
  import System.IO
  import System.Directory
  import Data.List.Split
  import Control.Exception (evaluate)
  import Debug.Trace
  import Data.Char
  import System.Random

  -- | Read the whole file at the given path, decoding it as Latin-1 text.
  --
  -- The contents are fully forced before the handle is released, so the
  -- returned 'String' never depends on an already-closed handle. 'withFile'
  -- guarantees the handle is closed even when setting the encoding or
  -- reading the file throws.
  rf :: FilePath -> IO String
  rf p =
    withFile p ReadMode $ \hs -> do
      hSetEncoding hs latin1
      content <- hGetContents hs
      -- 'hGetContents' is lazy: walk the entire string while the handle is
      -- still open, otherwise the text would be cut short on close.
      _ <- evaluate (length content)
      return content

  -- | Entry point of the word2vec example.
  --
  -- Seeds the global random generator with a fixed value, reads the stop-word
  -- list, builds a 'Session' and a 'Word2Vec' configuration over a small
  -- hard-coded corpus, and passes both to 'word2vec'. The values returned by
  -- 'word2vec' are not used afterwards.
  main :: IO ()
  main = do
    -- Fixed seed for the global generator (presumably for reproducible runs).
    setStdGen (mkStdGen 100)
    -- Only referenced by the commented-out real-data section below; the read
    -- still fails early if "examples/stopwords" is missing.
    sws <- lines <$> readFile "examples/stopwords"

    -- real data, takes a lot of time to train
    {-ds <- do-}
        {-files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
        {-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}

        {-let texts = map (unwords . splitOn "&nbsp;") contents-}
        {-let tags = ["<Blog>", "</Blog>", "<date>", "</date>", "<post>", "</post>", "&nbsp;"]-}
        {-return $ map cleanText $ removeWords (sws ++ tags) texts-}

    -- Toy corpus: twelve short sentences used instead of the real data above.
    let ds = ["the king loves the queen", "the queen loves the king",
              "the dwarf hates the king", "the queen hates the dwarf",
              "the dwarf poisons the king", "the dwarf poisons the queen",
              "the man loves the woman", "the woman loves the man",
              "the thief hates the man", "the woman hates the thief",
              "the thief robs the man", "the thief robs the woman"]

    -- Training session settings; every other field keeps its 'def' value.
    let session = def { learningRate = 0.1
                      , batchSize = 1
                      , epochs = 10000
                      , debug = True
                      } :: Session
        -- Model settings: skip-gram over the toy corpus, chart drawing
        -- switched on with the file name "w2v.png".
        w2v = def { docs = ds
                  , dimensions = 30
                  , method = SkipGram
                  , window = 2
                  , w2vDrawChart = True
                  , w2vChartName = "w2v.png"
                  } :: Word2Vec

    -- Both results are currently unused; the underscore prefix documents
    -- that and silences unused-binding warnings.
    (_computed, _vocvec) <- word2vec w2v session

    return ()

  -- | Normalise a raw document into lower-case, space-separated words.
  --
  -- Steps, in order:
  --
  --   1. split on whitespace and drop every word whose first six characters
  --      are @<date>@, as well as the exact word @urlLink@;
  --   2. turn each listed punctuation character into a space;
  --   3. delete every character for which 'isNumber' holds;
  --   4. lower-case the remaining characters;
  --   5. collapse whitespace so words are separated by exactly one space.
  --
  -- The punctuation replacement is a single pass with a membership test
  -- rather than one full traversal per character.
  cleanText :: String -> String
  cleanText string =
    unwords
      . words
      . map toLower
      . filter (not . isNumber)
      . map despace
      . unwords
      . filter wanted
      . words
      $ string
    where
      -- Words that survive the tag / marker filtering.
      wanted w = take 6 w /= "<date>" && w /= "urlLink"
      -- Characters that are treated as word separators.
      separators = ",/-\n\r?.()%$\";:!'"
      despace c
        | c `elem` separators = ' '
        | otherwise = c
  -- | Remove every occurrence of the given words from each document.
  --
  -- Each document is split with 'words' and re-joined with 'unwords', so any
  -- run of whitespace in the input ends up as a single space in the output.
  removeWords :: [String] -> [String] -> [String]
  removeWords ws documents =
    [ unwords [ token | token <- words doc, token `notElem` ws ]
    | doc <- documents
    ]
relu: run notmnist using relu activation and draw the chart [wip] word2vec: work in progress implementation of word2vec 2016-09-13 05:19:44 +00:00			`{-# LANGUAGE RecordWildCards #-}`
			`{-# LANGUAGE FlexibleContexts #-}`
			`{-# LANGUAGE ScopedTypeVariables #-}`

			`module Main where`
feat(Numeric): move all modules to Numeric 2016-10-16 22:24:35 +00:00			`import Numeric.Sibe`
			`import Numeric.Sibe.Word2Vec`
			`import Numeric.Sibe.Utils`
relu: run notmnist using relu activation and draw the chart [wip] word2vec: work in progress implementation of word2vec 2016-09-13 05:19:44 +00:00			`import Data.Default.Class`
			`import qualified Data.Vector.Storable as V`
			`import Data.List (sortBy)`
			`import Data.Function (on)`
			`import Numeric.LinearAlgebra`
			`import System.IO`
feat(blogs-corpus): new corpus for word2vec 2016-09-19 11:30:45 +00:00			`import System.Directory`
relu: run notmnist using relu activation and draw the chart [wip] word2vec: work in progress implementation of word2vec 2016-09-13 05:19:44 +00:00			`import Data.List.Split`
feat(blogs-corpus): new corpus for word2vec 2016-09-19 11:30:45 +00:00			`import Control.Exception (evaluate)`
			`import Debug.Trace`
			`import Data.Char`
			`import System.Random`

			`rf :: FilePath -> IO String`
			`rf p = do`
			`hs <- openFile p ReadMode`
			`hSetEncoding hs latin1`
			`content <- evaluate =<< hGetContents hs`
			length content `seq` hClose hs
			`return content`
relu: run notmnist using relu activation and draw the chart [wip] word2vec: work in progress implementation of word2vec 2016-09-13 05:19:44 +00:00
			`main = do`
feat(blogs-corpus): new corpus for word2vec 2016-09-19 11:30:45 +00:00			`setStdGen (mkStdGen 100)`
rm(sin): remove sin example fix(ignoreBiases): was ignoring nodes, lol fix(w2v): better logging and implementation 2016-09-16 09:01:23 +00:00			`sws <- lines <$> readFile "examples/stopwords"`
feat(blogs-corpus): new corpus for word2vec 2016-09-19 11:30:45 +00:00
			`-- real data, takes a lot of time to train`
feat(pca): implement PCA and visualize data using it 2016-10-11 12:58:09 +00:00			`{-ds <- do-}`
			`{-files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/"-}`
			`{-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}`
feat(blogs-corpus): new corpus for word2vec 2016-09-19 11:30:45 +00:00
feat(pca): implement PCA and visualize data using it 2016-10-11 12:58:09 +00:00			`{-let texts = map (unwords . splitOn " ") contents-}`
			`{-let tags = ["<Blog>", "</Blog>", "<date>", "</date>", "<post>", "</post>", " "]-}`
			`{-return $ map cleanText $ removeWords (sws ++ tags) texts-}`
feat(blogs-corpus): new corpus for word2vec 2016-09-19 11:30:45 +00:00
feat(pca): implement PCA and visualize data using it 2016-10-11 12:58:09 +00:00			`let ds = ["the king loves the queen", "the queen loves the king",`
			`"the dwarf hates the king", "the queen hates the dwarf",`
			`"the dwarf poisons the king", "the dwarf poisons the queen",`
			`"the man loves the woman", "the woman loves the man",`
			`"the thief hates the man", "the woman hates the thief",`
			`"the thief robs the man", "the thief robs the woman"]`
rm(sin): remove sin example fix(ignoreBiases): was ignoring nodes, lol fix(w2v): better logging and implementation 2016-09-16 09:01:23 +00:00
feat(pca): implement PCA and visualize data using it 2016-10-11 12:58:09 +00:00			`let session = def { learningRate = 0.1`
fix(word2vec): simple example of word2vec 2016-09-16 09:33:15 +00:00			`, batchSize = 1`
feat(pca): implement PCA and visualize data using it 2016-10-11 12:58:09 +00:00			`, epochs = 10000`
fix(word2vec): simple example of word2vec 2016-09-16 09:33:15 +00:00			`, debug = True`
relu: run notmnist using relu activation and draw the chart [wip] word2vec: work in progress implementation of word2vec 2016-09-13 05:19:44 +00:00			`} :: Session`
rm(sin): remove sin example fix(ignoreBiases): was ignoring nodes, lol fix(w2v): better logging and implementation 2016-09-16 09:01:23 +00:00			`w2v = def { docs = ds`
feat(pca): implement PCA and visualize data using it 2016-10-11 12:58:09 +00:00			`, dimensions = 30`
rm(sin): remove sin example fix(ignoreBiases): was ignoring nodes, lol fix(w2v): better logging and implementation 2016-09-16 09:01:23 +00:00			`, method = SkipGram`
fix(word2vec): simple example of word2vec 2016-09-16 09:33:15 +00:00			`, window = 2`
feat(w2v): draw text charts for words 2016-10-01 08:54:36 +00:00			`, w2vDrawChart = True`
feat(pca): implement PCA and visualize data using it 2016-10-11 12:58:09 +00:00			`, w2vChartName = "w2v.png"`
rm(sin): remove sin example fix(ignoreBiases): was ignoring nodes, lol fix(w2v): better logging and implementation 2016-09-16 09:01:23 +00:00			`} :: Word2Vec`

			`(computed, vocvec) <- word2vec w2v session`
relu: run notmnist using relu activation and draw the chart [wip] word2vec: work in progress implementation of word2vec 2016-09-13 05:19:44 +00:00
			`return ()`
rm(sin): remove sin example fix(ignoreBiases): was ignoring nodes, lol fix(w2v): better logging and implementation 2016-09-16 09:01:23 +00:00
feat(blogs-corpus): new corpus for word2vec 2016-09-19 11:30:45 +00:00			`cleanText :: String -> String`
			`cleanText string =`
			`let notag = unwords $ filter ((/= "<date>") . take 6) (words string)`
			ws = unwords $ filter (`notElem` ["urlLink"]) (words notag)
			`spacify = foldl (\acc x -> replace x ' ' acc) (trim ws) [',', '/', '-', '\n', '\r', '?', '.', '(', ')', '%', '$', '"', ';', ':', '!', '\'']`
			`nonumber = filter (not . isNumber) spacify`
			`lower = map toLower nonumber`
			`in unwords . words $ lower`
			`where`
			`trim = f . f`
			`where`
			`f = reverse . dropWhile isSpace`
			`replace needle replacement =`
			`map (\c -> if c == needle then replacement else c)`

rm(sin): remove sin example fix(ignoreBiases): was ignoring nodes, lol fix(w2v): better logging and implementation 2016-09-16 09:01:23 +00:00			`removeWords :: [String] -> [String] -> [String]`
			`removeWords ws documents =`
feat(blogs-corpus): new corpus for word2vec 2016-09-19 11:30:45 +00:00			`map rm documents`
rm(sin): remove sin example fix(ignoreBiases): was ignoring nodes, lol fix(w2v): better logging and implementation 2016-09-16 09:01:23 +00:00			`where`
feat(blogs-corpus): new corpus for word2vec 2016-09-19 11:30:45 +00:00			`rm text =`
			unwords $ filter (`notElem` ws) (words text)
rm(sin): remove sin example fix(ignoreBiases): was ignoring nodes, lol fix(w2v): better logging and implementation 2016-09-16 09:01:23 +00:00