diff --git a/README.md b/README.md index c83ced5..741a071 100644 --- a/README.md +++ b/README.md @@ -87,11 +87,27 @@ the thief robs the man the thief robs the woman ``` -The computed vectors are transformed to two dimensions using SVD: +The computed vectors are transformed to two dimensions using PCA: `king` and `queen` have a relation with `man` and `woman`, `love` and `hate` are close to each other, and `dwarf` and `thief` have a relation with `poisons` and `robs`, also, `dwarf` is close to `queen` and `king` while `thief` is closer to `man` and `woman`. `the` doesn't relate to anything. ![word2vec results](https://raw.githubusercontent.com/mdibaiee/sibe/master/w2v.png) -This is a very small dataset and I have to test it on larger datasets. +_You can reproduce this result using these parameters:_ +```haskell +let session = def { learningRate = 0.1 + , batchSize = 1 + , epochs = 10000 + , debug = True + } :: Session + w2v = def { docs = ds + , dimensions = 30 + , method = SkipGram + , window = 2 + , w2vDrawChart = True + , w2vChartName = "w2v.png" + } :: Word2Vec +``` + +This is a very small development dataset and I have to test it on larger datasets. diff --git a/examples/word2vec.hs b/examples/word2vec.hs index b09e1ca..457d7b8 100644 --- a/examples/word2vec.hs +++ b/examples/word2vec.hs @@ -32,32 +32,32 @@ module Main where sws <- lines <$> readFile "examples/stopwords" -- real data, takes a lot of time to train - ds <- do - files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/" - contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files + {-ds <- do-} + {-files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/"-} + {-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-} - let texts = map (unwords . splitOn " ") contents - let tags = ["", "", "", "", "", "", " "] - return $ map cleanText $ removeWords (sws ++ tags) texts + {-let texts = map (unwords . splitOn " ") contents-} + {-let tags = ["", "", "", "", "", "", " "]-} + {-return $ map cleanText $ removeWords (sws ++ tags) texts-} - {-let ds = ["the king loves the queen", "the queen loves the king",-} - {-"the dwarf hates the king", "the queen hates the dwarf",-} - {-"the dwarf poisons the king", "the dwarf poisons the queen",-} - {-"the man loves the woman", "the woman loves the man",-} - {-"the thief hates the man", "the woman hates the thief",-} - {-"the thief robs the man", "the thief robs the woman"]-} + let ds = ["the king loves the queen", "the queen loves the king", + "the dwarf hates the king", "the queen hates the dwarf", + "the dwarf poisons the king", "the dwarf poisons the queen", + "the man loves the woman", "the woman loves the man", + "the thief hates the man", "the woman hates the thief", + "the thief robs the man", "the thief robs the woman"] - let session = def { learningRate = 5e-1 + let session = def { learningRate = 0.1 , batchSize = 1 - , epochs = 200 + , epochs = 10000 , debug = True } :: Session w2v = def { docs = ds - , dimensions = 300 + , dimensions = 30 , method = SkipGram , window = 2 , w2vDrawChart = True - , w2vChartName = "w2v-big-data.png" + , w2vChartName = "w2v.png" } :: Word2Vec (computed, vocvec) <- word2vec w2v session diff --git a/sibe.cabal b/sibe.cabal index 0f84197..f3bbc51 100644 --- a/sibe.cabal +++ b/sibe.cabal @@ -16,7 +16,6 @@ cabal-version: >=1.10 library hs-source-dirs: src exposed-modules: Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils - ghc-options: -threaded -rtsopts -with-rtsopts=-N build-depends: base >= 4.7 && < 5 , hmatrix , random diff --git a/src/Sibe/Utils.hs b/src/Sibe/Utils.hs index 72108a6..d90cc0b 100644 --- a/src/Sibe/Utils.hs +++ b/src/Sibe/Utils.hs @@ -3,6 +3,7 @@ module Sibe.Utils , ordNub , onehot , average + , pca ) where import qualified Data.Vector.Storable as V import qualified Data.Set as Set @@ -26,3 +27,15 @@ module Sibe.Utils average :: Vector Double -> Vector Double average v = cmap (/ (V.sum v)) v + + pca :: Matrix Double -> Int -> Matrix Double + pca m d = + let rs = toRows m + means = map (\v -> V.sum v / fromIntegral (V.length v)) rs + meanReduced = map (\(a, b) -> V.map (+ (negate b)) a) $ zip rs means + mat = fromRows meanReduced + + (u, s, v) = svd mat + diagS = diagRect 0 s (rows mat) (cols mat) + + in u ?? (All, Take d) <> diagS ?? (Take d, Take d) diff --git a/src/Sibe/Word2Vec.hs b/src/Sibe/Word2Vec.hs index 7c52669..5b7586a 100644 --- a/src/Sibe/Word2Vec.hs +++ b/src/Sibe/Word2Vec.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE ScopedTypeVariables #-} + module Sibe.Word2Vec ( word2vec , Word2Vec (..) @@ -9,7 +11,7 @@ module Sibe.Word2Vec import Data.Char import Data.Maybe import Data.List - import Numeric.LinearAlgebra hiding (find) + import Numeric.LinearAlgebra as H hiding (find) import qualified Data.Vector.Storable as V import Data.Default.Class import Data.Function (on) @@ -59,21 +61,17 @@ module Sibe.Word2Vec let computedVocVec = map (\(w, v) -> (w, runLayer' v hidden)) vocvec when (w2vDrawChart w2v) $ do - let mat = fromColumns . map snd $ computedVocVec - (u, s, v) = svd mat - cut = subMatrix (0, 0) (2, cols mat) - diagS = diagRect 0 (V.take 2 s) (rows mat) (cols mat) - - twoDimensions = cut $ u <> diagS <> tr v - textData = zipWith (\s l -> (V.head l, V.last l, s)) (map fst computedVocVec) (toColumns twoDimensions) + let m = fromRows . map snd $ computedVocVec + twoDimensions = pca m 2 + textData = zipWith (\s l -> (V.head l, V.last l, s)) (map fst computedVocVec) (toRows twoDimensions) chart = toRenderable layout where - textP = plot_annotation_values .~ textData + textP = plot_annotation_values .~ textData $ def layout = layout_title .~ "word vectors" - $ layout_plots .~ [toPlot textP] - $ def + $ layout_plots .~ [toPlot textP] + $ def renderableToFile def (w2vChartName w2v) chart return () diff --git a/w2v.png b/w2v.png index 0183921..7eecb27 100644 Binary files a/w2v.png and b/w2v.png differ