feat(pca): implement PCA and visualize data using it

This commit is contained in:
Mahdi Dibaiee 2016-10-11 16:28:09 +03:30
parent 7f90afba7f
commit 6da6e4fd09
6 changed files with 56 additions and 30 deletions

View File

@ -87,11 +87,27 @@ the thief robs the man
the thief robs the woman the thief robs the woman
``` ```
The computed vectors are transformed to two dimensions using SVD: The computed vectors are transformed to two dimensions using PCA:
`king` and `queen` have a relation with `man` and `woman`, `love` and `hate` are close to each other, `king` and `queen` have a relation with `man` and `woman`, `love` and `hate` are close to each other,
and `dwarf` and `thief` have a relation with `poisons` and `robs`, also, `dwarf` is close to `queen` and `king` while and `dwarf` and `thief` have a relation with `poisons` and `robs`, also, `dwarf` is close to `queen` and `king` while
`thief` is closer to `man` and `woman`. `the` doesn't relate to anything. `thief` is closer to `man` and `woman`. `the` doesn't relate to anything.
![word2vec results](https://raw.githubusercontent.com/mdibaiee/sibe/master/w2v.png) ![word2vec results](https://raw.githubusercontent.com/mdibaiee/sibe/master/w2v.png)
This is a very small dataset and I have to test it on larger datasets. _You can reproduce this result using these parameters:_
```haskell
let session = def { learningRate = 0.1
, batchSize = 1
, epochs = 10000
, debug = True
} :: Session
w2v = def { docs = ds
, dimensions = 30
, method = SkipGram
, window = 2
, w2vDrawChart = True
, w2vChartName = "w2v.png"
} :: Word2Vec
```
This is a very small development dataset and I have to test it on larger datasets.

View File

@ -32,32 +32,32 @@ module Main where
sws <- lines <$> readFile "examples/stopwords" sws <- lines <$> readFile "examples/stopwords"
-- real data, takes a lot of time to train -- real data, takes a lot of time to train
ds <- do {-ds <- do-}
files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/" {-files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/"-}
contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files {-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-}
let texts = map (unwords . splitOn "&nbsp;") contents {-let texts = map (unwords . splitOn "&nbsp;") contents-}
let tags = ["<Blog>", "</Blog>", "<date>", "</date>", "<post>", "</post>", "&nbsp;"] {-let tags = ["<Blog>", "</Blog>", "<date>", "</date>", "<post>", "</post>", "&nbsp;"]-}
return $ map cleanText $ removeWords (sws ++ tags) texts {-return $ map cleanText $ removeWords (sws ++ tags) texts-}
{-let ds = ["the king loves the queen", "the queen loves the king",-} let ds = ["the king loves the queen", "the queen loves the king",
{-"the dwarf hates the king", "the queen hates the dwarf",-} "the dwarf hates the king", "the queen hates the dwarf",
{-"the dwarf poisons the king", "the dwarf poisons the queen",-} "the dwarf poisons the king", "the dwarf poisons the queen",
{-"the man loves the woman", "the woman loves the man",-} "the man loves the woman", "the woman loves the man",
{-"the thief hates the man", "the woman hates the thief",-} "the thief hates the man", "the woman hates the thief",
{-"the thief robs the man", "the thief robs the woman"]-} "the thief robs the man", "the thief robs the woman"]
let session = def { learningRate = 5e-1 let session = def { learningRate = 0.1
, batchSize = 1 , batchSize = 1
, epochs = 200 , epochs = 10000
, debug = True , debug = True
} :: Session } :: Session
w2v = def { docs = ds w2v = def { docs = ds
, dimensions = 300 , dimensions = 30
, method = SkipGram , method = SkipGram
, window = 2 , window = 2
, w2vDrawChart = True , w2vDrawChart = True
, w2vChartName = "w2v-big-data.png" , w2vChartName = "w2v.png"
} :: Word2Vec } :: Word2Vec
(computed, vocvec) <- word2vec w2v session (computed, vocvec) <- word2vec w2v session

View File

@ -16,7 +16,6 @@ cabal-version: >=1.10
library library
hs-source-dirs: src hs-source-dirs: src
exposed-modules: Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils exposed-modules: Sibe, Sibe.NaiveBayes, Sibe.NLP, Sibe.Word2Vec, Sibe.Utils
ghc-options: -threaded -rtsopts -with-rtsopts=-N
build-depends: base >= 4.7 && < 5 build-depends: base >= 4.7 && < 5
, hmatrix , hmatrix
, random , random

View File

@ -3,6 +3,7 @@ module Sibe.Utils
, ordNub , ordNub
, onehot , onehot
, average , average
, pca
) where ) where
import qualified Data.Vector.Storable as V import qualified Data.Vector.Storable as V
import qualified Data.Set as Set import qualified Data.Set as Set
@ -26,3 +27,15 @@ module Sibe.Utils
average :: Vector Double -> Vector Double average :: Vector Double -> Vector Double
average v = cmap (/ (V.sum v)) v average v = cmap (/ (V.sum v)) v
pca :: Matrix Double -> Int -> Matrix Double
pca m d =
let rs = toRows m
means = map (\v -> V.sum v / fromIntegral (V.length v)) rs
meanReduced = map (\(a, b) -> V.map (+ (negate b)) a) $ zip rs means
mat = fromRows meanReduced
(u, s, v) = svd mat
diagS = diagRect 0 s (rows mat) (cols mat)
in u ?? (All, Take d) <> diagS ?? (Take d, Take d)

View File

@ -1,3 +1,5 @@
{-# LANGUAGE ScopedTypeVariables #-}
module Sibe.Word2Vec module Sibe.Word2Vec
( word2vec ( word2vec
, Word2Vec (..) , Word2Vec (..)
@ -9,7 +11,7 @@ module Sibe.Word2Vec
import Data.Char import Data.Char
import Data.Maybe import Data.Maybe
import Data.List import Data.List
import Numeric.LinearAlgebra hiding (find) import Numeric.LinearAlgebra as H hiding (find)
import qualified Data.Vector.Storable as V import qualified Data.Vector.Storable as V
import Data.Default.Class import Data.Default.Class
import Data.Function (on) import Data.Function (on)
@ -59,13 +61,9 @@ module Sibe.Word2Vec
let computedVocVec = map (\(w, v) -> (w, runLayer' v hidden)) vocvec let computedVocVec = map (\(w, v) -> (w, runLayer' v hidden)) vocvec
when (w2vDrawChart w2v) $ do when (w2vDrawChart w2v) $ do
let mat = fromColumns . map snd $ computedVocVec let m = fromRows . map snd $ computedVocVec
(u, s, v) = svd mat twoDimensions = pca m 2
cut = subMatrix (0, 0) (2, cols mat) textData = zipWith (\s l -> (V.head l, V.last l, s)) (map fst computedVocVec) (toRows twoDimensions)
diagS = diagRect 0 (V.take 2 s) (rows mat) (cols mat)
twoDimensions = cut $ u <> diagS <> tr v
textData = zipWith (\s l -> (V.head l, V.last l, s)) (map fst computedVocVec) (toColumns twoDimensions)
chart = toRenderable layout chart = toRenderable layout
where where

BIN
w2v.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

After

Width:  |  Height:  |  Size: 14 KiB