sibe/examples/word2vec.hs

89 lines
3.0 KiB
Haskell
Raw Normal View History

{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Main where
import Sibe
import Sibe.Word2Vec
import Sibe.Utils
import Data.Default.Class
import qualified Data.Vector.Storable as V
import Data.List (sortBy)
import Data.Function (on)
import Numeric.LinearAlgebra
import System.IO
import System.Directory
import Data.List.Split
import Control.Exception (evaluate)
import Debug.Trace
import Data.Char
import System.Random
rf :: FilePath -> IO String
rf p = do
hs <- openFile p ReadMode
hSetEncoding hs latin1
content <- evaluate =<< hGetContents hs
length content `seq` hClose hs
return content
main = do
setStdGen (mkStdGen 100)
sws <- lines <$> readFile "examples/stopwords"
-- real data, takes a lot of time to train
2016-10-03 15:52:55 +00:00
ds <- do
files <- filter ((/= "xml") . take 1 . reverse) <$> listDirectory "examples/blogs-corpus/"
contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files
2016-10-03 15:52:55 +00:00
let texts = map (unwords . splitOn "&nbsp;") contents
let tags = ["<Blog>", "</Blog>", "<date>", "</date>", "<post>", "</post>", "&nbsp;"]
return $ map cleanText $ removeWords (sws ++ tags) texts
2016-10-03 15:52:55 +00:00
{-let ds = ["the king loves the queen", "the queen loves the king",-}
{-"the dwarf hates the king", "the queen hates the dwarf",-}
{-"the dwarf poisons the king", "the dwarf poisons the queen",-}
{-"the man loves the woman", "the woman loves the man",-}
{-"the thief hates the man", "the woman hates the thief",-}
{-"the thief robs the man", "the thief robs the woman"]-}
2016-10-01 08:54:36 +00:00
let session = def { learningRate = 5e-1
, batchSize = 1
2016-10-03 15:52:55 +00:00
, epochs = 200
, debug = True
} :: Session
w2v = def { docs = ds
2016-10-03 15:52:55 +00:00
, dimensions = 300
, method = SkipGram
, window = 2
2016-10-01 08:54:36 +00:00
, w2vDrawChart = True
2016-10-03 15:52:55 +00:00
, w2vChartName = "w2v-big-data.png"
} :: Word2Vec
(computed, vocvec) <- word2vec w2v session
return ()
cleanText :: String -> String
cleanText string =
let notag = unwords $ filter ((/= "<date>") . take 6) (words string)
ws = unwords $ filter (`notElem` ["urlLink"]) (words notag)
spacify = foldl (\acc x -> replace x ' ' acc) (trim ws) [',', '/', '-', '\n', '\r', '?', '.', '(', ')', '%', '$', '"', ';', ':', '!', '\'']
nonumber = filter (not . isNumber) spacify
lower = map toLower nonumber
in unwords . words $ lower
where
trim = f . f
where
f = reverse . dropWhile isSpace
replace needle replacement =
map (\c -> if c == needle then replacement else c)
removeWords :: [String] -> [String] -> [String]
removeWords ws documents =
map rm documents
where
rm text =
unwords $ filter (`notElem` ws) (words text)