{-# LANGUAGE RecordWildCards #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE ScopedTypeVariables #-} module Main where import Sibe import Sibe.Word2Vec import Sibe.Utils import Data.Default.Class import qualified Data.Vector.Storable as V import Data.List (sortBy) import Data.Function (on) import Numeric.LinearAlgebra import System.IO import System.Directory import Data.List.Split import Control.Exception (evaluate) import Debug.Trace import Data.Char import System.Random rf :: FilePath -> IO String rf p = do hs <- openFile p ReadMode hSetEncoding hs latin1 content <- evaluate =<< hGetContents hs length content `seq` hClose hs return content main = do setStdGen (mkStdGen 100) sws <- lines <$> readFile "examples/stopwords" -- real data, takes a lot of time to train {-ds <- do-} {-files <- filter ((/= "xml") . take 3 . reverse) <$> listDirectory "examples/blogs-corpus/"-} {-contents <- mapM (rf . ("examples/blogs-corpus/" ++)) files-} {-let texts = map (unwords . splitOn " ") contents-} {-let tags = ["", "", "", "", "", "", " "]-} {-return $ map cleanText $ removeWords (sws ++ tags) texts-} let ds = ["the king loves the queen", "the queen loves the king", "the dwarf hates the king", "the queen hates the dwarf", "the dwarf poisons the king", "the dwarf poisons the queen"] let session = def { learningRate = 1e-1 , batchSize = 1 , epochs = 200 , debug = True } :: Session w2v = def { docs = ds , dimensions = 25 , method = SkipGram , window = 2 } :: Word2Vec (computed, vocvec) <- word2vec w2v session mapM_ (\(w, v) -> do putStr $ w ++ ": " let similarities = map (similarity v . snd) computed let sorted = sortBy (compare `on` similarity v . snd) computed print . take 2 . drop 1 . reverse $ map fst sorted ) computed return () cleanText :: String -> String cleanText string = let notag = unwords $ filter ((/= "") . take 6) (words string) ws = unwords $ filter (`notElem` ["urlLink"]) (words notag) spacify = foldl (\acc x -> replace x ' ' acc) (trim ws) [',', '/', '-', '\n', '\r', '?', '.', '(', ')', '%', '$', '"', ';', ':', '!', '\''] nonumber = filter (not . isNumber) spacify lower = map toLower nonumber in unwords . words $ lower where trim = f . f where f = reverse . dropWhile isSpace replace needle replacement = map (\c -> if c == needle then replacement else c) removeWords :: [String] -> [String] -> [String] removeWords ws documents = map rm documents where rm text = unwords $ filter (`notElem` ws) (words text)