From e3e3fecf4da260a43dc13dcd7688ed452e9d233a Mon Sep 17 00:00:00 2001
From: Mahdi Dibaiee
Date: Sun, 31 Mar 2019 09:52:00 +0430
Subject: [PATCH] refactor: working version with command-line utilities

---
 .gitignore         |   2 +-
 INSTALL.md         |   2 +
 checkpoints/a.hdf5 | Bin 14416 -> 0 bytes
 checkpoints/b.hdf5 | Bin 17296 -> 0 bytes
 constants.py       |  63 +++++++++++++++++++
 demo.py            |  19 ------
 draw.py            |  52 ++++++----------
 floyd.yml          |  23 -------
 model.py           | 144 +++++++++++++++++++++++++++++++++++++++++++
 nn.py              | 149 ---------------------------------------------
 plot.py            |  28 ---------
 predict.py         |  41 +++++++------
 requirements.txt   |   4 ++
 tracks             |  89 ---------------------------
 train.py           |  67 ++++++++++++++++++++
 utils.py           |  81 ++++++++++++------------
 16 files changed, 361 insertions(+), 403 deletions(-)
 delete mode 100644 checkpoints/a.hdf5
 delete mode 100644 checkpoints/b.hdf5
 delete mode 100644 demo.py
 delete mode 100644 floyd.yml
 create mode 100644 model.py
 delete mode 100644 nn.py
 delete mode 100644 plot.py
 delete mode 100644 tracks
 create mode 100644 train.py

diff --git a/.gitignore b/.gitignore
index b0b8f6b..7962fef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 maps
 logs
-checkpoints.*
+checkpoints
 geodata
 *.p
 #### joe made this: http://goel.io/joe

diff --git a/INSTALL.md b/INSTALL.md
index aa1ceb6..bc01bcd 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -2,5 +2,7 @@
 pyenv install $(cat .python-version)
 pyenv local
 pip install -r requirements.txt
+apt install proj-bin libproj-dev # https://proj4.org/install.html#install
+apt install libgeos-3.6.2 libgeos-dev libgeos++-dev # https://packages.ubuntu.com/search?keywords=geos&searchon=sourcenames&suite=all&section=all
 ```

diff --git a/checkpoints/a.hdf5 b/checkpoints/a.hdf5
deleted file mode 100644
index f016d673004ee8d10bcd1606100e79e20a1d025c..0000000000000000000000000000000000000000
GIT binary patch
[base85-encoded binary data omitted]
[binary patch deleting checkpoints/b.hdf5 omitted]

diff --git a/constants.py b/constants.py
index 7cd3b38..24f6e8e 100644
--- a/constants.py
+++ b/constants.py
@@ -16,3 +16,66 @@ WINTER_MONTHS = ['december', 'january', 'february']
 SPRING_MONTHS = ['march', 'april', 'may']
 SUMMER_MONTHS = ['june', 'july', 'august']
 AUTUMN_MONTHS = ['september', 'november',
'october'] + +INPUTS = ['elevation', 'distance_to_water', 'latitude'] +OUTPUT = 'biome_num' + +BIOMES = [ + { + 'name': 'Tropical & Subtropical Moist Broadleaf Forests', + 'color': '#016936', + }, + { + 'name': 'Tropical & Subtropical Dry Broadleaf Forests', + 'color': '#B2D127', + }, + { + 'name': 'Tropical & Subtropical Coniferous Forests', + 'color': '#77CC00', + }, + { + 'name': 'Temperate Broadleaf & Mixed Forests', + 'color': '#99C500', + }, + { + 'name': 'Temperate Conifer Forests', + 'color': '#B6CC00', + }, + { + 'name': 'Boreal Forests/Taiga', + 'color': '#00C5B5', + }, + { + 'name': 'Tropical & Subtropical Grasslands, Savannas & Shrublands', + 'color': '#EFFF00', + }, + { + 'name': 'Temperate Grasslands, Savannas & Shrublands', + 'color': '#FFEE00', + }, + { + 'name': 'Flooded Grasslands & Savannas', + 'color': '#009BFF', + }, + { + 'name': 'Montane Grasslands & Shrublands', + 'color': '#A0ADBA', + }, + { + 'name': 'Tundra', + 'color': '#5C62FF', + }, + { + 'name': 'Mediterranean Forests, Woodlands & Scrub', + 'color': '#00850F', + }, + { + 'name': 'Deserts & Xeric Shrublands', + 'color': '#FF9E1F', + }, + { + 'name': 'Mangroves', + 'color': '#FF1F97' + } +] + diff --git a/demo.py b/demo.py deleted file mode 100644 index 2750630..0000000 --- a/demo.py +++ /dev/null @@ -1,19 +0,0 @@ -import pandas as pd -from utils import * - -df = pd.read_pickle('data_final.p') -df.to_csv('data_final.csv') - -print('DataFrame:') -print(df) - -dataset_size, features, output_size, _ = dataframe_to_dataset_biomes(df) -print('Biomes dataset:\n - size: {}\n - inputs: {}\n - outputs: {}\n'.format(dataset_size, features, output_size)) - -dataset_size, features, output_size, _ = dataframe_to_dataset_temp_precip(df) -print('Temp/Precip dataset:\n - size: {}\n - inputs: {}\n - outputs: {}\n'.format(dataset_size, features, output_size)) - -# print('Normalized Data:') -# print(normalize_df(df)) - -# normalize_df(df).to_csv('data_normalized.csv') diff --git a/draw.py b/draw.py index 20a4d79..d58585b 100644 --- a/draw.py +++ b/draw.py @@ -1,59 +1,43 @@ -from shapely.geometry import Point, MultiPoint -from shapely.ops import cascaded_union +import fire import matplotlib.pyplot as plt +from utils import logger +from constants import BIOMES import pandas as pd import cartopy.crs as ccrs def draw(df, path=None): + logger.debug('draw(df, %s)', path) biomes = {} biome_numbers = df['biome_num'].unique() - # biome_names = df['biome_name'].unique() for i, row in df.iterrows(): - p = Point(row.longitude, row.latitude) if row.biome_num in biomes: - biomes[row.biome_num].append(p) + biomes[row.biome_num]['x'].append(row.longitude) + biomes[row.biome_num]['y'].append(row.latitude) else: - biomes[row.biome_num] = [p] + biomes[row.biome_num] = { 'x': [row.longitude], 'y': [row.latitude] } ax = plt.axes(projection=ccrs.PlateCarree()) ax.stock_img() - # ax.legend(df['biome_name'].unique()) - colors={ - 0: '#016936', - 1: '#B2D127', - 2: '#77CC00', - 3: '#99C500', - 4: '#B6CC00', - 5: '#00C5B5', - 6: '#EFFF00', - 7: '#FFEE00', - 8: '#009BFF', - 9: '#A0ADBA', - 10: '#5C62FF', - 11: '#00850F', - 12: '#FF9E1F', - 13: '#FF1F97' - } for n in biome_numbers: - biomes[n] = MultiPoint(biomes[n]).buffer(0.5) - # print(biomes[n]) - # legend = biome_names[n] - if not hasattr(biomes[n], '__iter__'): - biomes[n] = [biomes[n]] - ax.add_geometries(biomes[n], ccrs.PlateCarree(), facecolor=colors[n]) - # artist.set_label(biome_names[n]) - # print(artist.get_label()) + xs = biomes[n]['x'] + ys = biomes[n]['y'] + scatter = ax.scatter(xs, ys, s=4, 
c=BIOMES[n]['color'], transform=ccrs.PlateCarree()) + scatter.set_label(BIOMES[n]['name']) - # ax.legend(artists, biome_names) + ax.legend() + figure = plt.gcf() + figure.set_size_inches(20, 18) if path: plt.savefig(path) else: plt.show() +def draw_cmd(path=None): + draw(pd.read_pickle('data.p'), path=path) + if __name__ == "__main__": - df = pd.read_pickle('data.p') - draw(df) + fire.Fire(draw_cmd) diff --git a/floyd.yml b/floyd.yml deleted file mode 100644 index 4c5c966..0000000 --- a/floyd.yml +++ /dev/null @@ -1,23 +0,0 @@ -# see: https://docs.floydhub.com/floyd_config -# All supported configs: -# -#machine: cpu -#env: tensorflow-1.8 -#input: -# - destination: input -# source: foo/datasets/yelp-food/1 -# - foo/datasets/yelp-food-test/1:test -#description: this is a test -#max_runtime: 3600 -#command: python train.py - -# You can also define multiple tasks to use with --task argument: -# -#task: -# evaluate: -# machine: gpu -# command: python evaluate.py -# -# serve: -# machine: cpu -# mode: serve diff --git a/model.py b/model.py new file mode 100644 index 0000000..c625987 --- /dev/null +++ b/model.py @@ -0,0 +1,144 @@ +from __future__ import absolute_import, division, print_function + +# TensorFlow and tf.keras +import tensorflow as tf +from tensorflow import keras + +# Helper libraries +import numpy as np +import pandas as pd + +from utils import * + +RANDOM_SEED = 1 + +logger.debug('Tensorflow version: %s', tf.__version__) +logger.debug('Random Seed: %s', RANDOM_SEED) + +tf.set_random_seed(RANDOM_SEED) +np.random.seed(RANDOM_SEED) + +DEFAULT_BATCH_SIZE=256 +DEFAULT_LAYERS = [512, 512] +DEFAULT_BUFFER_SIZE=500 +DEFAULT_OUT_ACTIVATION = tf.nn.softmax +DEFAULT_LOSS = 'sparse_categorical_crossentropy' +DEFAULT_OPTIMIZER = tf.keras.optimizers.Adam(lr=0.001) + +class Model(): + def __init__(self, name, epochs=1): + self.name = name + self.path = "checkpoints/{}.hdf5".format(name) + + self.epochs = epochs + + def prepare_dataset(self, df, fn, **kwargs): + self.dataset_fn = fn + + self.set_dataset(*fn(df), **kwargs) + + def set_dataset(self, dataset_size, features, output_size, class_weight, dataset, shuffle_buffer_size=DEFAULT_BUFFER_SIZE, batch_size=DEFAULT_BATCH_SIZE): + self.shuffle_buffer_size = shuffle_buffer_size + + self.class_weight = class_weight + self.dataset = dataset.shuffle(self.shuffle_buffer_size) + self.TRAIN_SIZE = int(dataset_size * 0.85) + self.TEST_SIZE = dataset_size - self.TRAIN_SIZE + (training, test) = (self.dataset.take(self.TRAIN_SIZE), + self.dataset.skip(self.TRAIN_SIZE)) + + logger.debug('Model dataset info: size=%s, train=%s, test=%s', dataset_size, self.TRAIN_SIZE, self.TEST_SIZE) + + self.dataset_size = dataset_size + self.features = features + self.output_size = output_size + self.training = training + self.test = test + + logger.debug('Model input size: %s', self.features) + logger.debug('Model output size: %s', self.output_size) + + self.batch_size = batch_size + self.training_batched = self.training.batch(self.batch_size).repeat() + self.test_batched = self.test.batch(self.batch_size).repeat() + + def create_model(self, layers=DEFAULT_LAYERS, out_activation=DEFAULT_OUT_ACTIVATION): + params = { + 'kernel_initializer': 'lecun_uniform', + 'bias_initializer': 'zeros', + # 'kernel_regularizer': keras.regularizers.l2(l=0.01) + 'input_shape': [self.features] + } + + activation = tf.nn.elu + + logger.debug('Model layer parameters: %s', params) + logger.debug('Model layer sizes: %s', layers) + logger.debug('Model layer activation function: %s', activation) + 
logger.debug('Model out activation function: %s', out_activation) + + + self.model = keras.Sequential([ + keras.layers.Dense(n, activation=activation, **params) for n in layers + ] + [ + keras.layers.Dense(self.output_size, activation=out_activation, **params) + ]) + + def compile(self, loss=DEFAULT_LOSS, metrics=['accuracy'], optimizer=DEFAULT_OPTIMIZER): + logger.debug('Model loss function: %s', loss) + logger.debug('Model optimizer: %s', optimizer) + logger.debug('Model metrics: %s', metrics) + + self.model.compile(loss=loss, + optimizer=optimizer, + metrics=metrics) + + def restore(self, path): + logger.debug('Restoring model weights from path: %s', path) + return self.model.load_weights(path) + + def save(self, path): + logger.debug('Saving model weights to path: %s', path) + self.model.save_weights(path) + return path + + def evaluate(self): + return self.model.evaluate( + self.test, + batch_size=self.batch_size, + steps=int(self.dataset_size / self.batch_size), + verbose=1 + ) + + def evaluate_print(self): + loss, accuracy = self.evaluate() + print('Test evaluation: loss: {}, accuracy: {}'.format(loss, accuracy)) + + def train(self, config): + self.model.summary() + + # map_callback = MapHistory() + + out = self.model.fit( + self.training_batched, + batch_size=self.batch_size, + epochs=self.epochs, + steps_per_epoch=int(self.TRAIN_SIZE / self.batch_size), + class_weight=self.class_weight, + validation_data=self.test_batched, + validation_steps=int(self.TEST_SIZE / self.batch_size), + verbose=1 + ) + + return out + + def predict(self, a): + return np.argmax(self.model.predict(a), axis=1) + + def prepare_for_use(self, df=None, batch_size=DEFAULT_BUFFER_SIZE, layers=DEFAULT_LAYERS, out_activation=DEFAULT_OUT_ACTIVATION, loss=DEFAULT_LOSS, optimizer=DEFAULT_OPTIMIZER): + if df is None: + df = pd.read_pickle('data.p') + self.prepare_dataset(df, dataframe_to_dataset_biomes, batch_size=batch_size) + self.create_model(layers=layers, out_activation=out_activation) + self.compile(loss=loss, optimizer=optimizer) + diff --git a/nn.py b/nn.py deleted file mode 100644 index 0f6dc0d..0000000 --- a/nn.py +++ /dev/null @@ -1,149 +0,0 @@ -from __future__ import absolute_import, division, print_function - -# TensorFlow and tf.keras -import tensorflow as tf -from tensorflow import keras - -# Helper libraries -import numpy as np -import matplotlib.pyplot as plt -import pandas as pd -import os.path - -from utils import * -# from predict import predicted_map - -RANDOM_SEED = 1 - -print(tf.__version__) - -# tf.enable_eager_execution() - -tf.set_random_seed(RANDOM_SEED) -np.random.seed(RANDOM_SEED) - -df = pd.read_pickle('data.p') - -class MapHistory(keras.callbacks.Callback): - def on_epoch_end(self, epoch, logs): - print('EPOCH', epoch) - predicted_map('maps/{}'.format(epoch)) - -class Model(): - def __init__(self, name, batch_size=16, shuffle_buffer_size=500, learning_rate=0.001, epochs=1): - self.name = name - self.path = "checkpoints/{}.hdf5".format(name) - - self.batch_size = batch_size - self.shuffle_buffer_size = shuffle_buffer_size - self.learning_rate = learning_rate - self.epochs = epochs - - def prepare_dataset(self, df, fn): - self.dataset_fn = fn - dataset_size, features, output_size, dataset = fn(df) - self.dataset = dataset.shuffle(self.shuffle_buffer_size) - self.TRAIN_SIZE = int(dataset_size * 0.85) - self.TEST_SIZE = dataset_size - self.TRAIN_SIZE - (training, test) = (self.dataset.take(self.TRAIN_SIZE).batch(self.batch_size).repeat(), - 
self.dataset.skip(self.TRAIN_SIZE).batch(self.batch_size).repeat()) - - # print(df.groupby(['biome_num']).agg({ 'biome_num': lambda x: x.count() / df.shape[0] })) - - print('dataset: size={}, train={}, test={}'.format(dataset_size, self.TRAIN_SIZE, self.TEST_SIZE)) - print('input_size={}'.format(features)) - - self.dataset_size = dataset_size - self.features = features - self.output_size = output_size - self.training = training - self.test = test - - def create_model(self, layers, out_activation=None): - params = { - 'kernel_initializer': 'lecun_uniform', - 'bias_initializer': 'zeros', - # 'kernel_regularizer': keras.regularizers.l2(l=0.01) - } - dropout = [keras.layers.Dropout(0.1, input_shape=[self.features])] - # dropout = [] - self.model = keras.Sequential(dropout + [ - keras.layers.Dense(layers[0], activation=tf.nn.elu, **params) - ] + [ - keras.layers.Dense(n, activation=tf.nn.elu, **params) for n in layers[1:] - ] + [ - keras.layers.Dense(self.output_size, activation=out_activation, **params) - ]) - - def compile(self, loss='mse', metrics=['accuracy'], optimizer=tf.train.AdamOptimizer, load_weights=True): - if load_weights: - self.model.load_weights(self.path) - - optimizer = optimizer(self.learning_rate) - - self.model.compile(loss=loss, - optimizer=optimizer, - metrics=metrics) - - def evaluate(self): - return self.model.evaluate( - self.test, - batch_size=self.batch_size, - steps=int(self.dataset_size / self.batch_size), - verbose=1 - ) - - def evaluate_print(self): - loss, accuracy = self.evaluate() - print('Test evaluation: loss: {}, accuracy: {}'.format(loss, accuracy)) - - def train(self): - self.model.summary() - - checkpoint = keras.callbacks.ModelCheckpoint(self.path, monitor='val_loss', verbose=1, mode='min', save_best_only=True) - tensorboard = keras.callbacks.TensorBoard(log_dir='./logs', update_freq='epoch') - # reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001) - # map_callback = MapHistory() - - self.model.fit( - self.training, - batch_size=self.batch_size, - epochs=self.epochs, - steps_per_epoch=int(self.TRAIN_SIZE / self.batch_size), - callbacks=[checkpoint, tensorboard], - validation_data=self.test, - validation_steps=int(self.TEST_SIZE / self.batch_size), - verbose=1 - ) - - def predict(self, a): - return np.argmax(self.model.predict(a), axis=1) - -A = Model('a', epochs=2) -B = Model('b', learning_rate=0.0005, epochs=50) - -# 24 so far -def compile_b(): - B.prepare_dataset(df, dataframe_to_dataset_biomes) - B.create_model([12], tf.nn.softmax) - B.compile(loss='sparse_categorical_crossentropy', load_weights=False) - -def compile_a(): - A.prepare_dataset(df, dataframe_to_dataset_temp_precip) - A.create_model([(4, tf.nn.elu)]) - # A.create_model([]) # linear model - A.compile(metrics=['accuracy', 'mae']) - -if __name__ == "__main__": - compile_b() - B.train() - - # for inp, out in B.test.take(1).make_one_shot_iterator(): - # print(inp, out) - - # print(np.unique(nums)) - # print(np.unique(predictions)) - # print('loss: {}, evaluation: {}'.format(*B.evaluate())) - - # compile_a() - # A.train() diff --git a/plot.py b/plot.py deleted file mode 100644 index 8002b4f..0000000 --- a/plot.py +++ /dev/null @@ -1,28 +0,0 @@ -import geopandas -import os -import rasterio -import pandas as pd -from matplotlib import pyplot - -directory = os.path.dirname(os.path.abspath(__file__)) - - -GEODATA = os.path.join(directory, 'geodata') -ECOREGIONS = os.path.join(GEODATA, 'ecoregions', 'Ecoregions2017.shp') -ELEVATION = 
os.path.join(GEODATA, 'srtm', 'topo30-180.tif') -TEMP = os.path.join(GEODATA, 'air_temp') - -temp = pd.read_csv(os.path.join(TEMP, 'air_temp.2017'), sep='\s+', header=None, names=['longitude', 'latitude', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'november', 'october', 'december', 'yearly_avg']) - -print(temp.head()) - -eco = geopandas.read_file(ECOREGIONS) -elevation = rasterio.open(ELEVATION) - -print(eco.head()) -print(elevation) - -eco.plot() -# rasterio.plot.show(src) -# pyplot.imshow(elevation.read(1)) -pyplot.show() diff --git a/predict.py b/predict.py index d2e17db..8a4fb7a 100644 --- a/predict.py +++ b/predict.py @@ -1,22 +1,20 @@ +import fire import numpy as np from utils import * -from nn import B, compile_b +#from nn import compile_b +from constants import INPUTS +from model import Model from draw import draw -import time -def chunker(seq, size): - return (seq[pos:pos + size] for pos in range(0, len(seq), size)) - - -def predicted_map(path=None): +def predicted_map(B, change=0, path=None): year = MAX_YEAR - 1 df = pd.read_pickle('data.p') - print('TEMPERATURE MODIFICATION OF {}'.format(change)) + logger.info('temperature change of %s', change) - inputs = ['elevation', 'distance_to_water', 'latitude'] + inputs = list(INPUTS) for season in SEASONS: inputs += [ @@ -24,34 +22,37 @@ def predicted_map(path=None): 'precip_{}_{}'.format(season, year) ] - print(inputs) - - # print(inputs) frame = df[inputs + ['longitude']] - # print(frame.head()) + frame_cp = df[inputs + ['longitude']] for season in SEASONS: frame.loc[:, 'temp_{}_{}'.format(season, year)] += change columns = ['latitude', 'longitude', 'biome_num'] new_data = pd.DataFrame(columns=columns) + nframe = pd.DataFrame(columns=frame.columns, data=normalize_ndarray(frame.to_numpy(), frame_cp.to_numpy())) - for i, chunk in enumerate(chunker(frame, B.batch_size)): + for i, (chunk, chunk_original) in enumerate(zip(chunker(nframe, B.batch_size), chunker(frame, B.batch_size))): if chunk.shape[0] < B.batch_size: continue - input_data = normalize_ndarray(chunk.loc[:, inputs].values) + input_data = chunk.loc[:, inputs].values out = B.predict(input_data) f = pd.DataFrame({ - 'longitude': chunk.loc[:, 'longitude'], - 'latitude': chunk.loc[:, 'latitude'], + 'longitude': chunk_original.loc[:, 'longitude'], + 'latitude': chunk_original.loc[:, 'latitude'], 'biome_num': out }, columns=columns) new_data = new_data.append(f) draw(new_data, path=path) -if __name__ == "__main__": - compile_b() - predicted_map() +def predicted_map_cmd(checkpoint='checkpoints/save.h5', change=0, path=None): + B = Model('b', epochs=1) + B.prepare_for_use() + B.restore(checkpoint) + predicted_map(B, change=change, path=path) + +if __name__ == "__main__": + fire.Fire(predicted_map_cmd) diff --git a/requirements.txt b/requirements.txt index 3cc402f..f98b119 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,7 @@ rasterio==1.0.15 tensorflow==1.13.1 Cartopy==0.17.0 numpy==1.16.1 +scikit-learn==0.20.3 +https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp36-cp36m-manylinux1_x86_64.whl +fire==0.1.3 +psutil==5.6.1 diff --git a/tracks b/tracks deleted file mode 100644 index 2ebbd79..0000000 --- a/tracks +++ /dev/null @@ -1,89 +0,0 @@ -Layer (type) Output Shape Param # -================================================================= -Group 1 ------------------------------------------------------------------ -dense (Dense) (None, 128) 1536 
-_________________________________________________________________ -dense_1 (Dense) (None, 256) 33024 -_________________________________________________________________ -dense_2 (Dense) (None, 14) 3598 ------------------------------------------------------------------ -Total params: 38,158 -1 Epoch: loss: 0.3822 - acc: 0.8684 -Learning rate: 0.005 -================================================================= - -Group 2 ------------------------------------------------------------------ -dense (Dense) (None, 32) 384 -_________________________________________________________________ -dense_1 (Dense) (None, 64) 2112 -_________________________________________________________________ -dense_2 (Dense) (None, 32) 2080 -_________________________________________________________________ -dense_3 (Dense) (None, 14) 462 ------------------------------------------------------------------ -Total params: 5,038 -1 Epoch: loss: 0.3760 - acc: 0.8678 @ 20minutes -Stopped converging, loss increasing -Learning rate: 0.005 -================================================================= - -Group 3 ------------------------------------------------------------------ -dense (Dense) (None, 16) 192 -_________________________________________________________________ -dense_1 (Dense) (None, 32) 544 -_________________________________________________________________ -dense_2 (Dense) (None, 16) 528 -_________________________________________________________________ -dense_3 (Dense) (None, 14) 238 ------------------------------------------------------------------ -Total params: 1,502 -1 Epoch: loss: 0.3702 - acc: 0.8671 @ 12minutes -10 Epochs: loss: 0.3280 - acc: 0.8815 -Stopped converging after 5 epochs, was oscillating -Learning rate: 0.005 -================================================================= - -Group 4 -_________________________________________________________________ -dense (Dense) (None, 12) 144 -_________________________________________________________________ -dense_1 (Dense) (None, 14) 182 -_________________________________________________________________ -Total params: 326 -1 Epoch: loss: 0.4412 - acc: 0.8457 @ 10m -60 Epochs: loss: 0.4146 - acc: 0.8546 -Stopped converging -Learning rate: 0.005 -================================================================= - -Group 5 -_________________________________________________________________ -dense (Dense) (None, 12) 144 -_________________________________________________________________ -dense_1 (Dense) (None, 14) 182 -_________________________________________________________________ -Total params: 326 -1 Epoch: loss: 0.5057 - acc: 0.8268 @ 10m -15 epoch: loss: 0.4240 - acc: 0.8481 -Stopped converging -Learning rate: 0.001 -================================================================= - -Group 6 -_________________________________________________________________ -Layer (type) Output Shape Param # -================================================================= -dense (Dense) (None, 24) 288 -_________________________________________________________________ -dense_1 (Dense) (None, 14) 350 -_________________________________________________________________ -Total params: 638 -1 Epoch: loss: 0.4520 - acc: 0.8416 @ 12m -30 epochs: loss: 0.3562 - acc: 0.8691, still converging -stopped converging after 100 epochs -Learning rate: 0.001 - - diff --git a/train.py b/train.py new file mode 100644 index 0000000..dc885ad --- /dev/null +++ b/train.py @@ -0,0 +1,67 @@ +import fire +import ray +import pandas as pd +import tensorflow as tf +from ray import tune 
+from tensorflow import keras +from utils import logger +from model import Model + +B_params = { + 'batch_size': tune.grid_search([256]), + 'layers': tune.grid_search([[512, 512]]), + 'lr': tune.grid_search([1e-4]), + 'optimizer': tune.grid_search([tf.keras.optimizers.Adam]), +} + +df = pd.read_pickle('data.p') + +class TuneB(tune.Trainable): + def _setup(self, config): + logger.debug('Ray Tune model configuration %s', config) + + self.model = Model('b', epochs=1) + + optimizer = config['optimizer'] + optimizer = config['optimizer'](lr=config['lr']) + + self.model.prepare_for_use(df=df, batch_size=config['batch_size'], layers=config['layers'], optimizer=optimizer) + + def _train(self): + logs = self.model.train(self.config) + + metrics = { + 'mean_accuracy': logs.history['acc'][0], + 'loss': logs.history['loss'][0], + 'val_accuracy': logs.history['val_acc'][0], + 'val_loss': logs.history['val_loss'][0], + } + + return metrics + + def _save(self, checkpoint_dir): + return self.model.save(checkpoint_dir) + + def _restore(self, path): + return self.model.restore(path) + +def start_tuning(cpu=1, gpu=2, checkpoint_freq=1, checkpoint_at_end=True, resume=False, restore=None, stop=500): + ray.init() + + tune.run(TuneB, + config=B_params, + resources_per_trial={ + "cpu": cpu, + "gpu": gpu + }, + resume=resume, + checkpoint_at_end=checkpoint_at_end, + checkpoint_freq=checkpoint_freq, + restore=restore, + stop={ + 'training_iteration': stop + }) + + +if __name__ == "__main__": + fire.Fire(start_tuning) diff --git a/utils.py b/utils.py index 116aae3..220046c 100644 --- a/utils.py +++ b/utils.py @@ -1,55 +1,46 @@ import numpy as np import tensorflow as tf import pandas as pd +from collections import Counter +from sklearn.utils import class_weight from constants import * +import logging +import os -inputs = ['elevation', 'distance_to_water', 'latitude'] -output = 'biome_num' +logger = logging.getLogger('main') +logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO')) -def normalize(v): - return (v - np.mean(v)) / np.std(v) -def normalize_ndarray(ar): +def normalize(v, o=None): + if o is None: + o = v + return (v - np.mean(o)) / np.std(o) + +def normalize_ndarray(ar, o=None): + if o is None: + o = ar + + # transpose: operate over columns tr = np.transpose(ar) + to = np.transpose(o) for i in range(tr.shape[0]): - tr[i] = normalize(tr[i]) + tr[i] = normalize(tr[i], to[i]) + # transpose back return np.transpose(tr) -def normalize_df(df): - for col in df.columns: - df.loc[col] = normalize_ndarray(df[col]) - - return df - def dataframe_to_dataset_biomes(df): rows = df.shape[0] # 8 for seasonal temp and precipitation # 3 for latitude, elevation and distance_to_water - columns = 11 + input_columns = 11 - # make biomes uniformly distributed so each biome has enough data to avoid a biased dataset - biome_shares = df.groupby(['biome_num']).agg({ 'biome_num': lambda x: x.count() / df.shape[0] }) - max_share = np.max(biome_shares['biome_num']) - dsize = df.shape[0] - max_share_count = int(max_share * dsize) - - for biome_num in biome_shares.index: - share = biome_shares.values[biome_num][0] - share_count = int(share * dsize) - diff = max_share_count - share_count - rows = df.loc[df['biome_num'] == biome_num] - diff_ratio = int(diff / rows.shape[0]) - df = pd.concat([df] + [rows] * diff_ratio, ignore_index=True) - - # print(df.groupby(['biome_num']).agg({ 'biome_num': lambda x: x.count() / df.shape[0] })) - - tf_inputs = np.empty((0, columns)) + tf_inputs = np.empty((0, input_columns)) tf_output = np.empty((0)) for year 
in range(MIN_YEAR, MAX_YEAR + 1): - local_inputs = list(inputs) + local_inputs = list(INPUTS) for season in SEASONS: local_inputs += [ 'temp_{}_{}'.format(season, year), @@ -60,25 +51,32 @@ def dataframe_to_dataset_biomes(df): local_df = df[local_inputs] tf_inputs = np.concatenate((tf_inputs, local_df.values), axis=0) - tf_output = np.concatenate((tf_output, df[output].values), axis=0) + tf_output = np.concatenate((tf_output, df[OUTPUT].values), axis=0) + + # balance class weights for the loss function, since the data is highly unbalanced + num_classes = len(np.unique(tf_output)) + class_weights = class_weight.compute_class_weight('balanced', np.unique(tf_output), tf_output) + logger.debug('class_weights %s', class_weights) tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32) tf_output = tf.cast(tf_output, tf.int64) - return int(tf_inputs.shape[0]), 11, 14, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output)) + logger.debug('dataset size: rows=%d, input_columns=%d, num_classes=%d', int(tf_inputs.shape[0]), input_columns, num_classes) + return int(tf_inputs.shape[0]), input_columns, num_classes, class_weights, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output)) def dataframe_to_dataset_temp_precip(df): rows = df.shape[0] # elevation, distance_to_water, latitude # season, year - columns = 5 + input_columns = 5 + num_classes = 2 - tf_inputs = np.empty((0, columns)) - tf_output = np.empty((0, 2)) + tf_inputs = np.empty((0, input_columns)) + tf_output = np.empty((0, num_classes)) for year in range(MIN_YEAR, MAX_YEAR + 1): - local_inputs = list(inputs) + local_inputs = list(INPUTS) for idx, season in enumerate(SEASONS): season_index = idx / len(season) @@ -93,8 +91,11 @@ def dataframe_to_dataset_temp_precip(df): tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32) tf_output = tf.cast(tf_output, tf.float32) - return int(tf_inputs.shape[0]), 5, 2, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output)) + logger.debug('dataset size: rows=%d, input_columns=%d, num_classes=%d', int(tf_inputs.shape[0]), input_columns, num_classes) + return int(tf_inputs.shape[0]), input_columns, num_classes, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output)) -# df = pd.read_pickle('data.p') -# print(dataframe_to_dataset_biomes(df)) +flatten = lambda l: [item for sublist in l for item in sublist] + +def chunker(seq, size): + return (seq[pos:pos + size] for pos in range(0, len(seq), size))
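
For reference, a minimal usage sketch of the command-line flow this patch introduces, assuming `data.p` is present in the working directory and a trained checkpoint has been saved; the checkpoint path below is the default from `predict.py`, while the temperature shift and output filename are arbitrary example values:

```python
# Sketch: the programmatic equivalent of
#   python predict.py --checkpoint=checkpoints/save.h5 --change=2 --path=map.png
# (predict.py exposes predicted_map_cmd through fire).
from model import Model
from predict import predicted_map

B = Model('b', epochs=1)
B.prepare_for_use()                  # reads data.p, builds and compiles the network
B.restore('checkpoints/save.h5')     # default checkpoint path used by predict.py
predicted_map(B, change=2, path='map.png')  # redraw the biome map with temperatures shifted by +2
```

Training and hyper-parameter search are exposed the same way (`python train.py`, which fire maps to `start_tuning`), and `python draw.py` plots the biomes of the raw `data.p` dataset.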