diff --git a/.floydexpt b/.floydexpt
new file mode 100644
index 0000000..c12b4a2
--- /dev/null
+++ b/.floydexpt
@@ -0,0 +1 @@
+{"name": "world", "namespace": "mdibaiee", "family_id": "prj_HzeYYJXLyy2otH6W"}
\ No newline at end of file
diff --git a/.floydignore b/.floydignore
new file mode 100644
index 0000000..d388b74
--- /dev/null
+++ b/.floydignore
@@ -0,0 +1,19 @@
+maps
+logs
+checkpoints.*
+geodata
+*.p
+
+# Directories and files to ignore when uploading code to floyd
+
+.git
+.eggs
+eggs
+lib
+lib64
+parts
+sdist
+var
+*.pyc
+*.swp
+.DS_Store
diff --git a/checkpoints/b.hdf5 b/checkpoints/b.hdf5
index 86e478f..ef05303 100644
Binary files a/checkpoints/b.hdf5 and b/checkpoints/b.hdf5 differ
diff --git a/data.py b/data.py
index aa86898..9164740 100644
--- a/data.py
+++ b/data.py
@@ -46,9 +46,8 @@ for year in range(MIN_YEAR, MAX_YEAR + 1):
   for s in SEASONS:
     temp_precip_columns += ['temp_{}_{}'.format(s, year), 'precip_{}_{}'.format(s, year)]
 
-columns = ['biome_num', 'biome_name', 'elevation', 'distance_to_water'] + temp_precip_columns
-indices = ['longitude', 'latitude']
-final_data = pd.DataFrame(index=indices, columns=columns)
+columns = ['longitude', 'latitude', 'biome_num', 'biome_name', 'elevation', 'distance_to_water'] + temp_precip_columns
+final_data = pd.DataFrame(columns=columns)
 
 def get_point_information(longitude, latitude):
   item = {}
@@ -57,6 +56,8 @@ def get_point_information(longitude, latitude):
   if ecoregion.empty:
     return False
 
+  item['longitude'] = longitude
+  item['latitude'] = latitude
   item['biome_num'] = ecoregion.BIOME_NUM.iloc[0]
   item['biome_name'] = ecoregion.BIOME_NAME.iloc[0]
 
@@ -100,18 +101,18 @@ def get_point_information(longitude, latitude):
   return item
 
 
-data_indices = []
-data_map = {}
+data = {}
 
 for col in columns:
-  data_map[col] = {}
+  data[col] = []
 
-i = 0
+# i = 0
 start_time = time.time()
 
 for longitude in range(-179, 179):
   print('-', end='')
   for latitude in range(-89, 89):
+    # generate data and save to file
     d = get_point_information(longitude, latitude)
 
     if d == False:
@@ -119,7 +120,7 @@ for longitude in range(-179, 179):
       continue
 
     for key, value in d.items():
-      data_map[key][(longitude, latitude)] = value
+      data[key].append(value)
 
     print('+', end='')
 
@@ -128,7 +129,7 @@ for longitude in range(-179, 179):
 print("--- Calculations: %s seconds ---" % (time.time() - start_time))
 start_time = time.time()
 
-df = pd.DataFrame(data_map)
+df = pd.DataFrame(data)
 print("--- Generating DataFrame: %s seconds ---" % (time.time() - start_time))
 print(df)
 start_time = time.time()
diff --git a/demo.py b/demo.py
new file mode 100644
index 0000000..2750630
--- /dev/null
+++ b/demo.py
@@ -0,0 +1,19 @@
+import pandas as pd
+from utils import *
+
+df = pd.read_pickle('data_final.p')
+df.to_csv('data_final.csv')
+
+print('DataFrame:')
+print(df)
+
+dataset_size, features, output_size, _ = dataframe_to_dataset_biomes(df)
+print('Biomes dataset:\n - size: {}\n - inputs: {}\n - outputs: {}\n'.format(dataset_size, features, output_size))
+
+dataset_size, features, output_size, _ = dataframe_to_dataset_temp_precip(df)
+print('Temp/Precip dataset:\n - size: {}\n - inputs: {}\n - outputs: {}\n'.format(dataset_size, features, output_size))
+
+# print('Normalized Data:')
+# print(normalize_df(df))
+
+# normalize_df(df).to_csv('data_normalized.csv')
diff --git a/draw.py b/draw.py
index 8dde7c5..0fe46a6 100644
--- a/draw.py
+++ b/draw.py
@@ -10,8 +10,8 @@ def draw(df, path=None):
   biome_numbers = df['biome_num'].unique()
   # biome_names = df['biome_name'].unique()
 
-  for (longitude, latitude), row in df.iterrows():
-    p = Point(longitude, latitude)
+  for i, row in df.iterrows():
+    p = Point(row.longitude, row.latitude)
     if row.biome_num in biomes:
       biomes[row.biome_num].append(p)
     else:
@@ -55,5 +55,5 @@ def draw(df, path=None):
   plt.show()
 
 if __name__ == "__main__":
-  df = pd.read_pickle('data_final.p')
+  df = pd.read_pickle('data.p')
   draw(df)
diff --git a/floyd.yml b/floyd.yml
new file mode 100644
index 0000000..4c5c966
--- /dev/null
+++ b/floyd.yml
@@ -0,0 +1,23 @@
+# see: https://docs.floydhub.com/floyd_config
+# All supported configs:
+#
+#machine: cpu
+#env: tensorflow-1.8
+#input:
+#  - destination: input
+#    source: foo/datasets/yelp-food/1
+#  - foo/datasets/yelp-food-test/1:test
+#description: this is a test
+#max_runtime: 3600
+#command: python train.py
+
+# You can also define multiple tasks to use with --task argument:
+#
+#task:
+#  evaluate:
+#    machine: gpu
+#    command: python evaluate.py
+#
+#  serve:
+#    machine: cpu
+#    mode: serve
diff --git a/nn.py b/nn.py
index 4618307..c10bab7 100644
--- a/nn.py
+++ b/nn.py
@@ -14,12 +14,14 @@ from utils import *
 
 RANDOM_SEED = 1
 
-tf.enable_eager_execution()
+print(tf.__version__)
+
+# tf.enable_eager_execution()
 tf.set_random_seed(RANDOM_SEED)
 np.random.seed(RANDOM_SEED)
 
-df = pd.read_pickle('data_final.p')
+df = pd.read_pickle('data.p')
 
 
 class Model():
   def __init__(self, name, batch_size=16, shuffle_buffer_size=500, learning_rate=0.001, epochs=1):
@@ -40,17 +42,21 @@ class Model():
     (training, test) = (self.dataset.take(self.TRAIN_SIZE).batch(self.batch_size).repeat(),
                         self.dataset.skip(self.TRAIN_SIZE).batch(self.batch_size).repeat())
 
+    print('dataset: size={}, train={}, test={}'.format(dataset_size, self.TRAIN_SIZE, self.TEST_SIZE))
+    print('input_size={}'.format(features))
+
     self.dataset_size = dataset_size
     self.features = features
     self.output_size = output_size
     self.training = training
     self.test = test
 
-  def create_model(self, layers, out_activation):
+  def create_model(self, layers, out_activation=None):
     params = {
       'kernel_initializer': 'lecun_uniform',
       'bias_initializer': 'zeros',
     }
+    # dropout = keras.layersDropout(0.2, input_shape=[self.features])
     self.model = keras.Sequential([
       keras.layers.Dense(layers[0], activation=tf.nn.elu, input_shape=[self.features], **params)
     ] + [
@@ -60,7 +66,7 @@ class Model():
     ])
 
   def compile(self, loss='mse', metrics=['accuracy'], optimizer=tf.train.AdamOptimizer):
-    # self.model.load_weights(self.path)
+    self.model.load_weights(self.path)
     optimizer = optimizer(self.learning_rate)
 
     self.model.compile(loss=loss,
@@ -79,7 +85,7 @@ class Model():
     self.model.summary()
 
     checkpoint = keras.callbacks.ModelCheckpoint(self.path, monitor='acc', verbose=1, mode='max')
-    tensorboard = keras.callbacks.TensorBoard(log_dir='./logs')
+    tensorboard = keras.callbacks.TensorBoard(log_dir='./logs', update_freq='epoch')
     # map_callback = keras.callbacks.LambdaCallback(on_epoch_end=self.map_callback)
 
     self.model.fit(
@@ -95,16 +101,17 @@ class Model():
     return np.argmax(self.model.predict(a), axis=1)
 
 A = Model('a', epochs=2)
-B = Model('b', learning_rate=0.005, epochs=100)
+B = Model('b', learning_rate=0.001, epochs=450)
 
 def compile_b():
   B.prepare_dataset(df, dataframe_to_dataset_biomes)
-  B.create_model([64, 128], tf.nn.softmax)
+  B.create_model([32], tf.nn.softmax)
   B.compile(loss='sparse_categorical_crossentropy')
 
 def compile_a():
   A.prepare_dataset(df, dataframe_to_dataset_temp_precip)
   A.create_model([(4, tf.nn.elu)])
+  # A.create_model([]) # linear model
  A.compile(metrics=['accuracy', 'mae'])
 
 if __name__ == "__main__":
@@ -118,5 +125,5 @@ if __name__ == "__main__":
   # print(np.unique(predictions))
   # print('loss: {}, evaluation: {}'.format(*B.evaluate()))
 
-  compile_a()
-  A.train()
+  # compile_a()
+  # A.train()
diff --git a/predict.py b/predict.py
index eb93ee3..a2f2960 100644
--- a/predict.py
+++ b/predict.py
@@ -1,7 +1,7 @@
 import numpy as np
 from utils import *
-from nn import B
+from nn import B, compile_b
 from draw import draw
 import time
 
 
@@ -10,16 +10,14 @@ def chunker(seq, size):
 
 year = MAX_YEAR - 1
 
-df = pd.read_pickle('data_final.p')
-latitude = np.array(df.index.get_level_values(1))
-df.loc[:, 'latitude'] = pd.Series(latitude, index=df.index)
+df = pd.read_pickle('data.p')
 
 compile_b()
 
 for change in range(0, 1):
   print('TEMPERATURE MODIFICATION OF {}'.format(change))
 
-  inputs = ['elevation', 'distance_to_water']
+  inputs = ['latitude', 'longitude', 'elevation', 'distance_to_water']
 
   for season in SEASONS:
     inputs += [
@@ -27,22 +25,28 @@ for change in range(0, 1):
       'precip_{}_{}'.format(season, year)
     ]
 
-  inputs += ['latitude']
-
+  # print(inputs)
   frame = df[inputs]
-  print(frame.head())
+  # print(frame.head())
 
   for season in SEASONS:
     frame.loc[:, 'temp_{}_{}'.format(season, year)] += change
 
-  columns = ['biome_num']
+  columns = ['latitude', 'longitude', 'biome_num']
   new_data = pd.DataFrame(columns=columns)
+
   for i, chunk in enumerate(chunker(frame, B.batch_size)):
-    input_data = normalize_ndarray(chunk.values)
+    if chunk.shape[0] < B.batch_size:
+      continue
+    input_data = normalize_ndarray(chunk.loc[:, chunk.columns != 'longitude'].values)
     out = B.predict(input_data)
 
-    new_index = np.concatenate((chunk.index.values, new_data.index.values))
-    new_data = new_data.reindex(new_index)
-    new_data.loc[chunk.index.values, 'biome_num'] = out
+    f = pd.DataFrame({
+      'longitude': chunk.loc[:, 'longitude'],
+      'latitude': chunk.loc[:, 'latitude'],
+      'biome_num': out
+    }, columns=columns)
+    new_data = new_data.append(f)
+
   print(new_data)
   draw(new_data)
diff --git a/requirements.txt b/requirements.txt
index 7586e77..3cc402f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,6 @@ matplotlib==3.0.2
 descartes==1.1.0
 pysal==2.0.0
 rasterio==1.0.15
-tensorflow==1.12.0
+tensorflow==1.13.1
 Cartopy==0.17.0
 numpy==1.16.1
diff --git a/tracks b/tracks
new file mode 100644
index 0000000..2ebbd79
--- /dev/null
+++ b/tracks
@@ -0,0 +1,89 @@
+Layer (type)                 Output Shape              Param #
+=================================================================
+Group 1
+-----------------------------------------------------------------
+dense (Dense)                (None, 128)               1536
+_________________________________________________________________
+dense_1 (Dense)              (None, 256)               33024
+_________________________________________________________________
+dense_2 (Dense)              (None, 14)                3598
+-----------------------------------------------------------------
+Total params: 38,158
+1 Epoch: loss: 0.3822 - acc: 0.8684
+Learning rate: 0.005
+=================================================================
+
+Group 2
+-----------------------------------------------------------------
+dense (Dense)                (None, 32)                384
+_________________________________________________________________
+dense_1 (Dense)              (None, 64)                2112
+_________________________________________________________________
+dense_2 (Dense)              (None, 32)                2080
+_________________________________________________________________
+dense_3 (Dense)              (None, 14)                462
+-----------------------------------------------------------------
+Total params: 5,038
+1 Epoch: loss: 0.3760 - acc: 0.8678 @ 20minutes
+Stopped converging, loss increasing
+Learning rate: 0.005
+=================================================================
+
+Group 3
+-----------------------------------------------------------------
+dense (Dense)                (None, 16)                192
+_________________________________________________________________
+dense_1 (Dense)              (None, 32)                544
+_________________________________________________________________
+dense_2 (Dense)              (None, 16)                528
+_________________________________________________________________
+dense_3 (Dense)              (None, 14)                238
+-----------------------------------------------------------------
+Total params: 1,502
+1 Epoch: loss: 0.3702 - acc: 0.8671 @ 12minutes
+10 Epochs: loss: 0.3280 - acc: 0.8815
+Stopped converging after 5 epochs, was oscillating
+Learning rate: 0.005
+=================================================================
+
+Group 4
+_________________________________________________________________
+dense (Dense)                (None, 12)                144
+_________________________________________________________________
+dense_1 (Dense)              (None, 14)                182
+_________________________________________________________________
+Total params: 326
+1 Epoch: loss: 0.4412 - acc: 0.8457 @ 10m
+60 Epochs: loss: 0.4146 - acc: 0.8546
+Stopped converging
+Learning rate: 0.005
+=================================================================
+
+Group 5
+_________________________________________________________________
+dense (Dense)                (None, 12)                144
+_________________________________________________________________
+dense_1 (Dense)              (None, 14)                182
+_________________________________________________________________
+Total params: 326
+1 Epoch: loss: 0.5057 - acc: 0.8268 @ 10m
+15 epoch: loss: 0.4240 - acc: 0.8481
+Stopped converging
+Learning rate: 0.001
+=================================================================
+
+Group 6
+_________________________________________________________________
+Layer (type)                 Output Shape              Param #
+=================================================================
+dense (Dense)                (None, 24)                288
+_________________________________________________________________
+dense_1 (Dense)              (None, 14)                350
+_________________________________________________________________
+Total params: 638
+1 Epoch: loss: 0.4520 - acc: 0.8416 @ 12m
+30 epochs: loss: 0.3562 - acc: 0.8691, still converging
+stopped converging after 100 epochs
+Learning rate: 0.001
+
+
diff --git a/utils.py b/utils.py
index b2804f2..116aae3 100644
--- a/utils.py
+++ b/utils.py
@@ -3,7 +3,7 @@ import tensorflow as tf
 import pandas as pd
 from constants import *
 
-inputs = ['elevation', 'distance_to_water']
+inputs = ['elevation', 'distance_to_water', 'latitude']
 output = 'biome_num'
 
 def normalize(v):
@@ -18,7 +18,7 @@ def normalize_ndarray(ar):
 
 def normalize_df(df):
   for col in df.columns:
-    df.loc[col] = normalize(df[col])
+    df.loc[col] = normalize_ndarray(df[col])
 
   return df
 
@@ -29,9 +29,24 @@ def dataframe_to_dataset_biomes(df):
   # 3 for latitude, elevation and distance_to_water
   columns = 11
 
+  # make biomes uniformly distributed so each biome has enough data to avoid a biased dataset
+  biome_shares = df.groupby(['biome_num']).agg({ 'biome_num': lambda x: x.count() / df.shape[0] })
+  max_share = np.max(biome_shares['biome_num'])
+  dsize = df.shape[0]
+  max_share_count = int(max_share * dsize)
+
+  for biome_num in biome_shares.index:
+    share = biome_shares.values[biome_num][0]
+    share_count = int(share * dsize)
+    diff = max_share_count - share_count
+    rows = df.loc[df['biome_num'] == biome_num]
+    diff_ratio = int(diff / rows.shape[0])
+    df = pd.concat([df] + [rows] * diff_ratio, ignore_index=True)
+
+  # print(df.groupby(['biome_num']).agg({ 'biome_num': lambda x: x.count() / df.shape[0] }))
+
   tf_inputs = np.empty((0, columns))
   tf_output = np.empty((0))
-  latitude = np.array(df.index.get_level_values(1))
 
   for year in range(MIN_YEAR, MAX_YEAR + 1):
     local_inputs = list(inputs)
@@ -43,7 +58,6 @@ def dataframe_to_dataset_biomes(df):
 
     local_df = df[local_inputs]
 
-    local_df.loc[:, 'latitude'] = pd.Series(latitude, index=local_df.index)
     tf_inputs = np.concatenate((tf_inputs, local_df.values), axis=0)
     tf_output = np.concatenate((tf_output, df[output].values), axis=0)
 
@@ -62,7 +76,6 @@ def dataframe_to_dataset_temp_precip(df):
 
   tf_inputs = np.empty((0, columns))
   tf_output = np.empty((0, 2))
-  latitude = np.array(df.index.get_level_values(1))
 
   for year in range(MIN_YEAR, MAX_YEAR + 1):
     local_inputs = list(inputs)
@@ -70,7 +83,6 @@ def dataframe_to_dataset_temp_precip(df):
     for idx, season in enumerate(SEASONS):
       season_index = idx / len(season)
       local_df = df[local_inputs]
-      local_df.loc[:, 'latitude'] = pd.Series(latitude, index=local_df.index)
       local_df.loc[:, 'season'] = pd.Series(np.repeat(season_index, rows), index=local_df.index)
       local_df.loc[:, 'year'] = pd.Series(np.repeat(year, rows), index=local_df.index)
 
@@ -79,7 +91,10 @@ def dataframe_to_dataset_temp_precip(df):
 
     tf_output = np.concatenate((tf_output, df[output].values), axis=0)
 
   tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
-  tf_output = tf.cast(normalize_ndarray(tf_output), tf.float32)
+  tf_output = tf.cast(tf_output, tf.float32)
 
   return int(tf_inputs.shape[0]), 5, 2, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))
+
+# df = pd.read_pickle('data.p')
+# print(dataframe_to_dataset_biomes(df))