refactor: working version with command-line utilities

Author: Mahdi Dibaiee
Date:   2019-03-31 09:52:00 +04:30
parent fe3f539d7d
commit e3e3fecf4d
16 changed files with 361 additions and 403 deletions

.gitignore (2 changes)

@@ -1,6 +1,6 @@
 maps
 logs
-checkpoints.*
+checkpoints
 geodata
 *.p
 #### joe made this: http://goel.io/joe

(file name not shown)

@@ -2,5 +2,7 @@
 pyenv install $(cat .python-version)
 pyenv local
 pip install -r requirements.txt
+apt install proj-bin libproj-dev # https://proj4.org/install.html#install
+apt install libgeos-3.6.2 libgeos-dev libgeos++-dev # https://packages.ubuntu.com/search?keywords=geos&searchon=sourcenames&suite=all&section=all
 ```

Binary file not shown.

Binary file not shown.

(file name not shown)

@@ -16,3 +16,66 @@ WINTER_MONTHS = ['december', 'january', 'february']
 SPRING_MONTHS = ['march', 'april', 'may']
 SUMMER_MONTHS = ['june', 'july', 'august']
 AUTUMN_MONTHS = ['september', 'november', 'october']
+
+INPUTS = ['elevation', 'distance_to_water', 'latitude']
+OUTPUT = 'biome_num'
+
+BIOMES = [
+    {
+        'name': 'Tropical & Subtropical Moist Broadleaf Forests',
+        'color': '#016936',
+    },
+    {
+        'name': 'Tropical & Subtropical Dry Broadleaf Forests',
+        'color': '#B2D127',
+    },
+    {
+        'name': 'Tropical & Subtropical Coniferous Forests',
+        'color': '#77CC00',
+    },
+    {
+        'name': 'Temperate Broadleaf & Mixed Forests',
+        'color': '#99C500',
+    },
+    {
+        'name': 'Temperate Conifer Forests',
+        'color': '#B6CC00',
+    },
+    {
+        'name': 'Boreal Forests/Taiga',
+        'color': '#00C5B5',
+    },
+    {
+        'name': 'Tropical & Subtropical Grasslands, Savannas & Shrublands',
+        'color': '#EFFF00',
+    },
+    {
+        'name': 'Temperate Grasslands, Savannas & Shrublands',
+        'color': '#FFEE00',
+    },
+    {
+        'name': 'Flooded Grasslands & Savannas',
+        'color': '#009BFF',
+    },
+    {
+        'name': 'Montane Grasslands & Shrublands',
+        'color': '#A0ADBA',
+    },
+    {
+        'name': 'Tundra',
+        'color': '#5C62FF',
+    },
+    {
+        'name': 'Mediterranean Forests, Woodlands & Scrub',
+        'color': '#00850F',
+    },
+    {
+        'name': 'Deserts & Xeric Shrublands',
+        'color': '#FF9E1F',
+    },
+    {
+        'name': 'Mangroves',
+        'color': '#FF1F97'
+    }
+]
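For orientation (not part of the diff): BIOMES is positional, and draw.py below indexes it directly with the dataset's integer biome_num to pick a colour and legend label. A minimal illustration:

```python
from constants import BIOMES

# biome_num values double as indices into BIOMES,
# so class id 5 maps straight to its display name and colour.
print(BIOMES[5]['name'])   # Boreal Forests/Taiga
print(BIOMES[5]['color'])  # #00C5B5
```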

demo.py (deleted, 19 lines)

@@ -1,19 +0,0 @@
import pandas as pd
from utils import *
df = pd.read_pickle('data_final.p')
df.to_csv('data_final.csv')
print('DataFrame:')
print(df)
dataset_size, features, output_size, _ = dataframe_to_dataset_biomes(df)
print('Biomes dataset:\n - size: {}\n - inputs: {}\n - outputs: {}\n'.format(dataset_size, features, output_size))
dataset_size, features, output_size, _ = dataframe_to_dataset_temp_precip(df)
print('Temp/Precip dataset:\n - size: {}\n - inputs: {}\n - outputs: {}\n'.format(dataset_size, features, output_size))
# print('Normalized Data:')
# print(normalize_df(df))
# normalize_df(df).to_csv('data_normalized.csv')

draw.py (52 changes)

@@ -1,59 +1,43 @@
-from shapely.geometry import Point, MultiPoint
-from shapely.ops import cascaded_union
+import fire
 import matplotlib.pyplot as plt
+from utils import logger
+from constants import BIOMES
 import pandas as pd
 import cartopy.crs as ccrs

 def draw(df, path=None):
+    logger.debug('draw(df, %s)', path)
     biomes = {}
     biome_numbers = df['biome_num'].unique()
-    # biome_names = df['biome_name'].unique()

     for i, row in df.iterrows():
-        p = Point(row.longitude, row.latitude)
         if row.biome_num in biomes:
-            biomes[row.biome_num].append(p)
+            biomes[row.biome_num]['x'].append(row.longitude)
+            biomes[row.biome_num]['y'].append(row.latitude)
         else:
-            biomes[row.biome_num] = [p]
+            biomes[row.biome_num] = { 'x': [row.longitude], 'y': [row.latitude] }

     ax = plt.axes(projection=ccrs.PlateCarree())
     ax.stock_img()
-    # ax.legend(df['biome_name'].unique())
-    colors={
-        0: '#016936',
-        1: '#B2D127',
-        2: '#77CC00',
-        3: '#99C500',
-        4: '#B6CC00',
-        5: '#00C5B5',
-        6: '#EFFF00',
-        7: '#FFEE00',
-        8: '#009BFF',
-        9: '#A0ADBA',
-        10: '#5C62FF',
-        11: '#00850F',
-        12: '#FF9E1F',
-        13: '#FF1F97'
-    }

     for n in biome_numbers:
-        biomes[n] = MultiPoint(biomes[n]).buffer(0.5)
-        # print(biomes[n])
-        # legend = biome_names[n]
-        if not hasattr(biomes[n], '__iter__'):
-            biomes[n] = [biomes[n]]
-        ax.add_geometries(biomes[n], ccrs.PlateCarree(), facecolor=colors[n])
-        # artist.set_label(biome_names[n])
-        # print(artist.get_label())
+        xs = biomes[n]['x']
+        ys = biomes[n]['y']
+        scatter = ax.scatter(xs, ys, s=4, c=BIOMES[n]['color'], transform=ccrs.PlateCarree())
+        scatter.set_label(BIOMES[n]['name'])

-    # ax.legend(artists, biome_names)
+    ax.legend()
+    figure = plt.gcf()
+    figure.set_size_inches(20, 18)
     if path:
         plt.savefig(path)
     else:
         plt.show()

+def draw_cmd(path=None):
+    draw(pd.read_pickle('data.p'), path=path)

 if __name__ == "__main__":
-    df = pd.read_pickle('data.p')
-    draw(df)
+    fire.Fire(draw_cmd)
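draw.py is now also a small python-fire CLI: fire turns draw_cmd's keyword argument into a flag, so something like `python draw.py --path=maps/biomes.png` should work (flag name inferred from the signature; the data.p input path stays hard-coded in draw_cmd). Calling it from Python is a one-liner, sketched here under the assumption that data.p holds the pickled DataFrame with biome_num, latitude and longitude columns:

```python
import pandas as pd
from draw import draw

# Render the scatter-based biome map; with path=None it would call plt.show() instead.
df = pd.read_pickle('data.p')
draw(df, path='maps/biomes.png')
```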

(file name not shown; deleted, 23 lines)

@@ -1,23 +0,0 @@
# see: https://docs.floydhub.com/floyd_config
# All supported configs:
#
#machine: cpu
#env: tensorflow-1.8
#input:
# - destination: input
# source: foo/datasets/yelp-food/1
# - foo/datasets/yelp-food-test/1:test
#description: this is a test
#max_runtime: 3600
#command: python train.py
# You can also define multiple tasks to use with --task argument:
#
#task:
# evaluate:
# machine: gpu
# command: python evaluate.py
#
# serve:
# machine: cpu
# mode: serve

model.py (new file, 144 lines)

@@ -0,0 +1,144 @@
from __future__ import absolute_import, division, print_function

# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Helper libraries
import numpy as np
import pandas as pd

from utils import *

RANDOM_SEED = 1

logger.debug('Tensorflow version: %s', tf.__version__)
logger.debug('Random Seed: %s', RANDOM_SEED)

tf.set_random_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

DEFAULT_BATCH_SIZE=256
DEFAULT_LAYERS = [512, 512]
DEFAULT_BUFFER_SIZE=500
DEFAULT_OUT_ACTIVATION = tf.nn.softmax
DEFAULT_LOSS = 'sparse_categorical_crossentropy'
DEFAULT_OPTIMIZER = tf.keras.optimizers.Adam(lr=0.001)

class Model():
    def __init__(self, name, epochs=1):
        self.name = name
        self.path = "checkpoints/{}.hdf5".format(name)
        self.epochs = epochs

    def prepare_dataset(self, df, fn, **kwargs):
        self.dataset_fn = fn
        self.set_dataset(*fn(df), **kwargs)

    def set_dataset(self, dataset_size, features, output_size, class_weight, dataset, shuffle_buffer_size=DEFAULT_BUFFER_SIZE, batch_size=DEFAULT_BATCH_SIZE):
        self.shuffle_buffer_size = shuffle_buffer_size
        self.class_weight = class_weight
        self.dataset = dataset.shuffle(self.shuffle_buffer_size)

        self.TRAIN_SIZE = int(dataset_size * 0.85)
        self.TEST_SIZE = dataset_size - self.TRAIN_SIZE
        (training, test) = (self.dataset.take(self.TRAIN_SIZE),
                            self.dataset.skip(self.TRAIN_SIZE))

        logger.debug('Model dataset info: size=%s, train=%s, test=%s', dataset_size, self.TRAIN_SIZE, self.TEST_SIZE)

        self.dataset_size = dataset_size
        self.features = features
        self.output_size = output_size
        self.training = training
        self.test = test

        logger.debug('Model input size: %s', self.features)
        logger.debug('Model output size: %s', self.output_size)

        self.batch_size = batch_size
        self.training_batched = self.training.batch(self.batch_size).repeat()
        self.test_batched = self.test.batch(self.batch_size).repeat()

    def create_model(self, layers=DEFAULT_LAYERS, out_activation=DEFAULT_OUT_ACTIVATION):
        params = {
            'kernel_initializer': 'lecun_uniform',
            'bias_initializer': 'zeros',
            # 'kernel_regularizer': keras.regularizers.l2(l=0.01)
            'input_shape': [self.features]
        }
        activation = tf.nn.elu

        logger.debug('Model layer parameters: %s', params)
        logger.debug('Model layer sizes: %s', layers)
        logger.debug('Model layer activation function: %s', activation)
        logger.debug('Model out activation function: %s', out_activation)

        self.model = keras.Sequential([
            keras.layers.Dense(n, activation=activation, **params) for n in layers
        ] + [
            keras.layers.Dense(self.output_size, activation=out_activation, **params)
        ])

    def compile(self, loss=DEFAULT_LOSS, metrics=['accuracy'], optimizer=DEFAULT_OPTIMIZER):
        logger.debug('Model loss function: %s', loss)
        logger.debug('Model optimizer: %s', optimizer)
        logger.debug('Model metrics: %s', metrics)

        self.model.compile(loss=loss,
                           optimizer=optimizer,
                           metrics=metrics)

    def restore(self, path):
        logger.debug('Restoring model weights from path: %s', path)
        return self.model.load_weights(path)

    def save(self, path):
        logger.debug('Saving model weights to path: %s', path)
        self.model.save_weights(path)
        return path

    def evaluate(self):
        return self.model.evaluate(
            self.test,
            batch_size=self.batch_size,
            steps=int(self.dataset_size / self.batch_size),
            verbose=1
        )

    def evaluate_print(self):
        loss, accuracy = self.evaluate()
        print('Test evaluation: loss: {}, accuracy: {}'.format(loss, accuracy))

    def train(self, config):
        self.model.summary()
        # map_callback = MapHistory()

        out = self.model.fit(
            self.training_batched,
            batch_size=self.batch_size,
            epochs=self.epochs,
            steps_per_epoch=int(self.TRAIN_SIZE / self.batch_size),
            class_weight=self.class_weight,
            validation_data=self.test_batched,
            validation_steps=int(self.TEST_SIZE / self.batch_size),
            verbose=1
        )

        return out

    def predict(self, a):
        return np.argmax(self.model.predict(a), axis=1)

    def prepare_for_use(self, df=None, batch_size=DEFAULT_BUFFER_SIZE, layers=DEFAULT_LAYERS, out_activation=DEFAULT_OUT_ACTIVATION, loss=DEFAULT_LOSS, optimizer=DEFAULT_OPTIMIZER):
        if df is None:
            df = pd.read_pickle('data.p')

        self.prepare_dataset(df, dataframe_to_dataset_biomes, batch_size=batch_size)
        self.create_model(layers=layers, out_activation=out_activation)
        self.compile(loss=loss, optimizer=optimizer)
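A hedged sketch of driving the new Model class by hand, outside Ray Tune. The file names come from elsewhere in this commit, and the config argument of train() is only there for the Tune wrapper, so passing None is fine:

```python
import pandas as pd
from model import Model

m = Model('b', epochs=1)
m.prepare_for_use(df=pd.read_pickle('data.p'))  # dataset + network + compile with the defaults above
history = m.train(config=None)                  # fit on the shuffled, batched training split
m.save('checkpoints/save.h5')                   # same default path the prediction CLI restores from
```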

nn.py (deleted, 149 lines)

@@ -1,149 +0,0 @@
from __future__ import absolute_import, division, print_function

# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os.path

from utils import *
# from predict import predicted_map

RANDOM_SEED = 1

print(tf.__version__)
# tf.enable_eager_execution()

tf.set_random_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

df = pd.read_pickle('data.p')

class MapHistory(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs):
        print('EPOCH', epoch)
        predicted_map('maps/{}'.format(epoch))

class Model():
    def __init__(self, name, batch_size=16, shuffle_buffer_size=500, learning_rate=0.001, epochs=1):
        self.name = name
        self.path = "checkpoints/{}.hdf5".format(name)
        self.batch_size = batch_size
        self.shuffle_buffer_size = shuffle_buffer_size
        self.learning_rate = learning_rate
        self.epochs = epochs

    def prepare_dataset(self, df, fn):
        self.dataset_fn = fn

        dataset_size, features, output_size, dataset = fn(df)
        self.dataset = dataset.shuffle(self.shuffle_buffer_size)

        self.TRAIN_SIZE = int(dataset_size * 0.85)
        self.TEST_SIZE = dataset_size - self.TRAIN_SIZE
        (training, test) = (self.dataset.take(self.TRAIN_SIZE).batch(self.batch_size).repeat(),
                            self.dataset.skip(self.TRAIN_SIZE).batch(self.batch_size).repeat())

        # print(df.groupby(['biome_num']).agg({ 'biome_num': lambda x: x.count() / df.shape[0] }))
        print('dataset: size={}, train={}, test={}'.format(dataset_size, self.TRAIN_SIZE, self.TEST_SIZE))
        print('input_size={}'.format(features))

        self.dataset_size = dataset_size
        self.features = features
        self.output_size = output_size
        self.training = training
        self.test = test

    def create_model(self, layers, out_activation=None):
        params = {
            'kernel_initializer': 'lecun_uniform',
            'bias_initializer': 'zeros',
            # 'kernel_regularizer': keras.regularizers.l2(l=0.01)
        }

        dropout = [keras.layers.Dropout(0.1, input_shape=[self.features])]
        # dropout = []

        self.model = keras.Sequential(dropout + [
            keras.layers.Dense(layers[0], activation=tf.nn.elu, **params)
        ] + [
            keras.layers.Dense(n, activation=tf.nn.elu, **params) for n in layers[1:]
        ] + [
            keras.layers.Dense(self.output_size, activation=out_activation, **params)
        ])

    def compile(self, loss='mse', metrics=['accuracy'], optimizer=tf.train.AdamOptimizer, load_weights=True):
        if load_weights:
            self.model.load_weights(self.path)

        optimizer = optimizer(self.learning_rate)

        self.model.compile(loss=loss,
                           optimizer=optimizer,
                           metrics=metrics)

    def evaluate(self):
        return self.model.evaluate(
            self.test,
            batch_size=self.batch_size,
            steps=int(self.dataset_size / self.batch_size),
            verbose=1
        )

    def evaluate_print(self):
        loss, accuracy = self.evaluate()
        print('Test evaluation: loss: {}, accuracy: {}'.format(loss, accuracy))

    def train(self):
        self.model.summary()

        checkpoint = keras.callbacks.ModelCheckpoint(self.path, monitor='val_loss', verbose=1, mode='min', save_best_only=True)
        tensorboard = keras.callbacks.TensorBoard(log_dir='./logs', update_freq='epoch')
        # reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)
        # map_callback = MapHistory()

        self.model.fit(
            self.training,
            batch_size=self.batch_size,
            epochs=self.epochs,
            steps_per_epoch=int(self.TRAIN_SIZE / self.batch_size),
            callbacks=[checkpoint, tensorboard],
            validation_data=self.test,
            validation_steps=int(self.TEST_SIZE / self.batch_size),
            verbose=1
        )

    def predict(self, a):
        return np.argmax(self.model.predict(a), axis=1)

A = Model('a', epochs=2)
B = Model('b', learning_rate=0.0005, epochs=50)
# 24 so far

def compile_b():
    B.prepare_dataset(df, dataframe_to_dataset_biomes)
    B.create_model([12], tf.nn.softmax)
    B.compile(loss='sparse_categorical_crossentropy', load_weights=False)

def compile_a():
    A.prepare_dataset(df, dataframe_to_dataset_temp_precip)
    A.create_model([(4, tf.nn.elu)])
    # A.create_model([]) # linear model
    A.compile(metrics=['accuracy', 'mae'])

if __name__ == "__main__":
    compile_b()
    B.train()

    # for inp, out in B.test.take(1).make_one_shot_iterator():
    #     print(inp, out)
    # print(np.unique(nums))
    # print(np.unique(predictions))
    # print('loss: {}, evaluation: {}'.format(*B.evaluate()))

    # compile_a()
    # A.train()

plot.py (deleted, 28 lines)

@@ -1,28 +0,0 @@
import geopandas
import os
import rasterio
import pandas as pd
from matplotlib import pyplot
directory = os.path.dirname(os.path.abspath(__file__))
GEODATA = os.path.join(directory, 'geodata')
ECOREGIONS = os.path.join(GEODATA, 'ecoregions', 'Ecoregions2017.shp')
ELEVATION = os.path.join(GEODATA, 'srtm', 'topo30-180.tif')
TEMP = os.path.join(GEODATA, 'air_temp')
temp = pd.read_csv(os.path.join(TEMP, 'air_temp.2017'), sep='\s+', header=None, names=['longitude', 'latitude', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'november', 'october', 'december', 'yearly_avg'])
print(temp.head())
eco = geopandas.read_file(ECOREGIONS)
elevation = rasterio.open(ELEVATION)
print(eco.head())
print(elevation)
eco.plot()
# rasterio.plot.show(src)
# pyplot.imshow(elevation.read(1))
pyplot.show()

(file name not shown)

@@ -1,22 +1,20 @@
+import fire
 import numpy as np
 from utils import *
-from nn import B, compile_b
+#from nn import compile_b
+from constants import INPUTS
+from model import Model
 from draw import draw
-import time

-def chunker(seq, size):
-    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

-def predicted_map(path=None):
+def predicted_map(B, change=0, path=None):
     year = MAX_YEAR - 1
     df = pd.read_pickle('data.p')

-    print('TEMPERATURE MODIFICATION OF {}'.format(change))
+    logger.info('temperature change of %s', change)

-    inputs = ['elevation', 'distance_to_water', 'latitude']
+    inputs = list(INPUTS)

     for season in SEASONS:
         inputs += [
@@ -24,34 +22,37 @@ def predicted_map(path=None):
             'precip_{}_{}'.format(season, year)
         ]

-    print(inputs)
+    # print(inputs)
     frame = df[inputs + ['longitude']]
-    # print(frame.head())
+    frame_cp = df[inputs + ['longitude']]

     for season in SEASONS:
         frame.loc[:, 'temp_{}_{}'.format(season, year)] += change

     columns = ['latitude', 'longitude', 'biome_num']
     new_data = pd.DataFrame(columns=columns)
+    nframe = pd.DataFrame(columns=frame.columns, data=normalize_ndarray(frame.to_numpy(), frame_cp.to_numpy()))

-    for i, chunk in enumerate(chunker(frame, B.batch_size)):
+    for i, (chunk, chunk_original) in enumerate(zip(chunker(nframe, B.batch_size), chunker(frame, B.batch_size))):
         if chunk.shape[0] < B.batch_size:
             continue

-        input_data = normalize_ndarray(chunk.loc[:, inputs].values)
+        input_data = chunk.loc[:, inputs].values
         out = B.predict(input_data)

         f = pd.DataFrame({
-            'longitude': chunk.loc[:, 'longitude'],
-            'latitude': chunk.loc[:, 'latitude'],
+            'longitude': chunk_original.loc[:, 'longitude'],
+            'latitude': chunk_original.loc[:, 'latitude'],
             'biome_num': out
         }, columns=columns)
         new_data = new_data.append(f)

     draw(new_data, path=path)

-if __name__ == "__main__":
-    compile_b()
-    predicted_map()
+def predicted_map_cmd(checkpoint='checkpoints/save.h5', change=0, path=None):
+    B = Model('b', epochs=1)
+    B.prepare_for_use()
+    B.restore(checkpoint)
+    predicted_map(B, change=change, path=path)

+if __name__ == "__main__":
+    fire.Fire(predicted_map_cmd)
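With fire in charge, a climate-shift map can be regenerated from a saved checkpoint, roughly `python predict.py --checkpoint=checkpoints/save.h5 --change=2 --path=maps/plus2.png`; the module name is not shown in this diff (the old comment in nn.py suggests predict.py) and the flag names simply mirror predicted_map_cmd's keyword arguments. The equivalent call from Python:

```python
# Assumes the module is named predict and a trained checkpoint exists at checkpoints/save.h5.
from predict import predicted_map_cmd

# Redraw the biome map with every seasonal temperature raised by 2 degrees.
predicted_map_cmd(checkpoint='checkpoints/save.h5', change=2, path='maps/plus2.png')
```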

(file name not shown)

@@ -7,3 +7,7 @@ rasterio==1.0.15
 tensorflow==1.13.1
 Cartopy==0.17.0
 numpy==1.16.1
+scikit-learn==0.20.3
+https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp36-cp36m-manylinux1_x86_64.whl
+fire==0.1.3
+psutil==5.6.1

tracks (deleted, 89 lines)

@@ -1,89 +0,0 @@
Layer (type) Output Shape Param #
=================================================================
Group 1
-----------------------------------------------------------------
dense (Dense) (None, 128) 1536
_________________________________________________________________
dense_1 (Dense) (None, 256) 33024
_________________________________________________________________
dense_2 (Dense) (None, 14) 3598
-----------------------------------------------------------------
Total params: 38,158
1 Epoch: loss: 0.3822 - acc: 0.8684
Learning rate: 0.005
=================================================================
Group 2
-----------------------------------------------------------------
dense (Dense) (None, 32) 384
_________________________________________________________________
dense_1 (Dense) (None, 64) 2112
_________________________________________________________________
dense_2 (Dense) (None, 32) 2080
_________________________________________________________________
dense_3 (Dense) (None, 14) 462
-----------------------------------------------------------------
Total params: 5,038
1 Epoch: loss: 0.3760 - acc: 0.8678 @ 20minutes
Stopped converging, loss increasing
Learning rate: 0.005
=================================================================
Group 3
-----------------------------------------------------------------
dense (Dense) (None, 16) 192
_________________________________________________________________
dense_1 (Dense) (None, 32) 544
_________________________________________________________________
dense_2 (Dense) (None, 16) 528
_________________________________________________________________
dense_3 (Dense) (None, 14) 238
-----------------------------------------------------------------
Total params: 1,502
1 Epoch: loss: 0.3702 - acc: 0.8671 @ 12minutes
10 Epochs: loss: 0.3280 - acc: 0.8815
Stopped converging after 5 epochs, was oscillating
Learning rate: 0.005
=================================================================
Group 4
_________________________________________________________________
dense (Dense) (None, 12) 144
_________________________________________________________________
dense_1 (Dense) (None, 14) 182
_________________________________________________________________
Total params: 326
1 Epoch: loss: 0.4412 - acc: 0.8457 @ 10m
60 Epochs: loss: 0.4146 - acc: 0.8546
Stopped converging
Learning rate: 0.005
=================================================================
Group 5
_________________________________________________________________
dense (Dense) (None, 12) 144
_________________________________________________________________
dense_1 (Dense) (None, 14) 182
_________________________________________________________________
Total params: 326
1 Epoch: loss: 0.5057 - acc: 0.8268 @ 10m
15 epoch: loss: 0.4240 - acc: 0.8481
Stopped converging
Learning rate: 0.001
=================================================================
Group 6
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 24) 288
_________________________________________________________________
dense_1 (Dense) (None, 14) 350
_________________________________________________________________
Total params: 638
1 Epoch: loss: 0.4520 - acc: 0.8416 @ 12m
30 epochs: loss: 0.3562 - acc: 0.8691, still converging
stopped converging after 100 epochs
Learning rate: 0.001

train.py (new file, 67 lines)

@@ -0,0 +1,67 @@
import fire
import ray
import pandas as pd
import tensorflow as tf
from ray import tune
from tensorflow import keras

from utils import logger
from model import Model

B_params = {
    'batch_size': tune.grid_search([256]),
    'layers': tune.grid_search([[512, 512]]),
    'lr': tune.grid_search([1e-4]),
    'optimizer': tune.grid_search([tf.keras.optimizers.Adam]),
}

df = pd.read_pickle('data.p')

class TuneB(tune.Trainable):
    def _setup(self, config):
        logger.debug('Ray Tune model configuration %s', config)

        self.model = Model('b', epochs=1)

        optimizer = config['optimizer']
        optimizer = config['optimizer'](lr=config['lr'])

        self.model.prepare_for_use(df=df, batch_size=config['batch_size'], layers=config['layers'], optimizer=optimizer)

    def _train(self):
        logs = self.model.train(self.config)

        metrics = {
            'mean_accuracy': logs.history['acc'][0],
            'loss': logs.history['loss'][0],
            'val_accuracy': logs.history['val_acc'][0],
            'val_loss': logs.history['val_loss'][0],
        }

        return metrics

    def _save(self, checkpoint_dir):
        return self.model.save(checkpoint_dir)

    def _restore(self, path):
        return self.model.restore(path)

def start_tuning(cpu=1, gpu=2, checkpoint_freq=1, checkpoint_at_end=True, resume=False, restore=None, stop=500):
    ray.init()

    tune.run(TuneB,
             config=B_params,
             resources_per_trial={
                 "cpu": cpu,
                 "gpu": gpu
             },
             resume=resume,
             checkpoint_at_end=checkpoint_at_end,
             checkpoint_freq=checkpoint_freq,
             restore=restore,
             stop={
                 'training_iteration': stop
             })

if __name__ == "__main__":
    fire.Fire(start_tuning)
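start_tuning is likewise exposed through fire, so a run can be launched as roughly `python train.py --cpu=4 --gpu=0 --stop=100` (flags mirror its keyword arguments). The grid above contains a single point and therefore a single trial; a hypothetical wider sweep, not part of this commit, would just list more values for tune.grid_search:

```python
import tensorflow as tf
from ray import tune

# Hypothetical example: every combination below becomes one TuneB trial.
B_params = {
    'batch_size': tune.grid_search([128, 256]),
    'layers': tune.grid_search([[256, 256], [512, 512]]),
    'lr': tune.grid_search([1e-3, 1e-4]),
    'optimizer': tune.grid_search([tf.keras.optimizers.Adam]),
}
```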

(file name not shown)

@@ -1,55 +1,46 @@
 import numpy as np
 import tensorflow as tf
 import pandas as pd
+from collections import Counter
+from sklearn.utils import class_weight
 from constants import *
+import logging
+import os

-inputs = ['elevation', 'distance_to_water', 'latitude']
-output = 'biome_num'
+logger = logging.getLogger('main')
+logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))

-def normalize(v):
-    return (v - np.mean(v)) / np.std(v)
+def normalize(v, o=None):
+    if o is None:
+        o = v
+    return (v - np.mean(o)) / np.std(o)

-def normalize_ndarray(ar):
+def normalize_ndarray(ar, o=None):
+    if o is None:
+        o = ar
+    # transpose: operate over columns
     tr = np.transpose(ar)
+    to = np.transpose(o)
     for i in range(tr.shape[0]):
-        tr[i] = normalize(tr[i])
+        tr[i] = normalize(tr[i], to[i])
+    # transpose back
     return np.transpose(tr)

-def normalize_df(df):
-    for col in df.columns:
-        df.loc[col] = normalize_ndarray(df[col])
-    return df

 def dataframe_to_dataset_biomes(df):
     rows = df.shape[0]

     # 8 for seasonal temp and precipitation
     # 3 for latitude, elevation and distance_to_water
-    columns = 11
+    input_columns = 11

-    # make biomes uniformly distributed so each biome has enough data to avoid a biased dataset
-    biome_shares = df.groupby(['biome_num']).agg({ 'biome_num': lambda x: x.count() / df.shape[0] })
-    max_share = np.max(biome_shares['biome_num'])
-    dsize = df.shape[0]
-    max_share_count = int(max_share * dsize)
-    for biome_num in biome_shares.index:
-        share = biome_shares.values[biome_num][0]
-        share_count = int(share * dsize)
-        diff = max_share_count - share_count
-        rows = df.loc[df['biome_num'] == biome_num]
-        diff_ratio = int(diff / rows.shape[0])
-        df = pd.concat([df] + [rows] * diff_ratio, ignore_index=True)
-    # print(df.groupby(['biome_num']).agg({ 'biome_num': lambda x: x.count() / df.shape[0] }))

-    tf_inputs = np.empty((0, columns))
+    tf_inputs = np.empty((0, input_columns))
     tf_output = np.empty((0))

     for year in range(MIN_YEAR, MAX_YEAR + 1):
-        local_inputs = list(inputs)
+        local_inputs = list(INPUTS)
         for season in SEASONS:
             local_inputs += [
                 'temp_{}_{}'.format(season, year),
@@ -60,25 +51,32 @@ def dataframe_to_dataset_biomes(df):
         local_df = df[local_inputs]
         tf_inputs = np.concatenate((tf_inputs, local_df.values), axis=0)
-        tf_output = np.concatenate((tf_output, df[output].values), axis=0)
+        tf_output = np.concatenate((tf_output, df[OUTPUT].values), axis=0)

+    # balance class weights for the loss function, since the data is highly unbalanced
+    num_classes = len(np.unique(tf_output))
+    class_weights = class_weight.compute_class_weight('balanced', np.unique(tf_output), tf_output)
+    logger.debug('class_weights %s', class_weights)

     tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
     tf_output = tf.cast(tf_output, tf.int64)

-    return int(tf_inputs.shape[0]), 11, 14, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))
+    logger.debug('dataset size: rows=%d, input_columns=%d, num_classes=%d', int(tf_inputs.shape[0]), input_columns, num_classes)
+    return int(tf_inputs.shape[0]), input_columns, num_classes, class_weights, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))

 def dataframe_to_dataset_temp_precip(df):
     rows = df.shape[0]

     # elevation, distance_to_water, latitude
     # season, year
-    columns = 5
+    input_columns = 5
+    num_classes = 2

-    tf_inputs = np.empty((0, columns))
-    tf_output = np.empty((0, 2))
+    tf_inputs = np.empty((0, input_columns))
+    tf_output = np.empty((0, num_classes))

     for year in range(MIN_YEAR, MAX_YEAR + 1):
-        local_inputs = list(inputs)
+        local_inputs = list(INPUTS)
         for idx, season in enumerate(SEASONS):
             season_index = idx / len(season)
@@ -93,8 +91,11 @@ def dataframe_to_dataset_temp_precip(df):
     tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
     tf_output = tf.cast(tf_output, tf.float32)

-    return int(tf_inputs.shape[0]), 5, 2, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))
+    logger.debug('dataset size: rows=%d, input_columns=%d, num_classes=%d', int(tf_inputs.shape[0]), input_columns, num_classes)
+    return int(tf_inputs.shape[0]), input_columns, num_classes, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))

-# df = pd.read_pickle('data.p')
-# print(dataframe_to_dataset_biomes(df))
+flatten = lambda l: [item for sublist in l for item in sublist]

+def chunker(seq, size):
+    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
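The reason normalize and normalize_ndarray grew a reference argument is visible in the prediction script above: the temperature-shifted frame has to be scaled with the statistics of the unmodified data, otherwise a uniform shift would simply be normalised away. A toy illustration, assuming this file is utils.py as the imports elsewhere in the commit suggest:

```python
import numpy as np
from utils import normalize_ndarray

original = np.array([[1.0, 10.0],
                     [2.0, 20.0],
                     [3.0, 30.0]])
shifted = original + np.array([2.0, 0.0])  # e.g. a +2 offset on the first column only

# Each column of `shifted` is scaled with the mean/std of the matching column
# of `original`, mirroring normalize_ndarray(frame, frame_cp) in the prediction code.
print(normalize_ndarray(shifted, original))
```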