chore(biomes): move biomes to /biomes

Author: Mahdi Dibaiee
Date: 2019-04-09 08:20:32 +04:30
parent e977239027
commit e29d461319
9 changed files with 0 additions and 0 deletions

biomes/INSTALL.md Normal file

@@ -0,0 +1,8 @@
```
pyenv install $(cat .python-version)
pyenv local
pip install -r requirements.txt
apt install proj-bin libproj-dev # https://proj4.org/install.html#install
apt install libgeos-3.6.2 libgeos-dev libgeos++-dev # https://packages.ubuntu.com/search?keywords=geos&searchon=sourcenames&suite=all&section=all
```
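A quick way to verify the geospatial stack after the steps above (a hedged sketch; the package names come from requirements.txt in this commit):

```python
# sanity check, assuming the dependencies installed cleanly
import cartopy
import geopandas
import rasterio

print(cartopy.__version__, geopandas.__version__, rasterio.__version__)
```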

biomes/constants.py Normal file

@@ -0,0 +1,81 @@
import os
directory = os.path.dirname(os.path.abspath(__file__))
GEODATA = os.path.join(directory, 'geodata')
ECOREGIONS = os.path.join(GEODATA, 'ecoregions', 'single-parts.shp')
ELEVATION = os.path.join(GEODATA, 'srtm', 'topo30-180.tif')
TEMP = os.path.join(GEODATA, 'air_temp')
PRECIP = os.path.join(GEODATA, 'precipitation')
MIN_YEAR = 1900
MAX_YEAR = 2017
SEASONS = ['winter', 'spring', 'summer', 'autumn']
WINTER_MONTHS = ['december', 'january', 'february']
SPRING_MONTHS = ['march', 'april', 'may']
SUMMER_MONTHS = ['june', 'july', 'august']
AUTUMN_MONTHS = ['september', 'october', 'november']
INPUTS = ['elevation', 'distance_to_water', 'latitude']
OUTPUT = 'biome_num'
BIOMES = [
{
'name': 'Tropical & Subtropical Moist Broadleaf Forests',
'color': '#016936',
},
{
'name': 'Tropical & Subtropical Dry Broadleaf Forests',
'color': '#B2D127',
},
{
'name': 'Tropical & Subtropical Coniferous Forests',
'color': '#77CC00',
},
{
'name': 'Temperate Broadleaf & Mixed Forests',
'color': '#99C500',
},
{
'name': 'Temperate Conifer Forests',
'color': '#B6CC00',
},
{
'name': 'Boreal Forests/Taiga',
'color': '#00C5B5',
},
{
'name': 'Tropical & Subtropical Grasslands, Savannas & Shrublands',
'color': '#EFFF00',
},
{
'name': 'Temperate Grasslands, Savannas & Shrublands',
'color': '#FFEE00',
},
{
'name': 'Flooded Grasslands & Savannas',
'color': '#009BFF',
},
{
'name': 'Montane Grasslands & Shrublands',
'color': '#A0ADBA',
},
{
'name': 'Tundra',
'color': '#5C62FF',
},
{
'name': 'Mediterranean Forests, Woodlands & Scrub',
'color': '#00850F',
},
{
'name': 'Deserts & Xeric Shrublands',
'color': '#FF9E1F',
},
{
'name': 'Mangroves',
'color': '#FF1F97'
}
]
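BIOMES is an index-based lookup: draw.py below reads each entry's name and color to build the map legend. A minimal sketch of using it directly:

```python
# a minimal sketch: map a biome index to its display colour
from constants import BIOMES

colors = {i: b['color'] for i, b in enumerate(BIOMES)}
print(colors[0])  # '#016936', Tropical & Subtropical Moist Broadleaf Forests
```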

biomes/data.py Normal file

@@ -0,0 +1,137 @@
import geopandas
import rasterio
import pandas as pd
import numpy as np
import time
from matplotlib import pyplot
from shapely.geometry import Point
from constants import *
def read_temp_data(year):
return pd.read_csv(os.path.join(TEMP, 'air_temp.{}'.format(year)), sep='\s+', header=None,
names=['longitude', 'latitude', 'january',
'february', 'march', 'april',
'may', 'june', 'july', 'august',
'september', 'october', 'november',
'december', 'yearly_avg'])
def read_precip_data(year):
return pd.read_csv(os.path.join(PRECIP, 'precip.{}'.format(year)), sep='\s+', header=None,
names=['longitude', 'latitude', 'january',
'february', 'march', 'april',
'may', 'june', 'july', 'august',
'september', 'october', 'november',
'december', 'yearly_avg'])
eco = geopandas.read_file(ECOREGIONS)
elevation = rasterio.open(ELEVATION)
elevation_data = elevation.read(1)
temp = {}
precip = {}
for year in range(MIN_YEAR, MAX_YEAR + 1):
temp[year] = read_temp_data(year)
precip[year] = read_precip_data(year)
# average over the month columns only, excluding longitude/latitude and the existing yearly_avg column
precip[year]['yearly_avg'] = precip[year].loc[:, 'january':'december'].mean(axis=1)
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))[['geometry']].unary_union
boundary = world.boundary
temp_precip_columns = []
for year in range(MIN_YEAR, MAX_YEAR + 1):
for s in SEASONS:
temp_precip_columns += ['temp_{}_{}'.format(s, year), 'precip_{}_{}'.format(s, year)]
columns = ['longitude', 'latitude', 'biome_num', 'biome_name', 'elevation', 'distance_to_water'] + temp_precip_columns
final_data = pd.DataFrame(columns=columns)
def get_point_information(longitude, latitude):
item = {}
p = Point(longitude, latitude)
ecoregion = eco.loc[lambda c: c.geometry.contains(p)]
if ecoregion.empty:
return False
item['longitude'] = longitude
item['latitude'] = latitude
item['biome_num'] = ecoregion.BIOME_NUM.iloc[0]
item['biome_name'] = ecoregion.BIOME_NAME.iloc[0]
elev = elevation_data[elevation.index(longitude, latitude)]
item['elevation'] = elev
distance_to_sea = p.distance(boundary)
item['distance_to_water'] = distance_to_sea
# indices of the nearest grid cells in the temperature and precipitation tables
temp_idx = np.argmin(np.array((temp[MIN_YEAR].longitude - longitude)**2 + (temp[MIN_YEAR].latitude - latitude)**2))
precip_idx = np.argmin(np.array((precip[MIN_YEAR].longitude - longitude)**2 + (precip[MIN_YEAR].latitude - latitude)**2))
yearly_temp = {}
yearly_precip = {}
for year in range(MIN_YEAR, MAX_YEAR + 1):
yearly_temp[year] = yt = temp[year].iloc[temp_idx, 2:]
yearly_precip[year] = yp = precip[year].iloc[precip_idx, 2:]
winter_temp = [yt.january, yt.february] + ([yearly_temp[year - 1].december] if year > MIN_YEAR else [])
winter_precip = [yp.january, yp.february] + ([yearly_precip[year - 1].december] if year > MIN_YEAR else [])
spring_temp = [yt[month] for month in SPRING_MONTHS]
spring_precip = [yp[month] for month in SPRING_MONTHS]
summer_temp = [yt[month] for month in SUMMER_MONTHS]
summer_precip = [yp[month] for month in SUMMER_MONTHS]
autumn_temp = [yt[month] for month in AUTUMN_MONTHS]
autumn_precip = [yp[month] for month in AUTUMN_MONTHS]
item['temp_winter_{}'.format(year)] = np.mean(winter_temp)
item['precip_winter_{}'.format(year)] = np.mean(winter_precip)
item['temp_spring_{}'.format(year)] = np.mean(spring_temp)
item['precip_spring_{}'.format(year)] = np.mean(spring_precip)
item['temp_summer_{}'.format(year)] = np.mean(summer_temp)
item['precip_summer_{}'.format(year)] = np.mean(summer_precip)
item['temp_autumn_{}'.format(year)] = np.mean(autumn_temp)
item['precip_autumn_{}'.format(year)] = np.mean(autumn_precip)
return item
data = {}
for col in columns:
data[col] = []
# i = 0
start_time = time.time()
for longitude in range(-179, 179):
print('-', end='')
for latitude in range(-89, 89):
# generate data and save to file
d = get_point_information(longitude, latitude)
if d is False:
print('.', end='')
continue
for key, value in d.items():
data[key].append(value)
print('+', end='')
print('')
print("--- Calculations: %s seconds ---" % (time.time() - start_time))
start_time = time.time()
df = pd.DataFrame(data)
print("--- Generating DataFrame: %s seconds ---" % (time.time() - start_time))
print(df)
start_time = time.time()
df.to_pickle('data.p')
print("--- Pickling DataFrame: %s seconds ---" % (time.time() - start_time))

biomes/draw.py Normal file

@@ -0,0 +1,51 @@
import fire
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Circle, Patch
from utils import logger
from constants import BIOMES
import pandas as pd
import cartopy.crs as ccrs
def draw(df, path=None):
logger.debug('draw(df, %s)', path)
biomes = {}
biome_numbers = df['biome_num'].unique()
for i, row in df.iterrows():
p = (row.longitude, row.latitude)
if row.biome_num in biomes:
biomes[row.biome_num].append(p)
else:
biomes[row.biome_num] = [p]
ax = plt.axes(projection=ccrs.PlateCarree())
ax.stock_img()
legend_handles = []
for n in biome_numbers:
color = BIOMES[n]['color']
patches = [Circle(p, radius=0.4) for p in biomes[n]]
collection = PatchCollection(patches, color=color)
legend_handles.append(Patch(color=color, label=BIOMES[n]['name']))
ax.add_collection(collection)
ax.legend(handles=legend_handles, loc='center left', bbox_to_anchor=(1, 0.5), markerscale=4)
ax.autoscale_view()
figure = plt.gcf()
figure.set_size_inches(23.22, 13)
figure.subplots_adjust(left=0.02, right=0.79)
if path:
plt.savefig(path)
else:
plt.show()
def draw_cmd(path=None):
draw(pd.read_pickle('data.p'), path=path)
if __name__ == "__main__":
fire.Fire(draw_cmd)
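draw_cmd is exposed through fire, so the module can be run as `python draw.py --path=map.png`; the equivalent call from Python, as a sketch:

```python
# equivalent to `python draw.py --path=map.png`, assuming data.p exists
import pandas as pd
from draw import draw

draw(pd.read_pickle('data.p'), path='map.png')  # omit path to open an interactive window
```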

biomes/model.py Normal file

@@ -0,0 +1,144 @@
from __future__ import absolute_import, division, print_function
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras
# Helper libraries
import numpy as np
import pandas as pd
from utils import *
RANDOM_SEED = 1
logger.debug('Tensorflow version: %s', tf.__version__)
logger.debug('Random Seed: %s', RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
DEFAULT_BATCH_SIZE = 256
DEFAULT_LAYERS = [512, 512]
DEFAULT_BUFFER_SIZE = 500
DEFAULT_OUT_ACTIVATION = tf.nn.softmax
DEFAULT_LOSS = 'sparse_categorical_crossentropy'
DEFAULT_OPTIMIZER = tf.keras.optimizers.Adam(lr=0.001)
class Model():
def __init__(self, name, epochs=1):
self.name = name
self.path = "checkpoints/{}.hdf5".format(name)
self.epochs = epochs
def prepare_dataset(self, df, fn, **kwargs):
self.dataset_fn = fn
self.set_dataset(*fn(df), **kwargs)
def set_dataset(self, dataset_size, features, output_size, class_weight, dataset, shuffle_buffer_size=DEFAULT_BUFFER_SIZE, batch_size=DEFAULT_BATCH_SIZE):
self.shuffle_buffer_size = shuffle_buffer_size
self.class_weight = class_weight
self.dataset = dataset.shuffle(self.shuffle_buffer_size)
self.TRAIN_SIZE = int(dataset_size * 0.85)
self.TEST_SIZE = dataset_size - self.TRAIN_SIZE
(training, test) = (self.dataset.take(self.TRAIN_SIZE),
self.dataset.skip(self.TRAIN_SIZE))
logger.debug('Model dataset info: size=%s, train=%s, test=%s', dataset_size, self.TRAIN_SIZE, self.TEST_SIZE)
self.dataset_size = dataset_size
self.features = features
self.output_size = output_size
self.training = training
self.test = test
logger.debug('Model input size: %s', self.features)
logger.debug('Model output size: %s', self.output_size)
self.batch_size = batch_size
self.training_batched = self.training.batch(self.batch_size).repeat()
self.test_batched = self.test.batch(self.batch_size).repeat()
def create_model(self, layers=DEFAULT_LAYERS, out_activation=DEFAULT_OUT_ACTIVATION):
params = {
'kernel_initializer': 'lecun_uniform',
'bias_initializer': 'zeros',
# 'kernel_regularizer': keras.regularizers.l2(l=0.01)
'input_shape': [self.features]
}
activation = tf.nn.elu
logger.debug('Model layer parameters: %s', params)
logger.debug('Model layer sizes: %s', layers)
logger.debug('Model layer activation function: %s', activation)
logger.debug('Model out activation function: %s', out_activation)
self.model = keras.Sequential([
keras.layers.Dense(n, activation=activation, **params) for n in layers
] + [
keras.layers.Dense(self.output_size, activation=out_activation, **params)
])
def compile(self, loss=DEFAULT_LOSS, metrics=['accuracy'], optimizer=DEFAULT_OPTIMIZER):
logger.debug('Model loss function: %s', loss)
logger.debug('Model optimizer: %s', optimizer)
logger.debug('Model metrics: %s', metrics)
self.model.compile(loss=loss,
optimizer=optimizer,
metrics=metrics)
def restore(self, path):
logger.debug('Restoring model weights from path: %s', path)
return self.model.load_weights(path)
def save(self, path):
logger.debug('Saving model weights to path: %s', path)
self.model.save_weights(path)
return path
def evaluate(self):
# evaluate on the held-out, batched test split
return self.model.evaluate(
self.test_batched,
steps=int(self.TEST_SIZE / self.batch_size),
verbose=1
)
def evaluate_print(self):
loss, accuracy = self.evaluate()
print('Test evaluation: loss: {}, accuracy: {}'.format(loss, accuracy))
def train(self, config):
self.model.summary()
# map_callback = MapHistory()
out = self.model.fit(
self.training_batched,
batch_size=self.batch_size,
epochs=self.epochs,
steps_per_epoch=int(self.TRAIN_SIZE / self.batch_size),
class_weight=self.class_weight,
validation_data=self.test_batched,
validation_steps=int(self.TEST_SIZE / self.batch_size),
verbose=1
)
return out
def predict(self, a):
return np.argmax(self.model.predict(a), axis=1)
def prepare_for_use(self, df=None, batch_size=DEFAULT_BATCH_SIZE, layers=DEFAULT_LAYERS, out_activation=DEFAULT_OUT_ACTIVATION, loss=DEFAULT_LOSS, optimizer=DEFAULT_OPTIMIZER):
if df is None:
df = pd.read_pickle('data.p')
self.prepare_dataset(df, dataframe_to_dataset_biomes, batch_size=batch_size)
self.create_model(layers=layers, out_activation=out_activation)
self.compile(loss=loss, optimizer=optimizer)
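A hedged sketch of driving Model directly, outside Ray Tune (names and defaults as defined above; data.p must already exist):

```python
# train and evaluate the biome classifier by hand
import pandas as pd
from model import Model

m = Model('b', epochs=1)
m.prepare_for_use(df=pd.read_pickle('data.p'))  # dataset, network and compile step
m.train(None)                                   # the config argument is unused by Model.train
m.evaluate_print()
m.save('checkpoints/b.hdf5')
```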

biomes/predict.py Normal file

@@ -0,0 +1,58 @@
import fire
import numpy as np
from utils import *
#from nn import compile_b
from constants import INPUTS
from model import Model
from draw import draw
def predicted_map(B, change=0, path=None):
year = MAX_YEAR - 1
df = pd.read_pickle('data.p')
logger.info('temperature change of %s', change)
inputs = list(INPUTS)
for season in SEASONS:
inputs += [
'temp_{}_{}'.format(season, year),
'precip_{}_{}'.format(season, year)
]
# copy so the temperature shift below does not modify df or the normalization reference
frame = df[inputs + ['longitude']].copy()
frame_cp = df[inputs + ['longitude']].copy()
for season in SEASONS:
frame.loc[:, 'temp_{}_{}'.format(season, year)] += change
columns = ['latitude', 'longitude', 'biome_num']
new_data = pd.DataFrame(columns=columns)
nframe = pd.DataFrame(columns=frame.columns, data=normalize_ndarray(frame.to_numpy(), frame_cp.to_numpy()))
for i, (chunk, chunk_original) in enumerate(zip(chunker(nframe, B.batch_size), chunker(frame, B.batch_size))):
if chunk.shape[0] < B.batch_size:
continue
input_data = chunk.loc[:, inputs].values
out = B.predict(input_data)
f = pd.DataFrame({
'longitude': chunk_original.loc[:, 'longitude'],
'latitude': chunk_original.loc[:, 'latitude'],
'biome_num': out
}, columns=columns)
new_data = new_data.append(f)
draw(new_data, path=path)
def predicted_map_cmd(checkpoint='checkpoints/save.h5', change=0, path=None):
B = Model('b', epochs=1)
B.prepare_for_use()
B.restore(checkpoint)
predicted_map(B, change=change, path=path)
if __name__ == "__main__":
fire.Fire(predicted_map_cmd)
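predicted_map_cmd is exposed through fire, e.g. `python predict.py --checkpoint=checkpoints/save.h5 --change=2 --path=plus2.png`; the change value is added to every seasonal temperature before prediction, so the map shows the biomes the model expects under that warming. The same from Python, as a sketch:

```python
# equivalent to the fire CLI call above; the output path is just an example
from predict import predicted_map_cmd

predicted_map_cmd(checkpoint='checkpoints/save.h5', change=2, path='plus2.png')
```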

biomes/requirements.txt Normal file

@@ -0,0 +1,13 @@
geopandas==0.4.0
geopy==0.99
matplotlib==3.0.2
descartes==1.1.0
pysal==2.0.0
rasterio==1.0.15
tensorflow==1.13.1
Cartopy==0.17.0
numpy==1.16.1
scikit-learn==0.20.3
https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp36-cp36m-manylinux1_x86_64.whl
fire==0.1.3
psutil==5.6.1

biomes/train.py Normal file

@@ -0,0 +1,67 @@
import fire
import ray
import pandas as pd
import tensorflow as tf
from ray import tune
from tensorflow import keras
from utils import logger
from model import Model
B_params = {
'batch_size': tune.grid_search([256]),
'layers': tune.grid_search([[512, 512]]),
'lr': tune.grid_search([1e-4]),
'optimizer': tune.grid_search([tf.keras.optimizers.Adam]),
}
df = pd.read_pickle('data.p')
class TuneB(tune.Trainable):
def _setup(self, config):
logger.debug('Ray Tune model configuration %s', config)
self.model = Model('b', epochs=1)
optimizer = config['optimizer'](lr=config['lr'])
self.model.prepare_for_use(df=df, batch_size=config['batch_size'], layers=config['layers'], optimizer=optimizer)
def _train(self):
logs = self.model.train(self.config)
metrics = {
'mean_accuracy': logs.history['acc'][0],
'loss': logs.history['loss'][0],
'val_accuracy': logs.history['val_acc'][0],
'val_loss': logs.history['val_loss'][0],
}
return metrics
def _save(self, checkpoint_dir):
return self.model.save(checkpoint_dir)
def _restore(self, path):
return self.model.restore(path)
def start_tuning(cpu=1, gpu=2, checkpoint_freq=1, checkpoint_at_end=True, resume=False, restore=None, stop=500):
ray.init()
tune.run(TuneB,
config=B_params,
resources_per_trial={
"cpu": cpu,
"gpu": gpu
},
resume=resume,
checkpoint_at_end=checkpoint_at_end,
checkpoint_freq=checkpoint_freq,
restore=restore,
stop={
'training_iteration': stop
})
if __name__ == "__main__":
fire.Fire(start_tuning)
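start_tuning is likewise a fire command, so a search can be launched as `python train.py --cpu=4 --gpu=1 --stop=100` or from Python; the resource numbers below are examples and depend on the machine:

```python
# a minimal sketch of launching the Ray Tune grid search defined in B_params
from train import start_tuning

start_tuning(cpu=4, gpu=1, checkpoint_freq=1, stop=100)
```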

biomes/utils.py Normal file

@@ -0,0 +1,101 @@
import numpy as np
import tensorflow as tf
import pandas as pd
from collections import Counter
from sklearn.utils import class_weight
from constants import *
import logging
import os
logger = logging.getLogger('main')
logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))
def normalize(v, o=None):
if o is None:
o = v
return (v - np.mean(o)) / np.std(o)
def normalize_ndarray(ar, o=None):
if o is None:
o = ar
# transpose: operate over columns
tr = np.transpose(ar)
to = np.transpose(o)
for i in range(tr.shape[0]):
tr[i] = normalize(tr[i], to[i])
# transpose back
return np.transpose(tr)
def dataframe_to_dataset_biomes(df):
rows = df.shape[0]
# 8 for seasonal temp and precipitation
# 3 for latitude, elevation and distance_to_water
input_columns = 11
tf_inputs = np.empty((0, input_columns))
tf_output = np.empty((0))
for year in range(MIN_YEAR, MAX_YEAR + 1):
local_inputs = list(INPUTS)
for season in SEASONS:
local_inputs += [
'temp_{}_{}'.format(season, year),
'precip_{}_{}'.format(season, year)
]
local_df = df[local_inputs]
tf_inputs = np.concatenate((tf_inputs, local_df.values), axis=0)
tf_output = np.concatenate((tf_output, df[OUTPUT].values), axis=0)
# balance class weights for the loss function, since the data is highly unbalanced
num_classes = len(np.unique(tf_output))
class_weights = class_weight.compute_class_weight('balanced', np.unique(tf_output), tf_output)
logger.debug('class_weights %s', class_weights)
tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
tf_output = tf.cast(tf_output, tf.int64)
logger.debug('dataset size: rows=%d, input_columns=%d, num_classes=%d', int(tf_inputs.shape[0]), input_columns, num_classes)
return int(tf_inputs.shape[0]), input_columns, num_classes, class_weights, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))
def dataframe_to_dataset_temp_precip(df):
rows = df.shape[0]
# elevation, distance_to_water, latitude
# season, year
input_columns = 5
num_classes = 2
tf_inputs = np.empty((0, input_columns))
tf_output = np.empty((0, num_classes))
for year in range(MIN_YEAR, MAX_YEAR + 1):
local_inputs = list(INPUTS)
for idx, season in enumerate(SEASONS):
season_index = idx / len(SEASONS)
local_df = df[local_inputs]
local_df.loc[:, 'season'] = pd.Series(np.repeat(season_index, rows), index=local_df.index)
local_df.loc[:, 'year'] = pd.Series(np.repeat(year, rows), index=local_df.index)
output = ['temp_{}_{}'.format(season, year), 'precip_{}_{}'.format(season, year)]
tf_inputs = np.concatenate((tf_inputs, local_df.values), axis=0)
tf_output = np.concatenate((tf_output, df[output].values), axis=0)
tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
tf_output = tf.cast(tf_output, tf.float32)
logger.debug('dataset size: rows=%d, input_columns=%d, num_classes=%d', int(tf_inputs.shape[0]), input_columns, num_classes)
return int(tf_inputs.shape[0]), input_columns, num_classes, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))
flatten = lambda l: [item for sublist in l for item in sublist]
def chunker(seq, size):
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
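A short sketch of the two standalone helpers above; note that normalize_ndarray writes its result back into the array it is given, since it assigns through a transposed view:

```python
# quick check of normalize_ndarray and chunker
import numpy as np
from utils import normalize_ndarray, chunker

a = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
print(normalize_ndarray(a))              # each column scaled to zero mean, unit variance
print(list(chunker(list(range(7)), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]
```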