refactor(data): include latitude longitude in columns, not indices

This commit is contained in:
Mahdi Dibaiee 2019-03-05 11:29:30 +03:30
parent 865cc775ed
commit 3dcafddb8c
12 changed files with 220 additions and 42 deletions

1
.floydexpt Normal file
View File

@ -0,0 +1 @@
{"name": "world", "namespace": "mdibaiee", "family_id": "prj_HzeYYJXLyy2otH6W"}

19
.floydignore Normal file
View File

@ -0,0 +1,19 @@
maps
logs
checkpoints.*
geodata
*.p
# Directories and files to ignore when uploading code to floyd
.git
.eggs
eggs
lib
lib64
parts
sdist
var
*.pyc
*.swp
.DS_Store

Binary file not shown.

19
data.py
View File

@ -46,9 +46,8 @@ for year in range(MIN_YEAR, MAX_YEAR + 1):
for s in SEASONS: for s in SEASONS:
temp_precip_columns += ['temp_{}_{}'.format(s, year), 'precip_{}_{}'.format(s, year)] temp_precip_columns += ['temp_{}_{}'.format(s, year), 'precip_{}_{}'.format(s, year)]
columns = ['biome_num', 'biome_name', 'elevation', 'distance_to_water'] + temp_precip_columns columns = ['longitude', 'latitude', 'biome_num', 'biome_name', 'elevation', 'distance_to_water'] + temp_precip_columns
indices = ['longitude', 'latitude'] final_data = pd.DataFrame(columns=columns)
final_data = pd.DataFrame(index=indices, columns=columns)
def get_point_information(longitude, latitude): def get_point_information(longitude, latitude):
item = {} item = {}
@ -57,6 +56,8 @@ def get_point_information(longitude, latitude):
if ecoregion.empty: if ecoregion.empty:
return False return False
item['longitude'] = longitude
item['latitude'] = latitude
item['biome_num'] = ecoregion.BIOME_NUM.iloc[0] item['biome_num'] = ecoregion.BIOME_NUM.iloc[0]
item['biome_name'] = ecoregion.BIOME_NAME.iloc[0] item['biome_name'] = ecoregion.BIOME_NAME.iloc[0]
@ -100,18 +101,18 @@ def get_point_information(longitude, latitude):
return item return item
data_indices = [] data = {}
data_map = {}
for col in columns: for col in columns:
data_map[col] = {} data[col] = []
i = 0 # i = 0
start_time = time.time() start_time = time.time()
for longitude in range(-179, 179): for longitude in range(-179, 179):
print('-', end='') print('-', end='')
for latitude in range(-89, 89): for latitude in range(-89, 89):
# generate data and save to file # generate data and save to file
d = get_point_information(longitude, latitude) d = get_point_information(longitude, latitude)
if d == False: if d == False:
@ -119,7 +120,7 @@ for longitude in range(-179, 179):
continue continue
for key, value in d.items(): for key, value in d.items():
data_map[key][(longitude, latitude)] = value data[key].append(value)
print('+', end='') print('+', end='')
@ -128,7 +129,7 @@ for longitude in range(-179, 179):
print("--- Calculations: %s seconds ---" % (time.time() - start_time)) print("--- Calculations: %s seconds ---" % (time.time() - start_time))
start_time = time.time() start_time = time.time()
df = pd.DataFrame(data_map) df = pd.DataFrame(data)
print("--- Generating DataFrame: %s seconds ---" % (time.time() - start_time)) print("--- Generating DataFrame: %s seconds ---" % (time.time() - start_time))
print(df) print(df)
start_time = time.time() start_time = time.time()

19
demo.py Normal file
View File

@ -0,0 +1,19 @@
import pandas as pd
from utils import *
# Demo script: load the pickled dataset, export it as CSV, and print a short
# summary of each dataset built from it by the helpers in utils.
df = pd.read_pickle('data_final.p')
df.to_csv('data_final.csv')

print('DataFrame:')
print(df)

# Each entry pairs a dataset builder with the message used to report on it.
# The builders are called in the same order as before: biomes first, then
# temperature/precipitation.
reports = (
    (dataframe_to_dataset_biomes,
     'Biomes dataset:\n - size: {}\n - inputs: {}\n - outputs: {}\n'),
    (dataframe_to_dataset_temp_precip,
     'Temp/Precip dataset:\n - size: {}\n - inputs: {}\n - outputs: {}\n'),
)

for build_dataset, message in reports:
    # The fourth element returned by the builder is not needed here.
    dataset_size, features, output_size, _ = build_dataset(df)
    print(message.format(dataset_size, features, output_size))

# Disabled: also print and export the normalized frame.
# print('Normalized Data:')
# print(normalize_df(df))
# normalize_df(df).to_csv('data_normalized.csv')

View File

@ -10,8 +10,8 @@ def draw(df, path=None):
biome_numbers = df['biome_num'].unique() biome_numbers = df['biome_num'].unique()
# biome_names = df['biome_name'].unique() # biome_names = df['biome_name'].unique()
for (longitude, latitude), row in df.iterrows(): for i, row in df.iterrows():
p = Point(longitude, latitude) p = Point(row.longitude, row.latitude)
if row.biome_num in biomes: if row.biome_num in biomes:
biomes[row.biome_num].append(p) biomes[row.biome_num].append(p)
else: else:
@ -55,5 +55,5 @@ def draw(df, path=None):
plt.show() plt.show()
if __name__ == "__main__": if __name__ == "__main__":
df = pd.read_pickle('data_final.p') df = pd.read_pickle('data.p')
draw(df) draw(df)

23
floyd.yml Normal file
View File

@ -0,0 +1,23 @@
# see: https://docs.floydhub.com/floyd_config
# All supported configs:
#
#machine: cpu
#env: tensorflow-1.8
#input:
# - destination: input
# source: foo/datasets/yelp-food/1
# - foo/datasets/yelp-food-test/1:test
#description: this is a test
#max_runtime: 3600
#command: python train.py
# You can also define multiple tasks to use with --task argument:
#
#task:
# evaluate:
# machine: gpu
# command: python evaluate.py
#
# serve:
# machine: cpu
# mode: serve

25
nn.py
View File

@ -14,12 +14,14 @@ from utils import *
RANDOM_SEED = 1 RANDOM_SEED = 1
tf.enable_eager_execution() print(tf.__version__)
# tf.enable_eager_execution()
tf.set_random_seed(RANDOM_SEED) tf.set_random_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED) np.random.seed(RANDOM_SEED)
df = pd.read_pickle('data_final.p') df = pd.read_pickle('data.p')
class Model(): class Model():
def __init__(self, name, batch_size=16, shuffle_buffer_size=500, learning_rate=0.001, epochs=1): def __init__(self, name, batch_size=16, shuffle_buffer_size=500, learning_rate=0.001, epochs=1):
@ -40,17 +42,21 @@ class Model():
(training, test) = (self.dataset.take(self.TRAIN_SIZE).batch(self.batch_size).repeat(), (training, test) = (self.dataset.take(self.TRAIN_SIZE).batch(self.batch_size).repeat(),
self.dataset.skip(self.TRAIN_SIZE).batch(self.batch_size).repeat()) self.dataset.skip(self.TRAIN_SIZE).batch(self.batch_size).repeat())
print('dataset: size={}, train={}, test={}'.format(dataset_size, self.TRAIN_SIZE, self.TEST_SIZE))
print('input_size={}'.format(features))
self.dataset_size = dataset_size self.dataset_size = dataset_size
self.features = features self.features = features
self.output_size = output_size self.output_size = output_size
self.training = training self.training = training
self.test = test self.test = test
def create_model(self, layers, out_activation): def create_model(self, layers, out_activation=None):
params = { params = {
'kernel_initializer': 'lecun_uniform', 'kernel_initializer': 'lecun_uniform',
'bias_initializer': 'zeros', 'bias_initializer': 'zeros',
} }
# dropout = keras.layers.Dropout(0.2, input_shape=[self.features])
self.model = keras.Sequential([ self.model = keras.Sequential([
keras.layers.Dense(layers[0], activation=tf.nn.elu, input_shape=[self.features], **params) keras.layers.Dense(layers[0], activation=tf.nn.elu, input_shape=[self.features], **params)
] + [ ] + [
@ -60,7 +66,7 @@ class Model():
]) ])
def compile(self, loss='mse', metrics=['accuracy'], optimizer=tf.train.AdamOptimizer): def compile(self, loss='mse', metrics=['accuracy'], optimizer=tf.train.AdamOptimizer):
# self.model.load_weights(self.path) self.model.load_weights(self.path)
optimizer = optimizer(self.learning_rate) optimizer = optimizer(self.learning_rate)
self.model.compile(loss=loss, self.model.compile(loss=loss,
@ -79,7 +85,7 @@ class Model():
self.model.summary() self.model.summary()
checkpoint = keras.callbacks.ModelCheckpoint(self.path, monitor='acc', verbose=1, mode='max') checkpoint = keras.callbacks.ModelCheckpoint(self.path, monitor='acc', verbose=1, mode='max')
tensorboard = keras.callbacks.TensorBoard(log_dir='./logs') tensorboard = keras.callbacks.TensorBoard(log_dir='./logs', update_freq='epoch')
# map_callback = keras.callbacks.LambdaCallback(on_epoch_end=self.map_callback) # map_callback = keras.callbacks.LambdaCallback(on_epoch_end=self.map_callback)
self.model.fit( self.model.fit(
@ -95,16 +101,17 @@ class Model():
return np.argmax(self.model.predict(a), axis=1) return np.argmax(self.model.predict(a), axis=1)
A = Model('a', epochs=2) A = Model('a', epochs=2)
B = Model('b', learning_rate=0.005, epochs=100) B = Model('b', learning_rate=0.001, epochs=450)
def compile_b(): def compile_b():
B.prepare_dataset(df, dataframe_to_dataset_biomes) B.prepare_dataset(df, dataframe_to_dataset_biomes)
B.create_model([64, 128], tf.nn.softmax) B.create_model([32], tf.nn.softmax)
B.compile(loss='sparse_categorical_crossentropy') B.compile(loss='sparse_categorical_crossentropy')
def compile_a(): def compile_a():
A.prepare_dataset(df, dataframe_to_dataset_temp_precip) A.prepare_dataset(df, dataframe_to_dataset_temp_precip)
A.create_model([(4, tf.nn.elu)]) A.create_model([(4, tf.nn.elu)])
# A.create_model([]) # linear model
A.compile(metrics=['accuracy', 'mae']) A.compile(metrics=['accuracy', 'mae'])
if __name__ == "__main__": if __name__ == "__main__":
@ -118,5 +125,5 @@ if __name__ == "__main__":
# print(np.unique(predictions)) # print(np.unique(predictions))
# print('loss: {}, evaluation: {}'.format(*B.evaluate())) # print('loss: {}, evaluation: {}'.format(*B.evaluate()))
compile_a() # compile_a()
A.train() # A.train()

View File

@ -1,7 +1,7 @@
import numpy as np import numpy as np
from utils import * from utils import *
from nn import B from nn import B, compile_b
from draw import draw from draw import draw
import time import time
@ -10,16 +10,14 @@ def chunker(seq, size):
year = MAX_YEAR - 1 year = MAX_YEAR - 1
df = pd.read_pickle('data_final.p') df = pd.read_pickle('data.p')
latitude = np.array(df.index.get_level_values(1))
df.loc[:, 'latitude'] = pd.Series(latitude, index=df.index)
compile_b() compile_b()
for change in range(0, 1): for change in range(0, 1):
print('TEMPERATURE MODIFICATION OF {}'.format(change)) print('TEMPERATURE MODIFICATION OF {}'.format(change))
inputs = ['elevation', 'distance_to_water'] inputs = ['latitude', 'longitude', 'elevation', 'distance_to_water']
for season in SEASONS: for season in SEASONS:
inputs += [ inputs += [
@ -27,22 +25,28 @@ for change in range(0, 1):
'precip_{}_{}'.format(season, year) 'precip_{}_{}'.format(season, year)
] ]
inputs += ['latitude'] # print(inputs)
frame = df[inputs] frame = df[inputs]
print(frame.head()) # print(frame.head())
for season in SEASONS: for season in SEASONS:
frame.loc[:, 'temp_{}_{}'.format(season, year)] += change frame.loc[:, 'temp_{}_{}'.format(season, year)] += change
columns = ['biome_num'] columns = ['latitude', 'longitude', 'biome_num']
new_data = pd.DataFrame(columns=columns) new_data = pd.DataFrame(columns=columns)
for i, chunk in enumerate(chunker(frame, B.batch_size)): for i, chunk in enumerate(chunker(frame, B.batch_size)):
input_data = normalize_ndarray(chunk.values) if chunk.shape[0] < B.batch_size:
continue
input_data = normalize_ndarray(chunk.loc[:, chunk.columns != 'longitude'].values)
out = B.predict(input_data) out = B.predict(input_data)
new_index = np.concatenate((chunk.index.values, new_data.index.values))
new_data = new_data.reindex(new_index) f = pd.DataFrame({
new_data.loc[chunk.index.values, 'biome_num'] = out 'longitude': chunk.loc[:, 'longitude'],
'latitude': chunk.loc[:, 'latitude'],
'biome_num': out
}, columns=columns)
new_data = new_data.append(f)
print(new_data)
draw(new_data) draw(new_data)

View File

@ -4,6 +4,6 @@ matplotlib==3.0.2
descartes==1.1.0 descartes==1.1.0
pysal==2.0.0 pysal==2.0.0
rasterio==1.0.15 rasterio==1.0.15
tensorflow==1.12.0 tensorflow==1.13.1
Cartopy==0.17.0 Cartopy==0.17.0
numpy==1.16.1 numpy==1.16.1

89
tracks Normal file
View File

@ -0,0 +1,89 @@
Layer (type) Output Shape Param #
=================================================================
Group 1
-----------------------------------------------------------------
dense (Dense) (None, 128) 1536
_________________________________________________________________
dense_1 (Dense) (None, 256) 33024
_________________________________________________________________
dense_2 (Dense) (None, 14) 3598
-----------------------------------------------------------------
Total params: 38,158
1 Epoch: loss: 0.3822 - acc: 0.8684
Learning rate: 0.005
=================================================================
Group 2
-----------------------------------------------------------------
dense (Dense) (None, 32) 384
_________________________________________________________________
dense_1 (Dense) (None, 64) 2112
_________________________________________________________________
dense_2 (Dense) (None, 32) 2080
_________________________________________________________________
dense_3 (Dense) (None, 14) 462
-----------------------------------------------------------------
Total params: 5,038
1 Epoch: loss: 0.3760 - acc: 0.8678 @ 20minutes
Stopped converging, loss increasing
Learning rate: 0.005
=================================================================
Group 3
-----------------------------------------------------------------
dense (Dense) (None, 16) 192
_________________________________________________________________
dense_1 (Dense) (None, 32) 544
_________________________________________________________________
dense_2 (Dense) (None, 16) 528
_________________________________________________________________
dense_3 (Dense) (None, 14) 238
-----------------------------------------------------------------
Total params: 1,502
1 Epoch: loss: 0.3702 - acc: 0.8671 @ 12minutes
10 Epochs: loss: 0.3280 - acc: 0.8815
Stopped converging after 5 epochs, was oscillating
Learning rate: 0.005
=================================================================
Group 4
_________________________________________________________________
dense (Dense) (None, 12) 144
_________________________________________________________________
dense_1 (Dense) (None, 14) 182
_________________________________________________________________
Total params: 326
1 Epoch: loss: 0.4412 - acc: 0.8457 @ 10m
60 Epochs: loss: 0.4146 - acc: 0.8546
Stopped converging
Learning rate: 0.005
=================================================================
Group 5
_________________________________________________________________
dense (Dense) (None, 12) 144
_________________________________________________________________
dense_1 (Dense) (None, 14) 182
_________________________________________________________________
Total params: 326
1 Epoch: loss: 0.5057 - acc: 0.8268 @ 10m
15 Epochs: loss: 0.4240 - acc: 0.8481
Stopped converging
Learning rate: 0.001
=================================================================
Group 6
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 24) 288
_________________________________________________________________
dense_1 (Dense) (None, 14) 350
_________________________________________________________________
Total params: 638
1 Epoch: loss: 0.4520 - acc: 0.8416 @ 12m
30 Epochs: loss: 0.3562 - acc: 0.8691, still converging
Stopped converging after 100 epochs
Learning rate: 0.001

View File

@ -3,7 +3,7 @@ import tensorflow as tf
import pandas as pd import pandas as pd
from constants import * from constants import *
inputs = ['elevation', 'distance_to_water'] inputs = ['elevation', 'distance_to_water', 'latitude']
output = 'biome_num' output = 'biome_num'
def normalize(v): def normalize(v):
@ -18,7 +18,7 @@ def normalize_ndarray(ar):
def normalize_df(df): def normalize_df(df):
for col in df.columns: for col in df.columns:
df.loc[col] = normalize(df[col]) df.loc[col] = normalize_ndarray(df[col])
return df return df
@ -29,9 +29,24 @@ def dataframe_to_dataset_biomes(df):
# 3 for latitude, elevation and distance_to_water # 3 for latitude, elevation and distance_to_water
columns = 11 columns = 11
# make biomes uniformly distributed so each biome has enough data to avoid a biased dataset
biome_shares = df.groupby(['biome_num']).agg({ 'biome_num': lambda x: x.count() / df.shape[0] })
max_share = np.max(biome_shares['biome_num'])
dsize = df.shape[0]
max_share_count = int(max_share * dsize)
for biome_num in biome_shares.index:
share = biome_shares.values[biome_num][0]
share_count = int(share * dsize)
diff = max_share_count - share_count
rows = df.loc[df['biome_num'] == biome_num]
diff_ratio = int(diff / rows.shape[0])
df = pd.concat([df] + [rows] * diff_ratio, ignore_index=True)
# print(df.groupby(['biome_num']).agg({ 'biome_num': lambda x: x.count() / df.shape[0] }))
tf_inputs = np.empty((0, columns)) tf_inputs = np.empty((0, columns))
tf_output = np.empty((0)) tf_output = np.empty((0))
latitude = np.array(df.index.get_level_values(1))
for year in range(MIN_YEAR, MAX_YEAR + 1): for year in range(MIN_YEAR, MAX_YEAR + 1):
local_inputs = list(inputs) local_inputs = list(inputs)
@ -43,7 +58,6 @@ def dataframe_to_dataset_biomes(df):
local_df = df[local_inputs] local_df = df[local_inputs]
local_df.loc[:, 'latitude'] = pd.Series(latitude, index=local_df.index)
tf_inputs = np.concatenate((tf_inputs, local_df.values), axis=0) tf_inputs = np.concatenate((tf_inputs, local_df.values), axis=0)
tf_output = np.concatenate((tf_output, df[output].values), axis=0) tf_output = np.concatenate((tf_output, df[output].values), axis=0)
@ -62,7 +76,6 @@ def dataframe_to_dataset_temp_precip(df):
tf_inputs = np.empty((0, columns)) tf_inputs = np.empty((0, columns))
tf_output = np.empty((0, 2)) tf_output = np.empty((0, 2))
latitude = np.array(df.index.get_level_values(1))
for year in range(MIN_YEAR, MAX_YEAR + 1): for year in range(MIN_YEAR, MAX_YEAR + 1):
local_inputs = list(inputs) local_inputs = list(inputs)
@ -70,7 +83,6 @@ def dataframe_to_dataset_temp_precip(df):
for idx, season in enumerate(SEASONS): for idx, season in enumerate(SEASONS):
season_index = idx / len(season) season_index = idx / len(season)
local_df = df[local_inputs] local_df = df[local_inputs]
local_df.loc[:, 'latitude'] = pd.Series(latitude, index=local_df.index)
local_df.loc[:, 'season'] = pd.Series(np.repeat(season_index, rows), index=local_df.index) local_df.loc[:, 'season'] = pd.Series(np.repeat(season_index, rows), index=local_df.index)
local_df.loc[:, 'year'] = pd.Series(np.repeat(year, rows), index=local_df.index) local_df.loc[:, 'year'] = pd.Series(np.repeat(year, rows), index=local_df.index)
@ -79,7 +91,10 @@ def dataframe_to_dataset_temp_precip(df):
tf_output = np.concatenate((tf_output, df[output].values), axis=0) tf_output = np.concatenate((tf_output, df[output].values), axis=0)
tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32) tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
tf_output = tf.cast(normalize_ndarray(tf_output), tf.float32) tf_output = tf.cast(tf_output, tf.float32)
return int(tf_inputs.shape[0]), 5, 2, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output)) return int(tf_inputs.shape[0]), 5, 2, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))
# df = pd.read_pickle('data.p')
# print(dataframe_to_dataset_biomes(df))