2019-02-12 05:11:33 +00:00
|
|
|
import numpy as np
|
|
|
|
import tensorflow as tf
|
|
|
|
import pandas as pd
|
|
|
|
from constants import *
|
|
|
|
|
2019-03-05 07:59:30 +00:00
|
|
|
# Static per-location feature columns shared by both dataset builders below.
inputs = ['elevation', 'distance_to_water', 'latitude']
|
2019-02-12 05:11:33 +00:00
|
|
|
# Target column for biome classification (read by dataframe_to_dataset_biomes).
output = 'biome_num'
|
|
|
|
|
2019-02-14 09:06:09 +00:00
|
|
|
def normalize(v):
    """Standardize *v*: subtract its mean, then divide by its population std.

    Uses np.std's default ddof=0 (population standard deviation).
    """
    centered = v - np.mean(v)
    return centered / np.std(v)
|
2019-02-14 09:06:09 +00:00
|
|
|
|
|
|
|
def normalize_ndarray(ar):
    """Standardize each column of *ar* (zero mean, unit population std).

    Fixes over the original implementation:
    - The original wrote through a transpose VIEW, mutating the caller's
      array in place; this version works on a float copy and leaves the
      input untouched.
    - Integer-dtype input no longer truncates the normalized values.
    - 1-D input is normalized as a whole vector instead of producing NaNs
      from per-element standardization.

    Returns a new float64 array of the same shape as *ar*.
    """
    ar = np.asarray(ar, dtype=np.float64)
    # axis=0 standardizes each column, matching the original's
    # transpose-loop-transpose over columns for 2-D input.
    return (ar - np.mean(ar, axis=0)) / np.std(ar, axis=0)
|
|
|
|
|
|
|
|
def normalize_df(df):
    """Standardize every column of *df* in place and return it.

    Bug fix: the original did `df.loc[col] = ...`, which label-indexes a
    ROW named after the column (appending spurious rows) rather than
    updating the column. It also routed a 1-D Series through
    normalize_ndarray, whose per-element loop produced NaNs.
    """
    for col in df.columns:
        vals = df[col].values.astype(np.float64)
        # Population std (ddof=0), consistent with normalize() above.
        df.loc[:, col] = (vals - np.mean(vals)) / np.std(vals)
    return df
|
|
|
|
|
2019-02-12 05:11:33 +00:00
|
|
|
def dataframe_to_dataset_biomes(df):
    """Build a biome-classification dataset from *df*.

    Oversamples under-represented biomes so each biome approaches the row
    count of the most common one, then emits one example per (row, year)
    pairing the static inputs with that year's seasonal temp/precip columns.

    Returns a tuple:
        (num_examples, num_input_columns, num_classes, tf.data.Dataset)
    where num_input_columns is 11 and num_classes is hard-coded to 14
    (presumably the number of distinct biome_num values — TODO confirm
    against the data; keep in sync with `columns` below).
    """
    rows = df.shape[0]

    # 8 for seasonal temp and precipitation
    # 3 for latitude, elevation and distance_to_water
    columns = 11

    # make biomes uniformly distributed so each biome has enough data to avoid a biased dataset
    # biome_shares maps biome_num -> fraction of all rows having that biome.
    biome_shares = df.groupby(['biome_num']).agg({ 'biome_num': lambda x: x.count() / df.shape[0] })
    max_share = np.max(biome_shares['biome_num'])
    dsize = df.shape[0]
    # Row count of the most frequent biome — the target count for all biomes.
    max_share_count = int(max_share * dsize)

    for biome_num in biome_shares.index:
        # NOTE(review): `.values[biome_num]` is POSITIONAL indexing — this is
        # only correct if biome_num values are exactly 0..n-1 with no gaps;
        # verify, or switch to label-based `.loc` if that assumption breaks.
        share = biome_shares.values[biome_num][0]
        share_count = int(share * dsize)
        # How many extra rows this biome needs to reach max_share_count.
        diff = max_share_count - share_count
        # NOTE: reuses (shadows) the earlier scalar `rows` as a DataFrame here.
        rows = df.loc[df['biome_num'] == biome_num]
        # Whole-copy replication factor; the int() floor means the final
        # distribution is only approximately uniform.
        diff_ratio = int(diff / rows.shape[0])
        df = pd.concat([df] + [rows] * diff_ratio, ignore_index=True)

    # print(df.groupby(['biome_num']).agg({ 'biome_num': lambda x: x.count() / df.shape[0] }))

    # Accumulate examples; one batch of rows per year.
    tf_inputs = np.empty((0, columns))
    tf_output = np.empty((0))

    for year in range(MIN_YEAR, MAX_YEAR + 1):
        # Static inputs plus this year's 8 seasonal climate columns.
        local_inputs = list(inputs)
        for season in SEASONS:
            local_inputs += [
                'temp_{}_{}'.format(season, year),
                'precip_{}_{}'.format(season, year)
            ]

        local_df = df[local_inputs]

        tf_inputs = np.concatenate((tf_inputs, local_df.values), axis=0)
        # Same biome label repeated for every year's copy of a row.
        tf_output = np.concatenate((tf_output, df[output].values), axis=0)

    # Column-wise standardization before handing to TensorFlow.
    tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
    tf_output = tf.cast(tf_output, tf.int64)

    return int(tf_inputs.shape[0]), 11, 14, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))
|
2019-02-14 09:06:09 +00:00
|
|
|
|
|
|
|
def dataframe_to_dataset_temp_precip(df):
    """Build a (temp, precip) regression dataset from *df*.

    Emits one example per (row, year, season): the static inputs plus a
    numeric season encoding and the year, targeting that season's
    temperature and precipitation columns.

    Returns a tuple:
        (num_examples, num_input_columns, num_outputs, tf.data.Dataset)
    with num_input_columns = 5 and num_outputs = 2.

    Fixes over the original:
    - season_index divided by len(season) — the length of the season's
      NAME string — instead of len(SEASONS); seasons are now encoded as
      evenly spaced values idx / len(SEASONS) in [0, 1).
    - local_df is an explicit .copy() so the added 'season'/'year' columns
      never write into a view of *df* (SettingWithCopy).
    - the local target list no longer shadows the module-level `output`.
    """
    rows = df.shape[0]

    # elevation, distance_to_water, latitude
    # season, year
    columns = 5

    tf_inputs = np.empty((0, columns))
    tf_output = np.empty((0, 2))

    for year in range(MIN_YEAR, MAX_YEAR + 1):
        local_inputs = list(inputs)

        for idx, season in enumerate(SEASONS):
            # Evenly spaced season encoding in [0, 1).
            season_index = idx / len(SEASONS)

            # Copy so the column assignments below cannot mutate df.
            local_df = df[local_inputs].copy()
            local_df.loc[:, 'season'] = pd.Series(np.repeat(season_index, rows), index=local_df.index)
            local_df.loc[:, 'year'] = pd.Series(np.repeat(year, rows), index=local_df.index)

            # Regression targets for this (season, year).
            target_cols = ['temp_{}_{}'.format(season, year), 'precip_{}_{}'.format(season, year)]

            tf_inputs = np.concatenate((tf_inputs, local_df.values), axis=0)
            tf_output = np.concatenate((tf_output, df[target_cols].values), axis=0)

    # Column-wise standardization before handing to TensorFlow.
    tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
    tf_output = tf.cast(tf_output, tf.float32)

    return int(tf_inputs.shape[0]), 5, 2, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))
|
2019-02-14 09:06:09 +00:00
|
|
|
|
2019-03-05 07:59:30 +00:00
|
|
|
|
|
|
|
# df = pd.read_pickle('data.p')
|
|
|
|
# print(dataframe_to_dataset_biomes(df))
|