import numpy as np
import tensorflow as tf
import pandas as pd
from collections import Counter
from sklearn.utils import class_weight
from constants import *
import logging
import os

logger = logging.getLogger('main')
# honours e.g. LOG_LEVEL=DEBUG from the environment; defaults to INFO
logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))


def normalize(v, o=None):
    # z-score v using the mean and std of o (defaults to v itself)
    if o is None:
        o = v
    return (v - np.mean(o)) / np.std(o)


def denormalize(v, o=None):
    # inverse of normalize: map a z-scored value back to o's scale
    if o is None:
        o = v
    return v * np.std(o) + np.mean(o)


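# Illustrative check (not used by the pipeline): denormalize inverts
# normalize when both are given the same reference array.
# >>> v = np.array([1.0, 2.0, 3.0])
# >>> denormalize(normalize(v), v)
# array([1., 2., 3.])   (up to floating-point rounding)

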
def on_ndarray(ar, o=None, fn=None):
    if o is None:
        o = ar

    # transpose: operate over columns; work on a float copy so we neither
    # mutate the caller's array nor truncate results on integer input
    tr = np.transpose(ar).astype(np.float64)
    to = np.transpose(o)
    for i in range(tr.shape[0]):
        tr[i] = fn(tr[i], to[i])

    # transpose back
    return np.transpose(tr)


def normalize_ndarray(ar, o=None):
    return on_ndarray(ar, o=o, fn=normalize)


def denormalize_ndarray(ar, o=None):
    return on_ndarray(ar, o=o, fn=denormalize)


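# Illustrative example: each column is standardized independently.
# >>> a = np.array([[1.0, 2.0], [3.0, 4.0]])
# >>> normalize_ndarray(a)
# array([[-1., -1.],
#        [ 1.,  1.]])

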
def dataframe_to_dataset_biomes(df):
    rows = df.shape[0]

    # 8 for seasonal temp and precipitation
    # 3 for latitude, elevation and distance_to_water
    input_columns = 11

    tf_inputs = np.empty((0, input_columns))
    tf_output = np.empty((0,))

    for year in range(MIN_YEAR, MAX_YEAR + 1):
        local_inputs = list(INPUTS)
        for season in SEASONS:
            local_inputs += [
                'temp_{}_{}'.format(season, year),
                'precip_{}_{}'.format(season, year)
            ]

        local_df = df[local_inputs]

        tf_inputs = np.concatenate((tf_inputs, local_df.values), axis=0)
        tf_output = np.concatenate((tf_output, df[OUTPUT].values), axis=0)

    # balance class weights for the loss function, since the data is highly unbalanced
    num_classes = len(np.unique(tf_output))
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=np.unique(tf_output), y=tf_output)
    logger.debug('class_weights %s', class_weights)

    tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
    tf_output = tf.cast(tf_output, tf.int64)

    logger.debug('dataset size: rows=%d, input_columns=%d, num_classes=%d', int(tf_inputs.shape[0]), input_columns, num_classes)
    return int(tf_inputs.shape[0]), input_columns, num_classes, class_weights, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))


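# Usage sketch (illustrative; assumes `df` carries the INPUTS/OUTPUT columns
# plus the seasonal temp_*/precip_* columns named in constants.py, and 32 is
# an arbitrary batch size):
#
#   rows, columns, classes, weights, dataset = dataframe_to_dataset_biomes(df)
#   dataset = dataset.shuffle(rows).batch(32)

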
def dataframe_to_dataset_temp(df):
    rows = df.shape[0]

    # elevation, distance_to_water, latitude, mean_temp
    input_columns = 4
    # 4 seasons
    num_classes = 4

    tf_inputs = np.empty((0, input_columns))
    tf_output = np.empty((0, num_classes))

    for year in range(MIN_YEAR, MAX_YEAR + 1):
        local_inputs = list(INPUTS)
        # copy so the .loc assignment below writes to a fresh frame, not a
        # view of df
        local_df = df[local_inputs].copy()
        all_temps = ['temp_{}_{}'.format(season, year) for season in SEASONS]
        # per-row mean over the four seasons (axis=1); without it np.mean
        # collapses the whole year to a single scalar
        local_df.loc[:, 'mean_temp'] = np.mean(df[all_temps].values, axis=1)

        output = all_temps

        tf_inputs = np.concatenate((tf_inputs, local_df.values), axis=0)
        tf_output = np.concatenate((tf_output, df[output].values), axis=0)

    tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
    tf_output = tf.cast(normalize_ndarray(tf_output), tf.float32)

    logger.debug('dataset size: rows=%d, input_columns=%d, num_classes=%d', int(tf_inputs.shape[0]), input_columns, num_classes)
    return int(tf_inputs.shape[0]), input_columns, num_classes, None, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))


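# Note (illustrative): the targets above are normalized, so model predictions
# have to be mapped back to real units, e.g. with
# denormalize_ndarray(preds, df[all_temps].values) for a hypothetical (n, 4)
# array of predictions `preds` and that year's seasonal columns.

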
def dataframe_to_dataset_precip(df):
    rows = df.shape[0]

    # elevation, distance_to_water, latitude, mean_precip
    input_columns = 4
    # 4 seasons
    num_classes = 4

    tf_inputs = np.empty((0, input_columns))
    tf_output = np.empty((0, num_classes))

    for year in range(MIN_YEAR, MAX_YEAR + 1):
        local_inputs = list(INPUTS)
        # copy so the .loc assignment below writes to a fresh frame, not a
        # view of df
        local_df = df[local_inputs].copy()
        all_precips = ['precip_{}_{}'.format(season, year) for season in SEASONS]
        # per-row mean over the four seasons (axis=1); without it np.mean
        # collapses the whole year to a single scalar
        local_df.loc[:, 'mean_precip'] = np.mean(df[all_precips].values, axis=1)

        output = all_precips

        tf_inputs = np.concatenate((tf_inputs, local_df.values), axis=0)
        tf_output = np.concatenate((tf_output, df[output].values), axis=0)

    tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
    tf_output = tf.cast(normalize_ndarray(tf_output), tf.float32)

    logger.debug('dataset size: rows=%d, input_columns=%d, num_classes=%d', int(tf_inputs.shape[0]), input_columns, num_classes)
    return int(tf_inputs.shape[0]), input_columns, num_classes, None, tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))


# flatten one level of nesting, e.g. [[1, 2], [3]] -> [1, 2, 3]
flatten = lambda l: [item for sublist in l for item in sublist]


def chunker(seq, size):
    # yield successive size-sized slices; the final chunk may be shorter
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
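# Illustrative example:
# >>> list(chunker(list(range(5)), 2))
# [[0, 1], [2, 3], [4]]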