import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.utils import class_weight
from constants import *
import logging
import os

logger = logging.getLogger('main')
logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))


def normalize(v, o=None):
    """Normalize `v` to zero mean / unit variance, taking the statistics
    from `o` (defaults to `v` itself)."""
    if o is None:
        o = v
    return (v - np.mean(o)) / np.std(o)


def denormalize(v, o=None):
    """Invert `normalize`, taking the statistics from `o`."""
    if o is None:
        o = v
    return v * np.std(o) + np.mean(o)


def on_ndarray(ar, o=None, fn=None):
    """Apply `fn` column-wise over a 2-D array, pairing each column of `ar`
    with the matching column of the reference array `o`."""
    if o is None:
        o = ar
    # transpose so each row of `tr` is a column of `ar`; work on a float
    # copy so the caller's array is not mutated in place
    tr = np.transpose(ar).astype(np.float64)
    to = np.transpose(o)
    for i in range(tr.shape[0]):
        tr[i] = fn(tr[i], to[i])
    # transpose back to the original orientation
    return np.transpose(tr)


def normalize_ndarray(ar, o=None):
    return on_ndarray(ar, o=o, fn=normalize)


def denormalize_ndarray(ar, o=None):
    return on_ndarray(ar, o=o, fn=denormalize)


def dataframe_to_dataset_biomes(df):
    # 8 columns for seasonal temp and precipitation (4 seasons x 2),
    # 3 for latitude, elevation and distance_to_water
    input_columns = 11

    tf_inputs = np.empty((0, input_columns))
    tf_output = np.empty((0,))

    # stack one block of rows per year: the static inputs plus that year's
    # seasonal temperature and precipitation columns
    for year in range(MIN_YEAR, MAX_YEAR + 1):
        local_inputs = list(INPUTS)
        for season in SEASONS:
            local_inputs += [
                'temp_{}_{}'.format(season, year),
                'precip_{}_{}'.format(season, year),
            ]

        local_df = df[local_inputs]
        tf_inputs = np.concatenate((tf_inputs, local_df.values), axis=0)
        tf_output = np.concatenate((tf_output, df[OUTPUT].values), axis=0)

    # balance class weights for the loss function, since the data is
    # highly imbalanced
    num_classes = len(np.unique(tf_output))
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=np.unique(tf_output), y=tf_output)
    logger.debug('class_weights %s', class_weights)

    tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
    tf_output = tf.cast(tf_output, tf.int64)

    logger.debug('dataset size: rows=%d, input_columns=%d, num_classes=%d',
                 int(tf_inputs.shape[0]), input_columns, num_classes)
    return (int(tf_inputs.shape[0]), input_columns, num_classes, class_weights,
            tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output)))


def dataframe_to_dataset_temp_precip(df):
    # elevation, distance_to_water, latitude, mean_temp, mean_precip
    input_columns = 5
    # regression targets: (temp, precip) * 4 seasons
    num_classes = 8

    tf_inputs = np.empty((0, input_columns))
    tf_output = np.empty((0, num_classes))

    for year in range(MIN_YEAR, MAX_YEAR + 1):
        # copy() so assigning the derived mean columns below does not
        # raise SettingWithCopyWarning on a slice of `df`
        local_df = df[list(INPUTS)].copy()

        all_temps = ['temp_{}_{}'.format(season, year) for season in SEASONS]
        all_precips = ['precip_{}_{}'.format(season, year) for season in SEASONS]

        # per-row yearly means over the four seasons
        local_df.loc[:, 'mean_temp'] = np.mean(df[all_temps].values, axis=1)
        local_df.loc[:, 'mean_precip'] = np.mean(df[all_precips].values, axis=1)

        output = all_temps + all_precips

        tf_inputs = np.concatenate((tf_inputs, local_df.values), axis=0)
        tf_output = np.concatenate((tf_output, df[output].values), axis=0)

    tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
    tf_output = tf.cast(tf_output, tf.float32)

    logger.debug('dataset size: rows=%d, input_columns=%d, num_classes=%d',
                 int(tf_inputs.shape[0]), input_columns, num_classes)
    return (int(tf_inputs.shape[0]), input_columns, num_classes,
            tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output)))


def flatten(l):
    """Flatten one level of nesting."""
    return [item for sublist in l for item in sublist]


def chunker(seq, size):
    """Yield successive chunks of `seq` of at most `size` elements."""
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
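

# Hedged usage sketch (an illustrative addition, not original code): a minimal
# round-trip check for the column-wise normalization helpers and the small
# list utilities above, run on synthetic data.
def _demo_normalization():
    rng = np.random.RandomState(0)
    data = rng.normal(loc=10.0, scale=3.0, size=(6, 4))

    # normalize column-wise, then invert using the original array as the
    # reference for the statistics
    norm = normalize_ndarray(data)
    restored = denormalize_ndarray(norm, o=data)
    assert np.allclose(restored, data), 'round trip should recover the input'

    assert flatten([[1, 2], [3]]) == [1, 2, 3]
    assert list(chunker(list(range(5)), 2)) == [[0, 1], [2, 3], [4]]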
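
# Hedged pipeline sketch (an illustrative addition, not original code): feeds
# dataframe_to_dataset_biomes a synthetic dataframe that mirrors the column
# layout implied by constants.py (INPUTS, OUTPUT, SEASONS, MIN_YEAR, MAX_YEAR);
# real data would come from the project's preprocessing. Assumes INPUTS names
# the three static columns the builder expects and that TF eager execution is
# available for iterating over the dataset.
def _demo_biome_dataset():
    rng = np.random.RandomState(1)
    n = 16
    columns = {name: rng.rand(n) for name in INPUTS}
    for year in range(MIN_YEAR, MAX_YEAR + 1):
        for season in SEASONS:
            columns['temp_{}_{}'.format(season, year)] = rng.rand(n)
            columns['precip_{}_{}'.format(season, year)] = rng.rand(n)
    columns[OUTPUT] = rng.randint(0, 3, size=n)  # fabricated biome labels
    df = pd.DataFrame(columns)

    rows, n_cols, n_classes, weights, dataset = dataframe_to_dataset_biomes(df)
    dataset = dataset.shuffle(buffer_size=rows).batch(4)
    for features, labels in dataset.take(1):
        print(features.shape, labels.shape, n_classes, weights)


if __name__ == '__main__':
    _demo_normalization()
    _demo_biome_dataset()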