world-ecoregion/utils.py

102 lines
3.4 KiB
Python
Raw Normal View History

import numpy as np
import tensorflow as tf
import pandas as pd
from collections import Counter
from sklearn.utils import class_weight
from constants import *
import logging
import os
# Module-wide logger, registered under the name 'main'.
logger = logging.getLogger('main')
# Verbosity comes from the LOG_LEVEL environment variable; INFO when it is unset.
logger.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))
def normalize(v, o=None):
    """Standardize ``v`` to zero mean and unit variance (z-score).

    Args:
        v: NumPy array (or scalar) of numbers to standardize.
        o: Optional reference data whose mean and standard deviation drive
            the scaling. Defaults to ``v`` itself.

    Returns:
        ``(v - mean(o)) / std(o)``. When ``std(o)`` is zero (constant
        reference data) the values are only centered: ``v - mean(o)``.
    """
    if o is None:
        o = v
    std = np.std(o)
    # A constant reference has zero spread; dividing by it would fill the
    # result with NaN/inf, so fall back to a unit scale (centering only).
    if std == 0:
        std = 1.0
    return (v - np.mean(o)) / std
def normalize_ndarray(ar, o=None):
    """Standardize every column of a 2-D array to zero mean / unit variance.

    Args:
        ar: Array-like of shape (rows, columns) to standardize.
        o: Optional reference array with the same number of columns; its
            per-column mean and standard deviation are used for the scaling.
            Defaults to ``ar``.

    Returns:
        A new floating-point array shaped like ``ar``. The input is left
        untouched, so integer inputs are no longer truncated by writing
        the result back into them. Columns whose reference data is constant
        are only centered instead of producing NaN/inf.
    """
    ar = np.asarray(ar)
    o = ar if o is None else np.asarray(o)
    # axis=0 reduces over the rows, i.e. yields one statistic per column
    mean = np.mean(o, axis=0)
    std = np.std(o, axis=0)
    # zero spread would divide by zero; use a unit scale for those columns
    std = np.where(std == 0, 1.0, std)
    return (ar - mean) / std
def dataframe_to_dataset_biomes(df):
    """Build the biome-classification dataset from ``df``.

    For every year in ``MIN_YEAR..MAX_YEAR`` (inclusive) each row of ``df``
    contributes one sample: the ``INPUTS`` columns followed by that year's
    ``temp_<season>_<year>`` / ``precip_<season>_<year>`` columns for each
    entry of ``SEASONS``. The label of a sample is the row's ``OUTPUT``
    column. Inputs are standardized column-wise before being cast to
    ``tf.float32``; labels are cast to ``tf.int64``.

    Args:
        df: pandas DataFrame holding the columns described above.

    Returns:
        Tuple ``(sample_count, input_columns, num_classes, class_weights,
        dataset)`` where ``class_weights`` are scikit-learn's 'balanced'
        weights for the distinct labels and ``dataset`` is a
        ``tf.data.Dataset`` of ``(inputs, label)`` slices.
    """
    # 8 for seasonal temp and precipitation
    # 3 for latitude, elevation and distance_to_water
    input_columns = 11

    # Collect the per-year blocks and concatenate once at the end instead of
    # re-copying the growing arrays on every iteration. The empty float
    # seeds keep the resulting dtype/shape identical to the incremental
    # build, including when the year range is empty.
    input_chunks = [np.empty((0, input_columns))]
    output_chunks = [np.empty((0))]

    for year in range(MIN_YEAR, MAX_YEAR + 1):
        local_inputs = list(INPUTS)
        for season in SEASONS:
            local_inputs += [
                'temp_{}_{}'.format(season, year),
                'precip_{}_{}'.format(season, year)
            ]

        input_chunks.append(df[local_inputs].values)
        output_chunks.append(df[OUTPUT].values)

    tf_inputs = np.concatenate(input_chunks, axis=0)
    tf_output = np.concatenate(output_chunks, axis=0)

    # balance class weights for the loss function, since the data is highly unbalanced
    classes = np.unique(tf_output)
    num_classes = len(classes)
    # `classes` and `y` are keyword-only in current scikit-learn releases;
    # passing them by name works on old and new versions alike.
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=classes, y=tf_output)
    logger.debug('class_weights %s', class_weights)

    tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
    tf_output = tf.cast(tf_output, tf.int64)

    sample_count = int(tf_inputs.shape[0])
    logger.debug('dataset size: rows=%d, input_columns=%d, num_classes=%d', sample_count, input_columns, num_classes)

    dataset = tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))
    return sample_count, input_columns, num_classes, class_weights, dataset
def dataframe_to_dataset_temp_precip(df):
    """Build the temperature/precipitation regression dataset from ``df``.

    For every year in ``MIN_YEAR..MAX_YEAR`` (inclusive) and every entry of
    ``SEASONS`` each row of ``df`` contributes one sample: the ``INPUTS``
    columns plus a ``season`` column (position of the season divided by the
    number of seasons) and a ``year`` column. The targets are that row's
    ``temp_<season>_<year>`` and ``precip_<season>_<year>`` values. Inputs
    are standardized column-wise; both tensors are cast to ``tf.float32``.

    Args:
        df: pandas DataFrame holding the columns described above.

    Returns:
        Tuple ``(sample_count, input_columns, num_classes, dataset)`` where
        ``dataset`` is a ``tf.data.Dataset`` of ``(inputs, targets)`` slices.
    """
    # elevation, distance_to_water, latitude
    # season, year
    input_columns = 5
    num_classes = 2

    # Collect the blocks and concatenate once at the end; the empty float
    # seeds keep dtype/shape identical to an incremental build, including
    # when there is nothing to iterate over.
    input_chunks = [np.empty((0, input_columns))]
    output_chunks = [np.empty((0, num_classes))]

    local_inputs = list(INPUTS)
    season_count = len(SEASONS)

    for year in range(MIN_YEAR, MAX_YEAR + 1):
        for idx, season in enumerate(SEASONS):
            # Divide by the number of seasons (not by the length of the
            # season's name) so the index is evenly spaced in [0, 1).
            season_index = idx / season_count

            # assign() returns a new frame with the scalars broadcast to
            # every row, avoiding writes into a slice of `df`.
            local_df = df[local_inputs].assign(season=season_index, year=year)

            output = ['temp_{}_{}'.format(season, year), 'precip_{}_{}'.format(season, year)]
            input_chunks.append(local_df.values)
            output_chunks.append(df[output].values)

    tf_inputs = np.concatenate(input_chunks, axis=0)
    tf_output = np.concatenate(output_chunks, axis=0)

    tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
    tf_output = tf.cast(tf_output, tf.float32)

    sample_count = int(tf_inputs.shape[0])
    logger.debug('dataset size: rows=%d, input_columns=%d, num_classes=%d', sample_count, input_columns, num_classes)

    dataset = tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))
    return sample_count, input_columns, num_classes, dataset
def flatten(l):
    """Flatten one level of nesting.

    Args:
        l: Iterable of iterables.

    Returns:
        A list with the items of every sub-iterable, in order.
    """
    return [item for sublist in l for item in sublist]
def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    Args:
        seq: Any object supporting ``len()`` and slicing.
        size: Step between slice starts, which is also the slice length.

    Returns:
        A generator yielding ``seq[start:start + size]`` for each start
        offset ``0, size, 2 * size, ...`` below ``len(seq)``.
    """
    # Built up front, so invalid arguments fail at call time rather than on
    # first iteration.
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)