"""Turn a pandas DataFrame of per-location climate columns into tf.data datasets."""

import numpy as np
import pandas as pd
import tensorflow as tf

# MIN_YEAR, MAX_YEAR and SEASONS are used below but not defined in this file;
# they are expected to come from this star import.
from constants import *

# Year-independent feature columns shared by both dataset builders.
inputs = ['elevation', 'distance_to_water', 'latitude']
# Label column used by the biome classification dataset.
output = 'biome_num'


def normalize(v):
    """Return `v` shifted by its mean and divided by its standard deviation.

    Mean and standard deviation are taken over all of `v` with `np.mean` /
    `np.std`. A constant input has zero standard deviation, so the division
    is by zero (NaN for ndarrays).
    """
    return (v - np.mean(v)) / np.std(v)


def normalize_ndarray(ar):
    """Standardize each column of the 2-D array `ar` and return the result.

    The columns are rewritten through a transposed view, so when `ar` is an
    ndarray it is modified in place and the returned array shares its data.
    """
    tr = np.transpose(ar)
    for i in range(tr.shape[0]):
        tr[i] = normalize(tr[i])
    return np.transpose(tr)


def normalize_df(df):
    """Standardize every column of `df` in place and return `df`.

    Each column is normalized as a whole vector. (Assigning through
    `df.loc[col]` would address a *row* labelled `col`, and feeding a 1-D
    column to `normalize_ndarray` would normalize every scalar on its own.)
    """
    for col in df.columns:
        df[col] = normalize(df[col])
    return df


def dataframe_to_dataset_biomes(df):
    """Build a dataset mapping climate features to the biome label.

    Rows of under-represented biomes are duplicated a whole number of times
    so that every biome approaches the row count of the most frequent one.
    Then, for every year in MIN_YEAR..MAX_YEAR, one example per row is
    emitted: the columns in `inputs` followed by that year's
    `temp_<season>_<year>` / `precip_<season>_<year>` columns.

    Returns:
        A tuple `(n_examples, 11, 14, dataset)` where `dataset` yields
        `(float32 features, int64 biome_num)` pairs. The features are
        standardized per column. NOTE(review): 14 is presumably the number
        of biome classes -- confirm against the caller.
    """
    # 8 for seasonal temp and precipitation
    # 3 for latitude, elevation and distance_to_water
    columns = 11

    # make biomes uniformly distributed so each biome has enough data to
    # avoid a biased dataset
    # Work with integer row counts keyed by the biome label itself: this
    # avoids both positional indexing by label and the float round trip
    # count -> share -> int(share * size), which can truncate one too low.
    biome_counts = df.groupby('biome_num').size()
    max_count = biome_counts.max()
    pieces = [df]
    for biome_num, count in biome_counts.items():
        if count == 0:
            # Nothing to duplicate (e.g. an unobserved category).
            continue
        biome_rows = df.loc[df['biome_num'] == biome_num]
        extra_copies = int((max_count - count) // count)
        pieces.extend([biome_rows] * extra_copies)
    df = pd.concat(pieces, ignore_index=True)
    # print(df.groupby(['biome_num']).agg({ 'biome_num': lambda x: x.count() / df.shape[0] }))

    # The empty seed arrays keep the float64 promotion and make a
    # column-count mismatch fail loudly in np.concatenate.
    input_chunks = [np.empty((0, columns))]
    output_chunks = [np.empty((0))]
    for year in range(MIN_YEAR, MAX_YEAR + 1):
        local_inputs = list(inputs)
        for season in SEASONS:
            local_inputs += [
                'temp_{}_{}'.format(season, year),
                'precip_{}_{}'.format(season, year)
            ]
        input_chunks.append(df[local_inputs].values)
        output_chunks.append(df[output].values)

    # Concatenate once instead of re-copying the accumulated array per year.
    tf_inputs = np.concatenate(input_chunks, axis=0)
    tf_output = np.concatenate(output_chunks, axis=0)

    tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
    tf_output = tf.cast(tf_output, tf.int64)
    dataset = tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))
    return int(tf_inputs.shape[0]), columns, 14, dataset


def dataframe_to_dataset_temp_precip(df):
    """Build a dataset mapping location, season and year to temp / precip.

    For every year in MIN_YEAR..MAX_YEAR and every season in SEASONS, one
    example per row is emitted: the columns in `inputs` plus a fractional
    season index and the year, with that season's `temp_<season>_<year>`
    and `precip_<season>_<year>` columns as the two targets.

    Returns:
        A tuple `(n_examples, 5, 2, dataset)` where `dataset` yields
        `(float32 features, float32 targets)` pairs. The features are
        standardized per column.
    """
    # elevation, distance_to_water, latitude
    # season, year
    columns = 5
    target_columns = 2

    input_chunks = [np.empty((0, columns))]
    target_chunks = [np.empty((0, target_columns))]
    for year in range(MIN_YEAR, MAX_YEAR + 1):
        for idx, season in enumerate(SEASONS):
            # Position of the season within the year, in [0, 1). Divide by
            # the number of seasons, not the length of the season's name.
            season_index = idx / len(SEASONS)
            # Explicit copy so the added columns never write into `df`.
            local_df = df[list(inputs)].copy()
            local_df['season'] = season_index
            local_df['year'] = year
            targets = [
                'temp_{}_{}'.format(season, year),
                'precip_{}_{}'.format(season, year)
            ]
            input_chunks.append(local_df.values)
            target_chunks.append(df[targets].values)

    # Concatenate once instead of re-copying the accumulated array per step.
    tf_inputs = np.concatenate(input_chunks, axis=0)
    tf_output = np.concatenate(target_chunks, axis=0)

    tf_inputs = tf.cast(normalize_ndarray(tf_inputs), tf.float32)
    tf_output = tf.cast(tf_output, tf.float32)
    dataset = tf.data.Dataset.from_tensor_slices((tf_inputs, tf_output))
    return int(tf_inputs.shape[0]), columns, target_columns, dataset


# df = pd.read_pickle('data.p')
# print(dataframe_to_dataset_biomes(df))