From caa1b0443c037fec89ae90d85630eb93193d8208 Mon Sep 17 00:00:00 2001 From: Mahdi Dibaiee Date: Fri, 8 Feb 2019 18:14:57 +0330 Subject: [PATCH] fix(data.py): optimize for optimal performance and generate data --- .gitignore | 1 + data.py | 102 +++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 73 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index 60b197a..0361c94 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ geodata +*.p #### joe made this: http://goel.io/joe #### python #### # Byte-compiled / optimized / DLL files diff --git a/data.py b/data.py index 6001b0e..f6feaa4 100644 --- a/data.py +++ b/data.py @@ -2,16 +2,19 @@ import geopandas import os import rasterio import pandas as pd +import numpy as np +import time from matplotlib import pyplot from shapely.geometry import Point directory = os.path.dirname(os.path.abspath(__file__)) GEODATA = os.path.join(directory, 'geodata') -ECOREGIONS = os.path.join(GEODATA, 'ecoregions', 'Ecoregions2017.shp') +ECOREGIONS = os.path.join(GEODATA, 'ecoregions', 'single-parts.shp') ELEVATION = os.path.join(GEODATA, 'srtm', 'topo30-180.tif') TEMP = os.path.join(GEODATA, 'air_temp') PRECIP = os.path.join(GEODATA, 'precipitation') +YEAR = 2014 def read_temp_data(year): return pd.read_csv(os.path.join(TEMP, 'air_temp.{}'.format(year)), sep='\s+', header=None, @@ -30,45 +33,84 @@ def read_precip_data(year): 'december', 'yearly_avg']) eco = geopandas.read_file(ECOREGIONS) + elevation = rasterio.open(ELEVATION) +elevation_data = elevation.read(1) -# world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) -# world['geometry'] = world['geometry'].unary_union +temp = read_temp_data(YEAR) -# world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) -# print(world.head()) -# world = world[['continent', 'geometry']] -# continents = world.dissolve(level=1) -# continents.plot(); +precip = read_precip_data(YEAR) +precip['yearly_avg'] = precip.mean(axis=1) -# print(eco.head()) -# print(elevation) +print('# Elevation') +print('bounds: left={} bottom={} top={} right={}'.format(elevation.bounds.left, elevation.bounds.bottom, elevation.bounds.top, elevation.bounds.right)) +print('min: {}, max: {}\n'.format(elevation_data.min(), elevation_data.max())) -# eco['geometry'].boundary.plot() -# eco.dissolve() +print('# Temperature ({})'.format(YEAR)) +print('Yearly average min: {}, max: {}\n'.format(temp.yearly_avg.min(), temp.yearly_avg.max())) + +print('# Precipitation ({})'.format(YEAR)) +print('Yearly average min: {}, max: {}\n'.format(precip.yearly_avg.min(), precip.yearly_avg.max())) + +columns = ['biome_num', 'biome_name', 'elevation', 'temp_yearly_avg', 'precip_yearly_avg'] +indices = ['longitude', 'latitude'] +final_data = pd.DataFrame(index=indices, columns=columns) -# eco.plot() -# # rasterio.plot.show(src) -# # pyplot.imshow(elevation.read(1)) -# 51.42 -# 35.69 -# tehran = eco.geometry.contains() def get_point_information(longitude, latitude): + start_time = time.time() p = Point(longitude, latitude) + # print('({},{})'.format(longitude, latitude)) ecoregion = eco.loc[lambda c: c.geometry.contains(p)] + print("er%ss" % (time.time() - start_time)) + if ecoregion.empty: + return False + start_time = time.time() + elev = elevation_data[elevation.index(longitude, latitude)] + start_time = time.time() + t = np.argmin(np.array((temp.longitude - longitude)**2 + (temp.latitude - latitude)**2)) + start_time = time.time() + p = np.argmin(np.array((precip.longitude - longitude)**2 + (precip.latitude - latitude)**2)) + return { - 'biome_num': ecoregion.loc['BIOME_NUM'].iloc[0], - 'biome_name': ecoregion.loc['BIOME_NAME'].iloc[0], + 'biome_num': ecoregion.BIOME_NUM.iloc[0], + 'biome_name': ecoregion.BIOME_NAME.iloc[0], + 'elevation': elev, + 'temp_yearly_avg': temp.iloc[t, 2:].yearly_avg, + 'precip_yearly_avg': precip.iloc[p, 2:].yearly_avg } -import time + + +data_indices = [] + +data_map = {} +for col in columns: + data_map[col] = {} + +i = 0 + start_time = time.time() -print('Before call') -print('Tehran', get_point_information(51.42, 35.69)) -print("--- %s seconds ---" % (time.time() - start_time)) + +for longitude in range(-179, 179): + print('-', end='') + for latitude in range(-89, 89): + # generate data and save to file + d = get_point_information(longitude, latitude) + if d == False: + print('.', end='') + continue + + for key, value in d.items(): + data_map[key][(longitude, latitude)] = value + + print('+', end='') + print('') + +print("--- Calculations: %s seconds ---" % (time.time() - start_time)) + start_time = time.time() -print('Amazon', get_point_information(-59.78, -5.5)) -print("--- %s seconds ---" % (time.time() - start_time)) -# print(eco.geometry) -# print(tehran.distance(world.boundary)) -# world.boundary.plot() -pyplot.show() +df = pd.DataFrame(data_map) +print("--- Generating DataFrame: %s seconds ---" % (time.time() - start_time)) +print(df.head()) +start_time = time.time() +df.to_pickle('data.p') +print("--- Pickling DataFrame: %s seconds ---" % (time.time() - start_time))