fix(data.py): optimize lookup performance and generate data

This commit is contained in:
Mahdi Dibaiee 2019-02-08 18:14:57 +03:30
parent 902be97332
commit caa1b0443c
2 changed files with 73 additions and 30 deletions

1
.gitignore vendored
View File

@ -1,4 +1,5 @@
geodata geodata
*.p
#### joe made this: http://goel.io/joe #### joe made this: http://goel.io/joe
#### python #### #### python ####
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files

102
data.py
View File

@ -2,16 +2,19 @@ import geopandas
import os import os
import rasterio import rasterio
import pandas as pd import pandas as pd
import numpy as np
import time
from matplotlib import pyplot from matplotlib import pyplot
from shapely.geometry import Point from shapely.geometry import Point
directory = os.path.dirname(os.path.abspath(__file__)) directory = os.path.dirname(os.path.abspath(__file__))
GEODATA = os.path.join(directory, 'geodata') GEODATA = os.path.join(directory, 'geodata')
ECOREGIONS = os.path.join(GEODATA, 'ecoregions', 'Ecoregions2017.shp') ECOREGIONS = os.path.join(GEODATA, 'ecoregions', 'single-parts.shp')
ELEVATION = os.path.join(GEODATA, 'srtm', 'topo30-180.tif') ELEVATION = os.path.join(GEODATA, 'srtm', 'topo30-180.tif')
TEMP = os.path.join(GEODATA, 'air_temp') TEMP = os.path.join(GEODATA, 'air_temp')
PRECIP = os.path.join(GEODATA, 'precipitation') PRECIP = os.path.join(GEODATA, 'precipitation')
YEAR = 2014
def read_temp_data(year): def read_temp_data(year):
return pd.read_csv(os.path.join(TEMP, 'air_temp.{}'.format(year)), sep='\s+', header=None, return pd.read_csv(os.path.join(TEMP, 'air_temp.{}'.format(year)), sep='\s+', header=None,
@ -30,45 +33,84 @@ def read_precip_data(year):
'december', 'yearly_avg']) 'december', 'yearly_avg'])
eco = geopandas.read_file(ECOREGIONS) eco = geopandas.read_file(ECOREGIONS)
elevation = rasterio.open(ELEVATION) elevation = rasterio.open(ELEVATION)
elevation_data = elevation.read(1)
# world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) temp = read_temp_data(YEAR)
# world['geometry'] = world['geometry'].unary_union
# world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres')) precip = read_precip_data(YEAR)
# print(world.head()) precip['yearly_avg'] = precip.mean(axis=1)
# world = world[['continent', 'geometry']]
# continents = world.dissolve(level=1)
# continents.plot();
# print(eco.head()) print('# Elevation')
# print(elevation) print('bounds: left={} bottom={} top={} right={}'.format(elevation.bounds.left, elevation.bounds.bottom, elevation.bounds.top, elevation.bounds.right))
print('min: {}, max: {}\n'.format(elevation_data.min(), elevation_data.max()))
# eco['geometry'].boundary.plot() print('# Temperature ({})'.format(YEAR))
# eco.dissolve() print('Yearly average min: {}, max: {}\n'.format(temp.yearly_avg.min(), temp.yearly_avg.max()))
print('# Precipitation ({})'.format(YEAR))
print('Yearly average min: {}, max: {}\n'.format(precip.yearly_avg.min(), precip.yearly_avg.max()))
columns = ['biome_num', 'biome_name', 'elevation', 'temp_yearly_avg', 'precip_yearly_avg']
indices = ['longitude', 'latitude']
final_data = pd.DataFrame(index=indices, columns=columns)
# eco.plot()
# # rasterio.plot.show(src)
# # pyplot.imshow(elevation.read(1))
# 51.42
# 35.69
# tehran = eco.geometry.contains()
def get_point_information(longitude, latitude): def get_point_information(longitude, latitude):
start_time = time.time()
p = Point(longitude, latitude) p = Point(longitude, latitude)
# print('({},{})'.format(longitude, latitude))
ecoregion = eco.loc[lambda c: c.geometry.contains(p)] ecoregion = eco.loc[lambda c: c.geometry.contains(p)]
print("er%ss" % (time.time() - start_time))
if ecoregion.empty:
return False
start_time = time.time()
elev = elevation_data[elevation.index(longitude, latitude)]
start_time = time.time()
t = np.argmin(np.array((temp.longitude - longitude)**2 + (temp.latitude - latitude)**2))
start_time = time.time()
p = np.argmin(np.array((precip.longitude - longitude)**2 + (precip.latitude - latitude)**2))
return { return {
'biome_num': ecoregion.loc['BIOME_NUM'].iloc[0], 'biome_num': ecoregion.BIOME_NUM.iloc[0],
'biome_name': ecoregion.loc['BIOME_NAME'].iloc[0], 'biome_name': ecoregion.BIOME_NAME.iloc[0],
'elevation': elev,
'temp_yearly_avg': temp.iloc[t, 2:].yearly_avg,
'precip_yearly_avg': precip.iloc[p, 2:].yearly_avg
} }
import time
data_indices = []
data_map = {}
for col in columns:
data_map[col] = {}
i = 0
start_time = time.time() start_time = time.time()
print('Before call')
print('Tehran', get_point_information(51.42, 35.69)) for longitude in range(-179, 179):
print("--- %s seconds ---" % (time.time() - start_time)) print('-', end='')
for latitude in range(-89, 89):
# generate data and save to file
d = get_point_information(longitude, latitude)
if d == False:
print('.', end='')
continue
for key, value in d.items():
data_map[key][(longitude, latitude)] = value
print('+', end='')
print('')
print("--- Calculations: %s seconds ---" % (time.time() - start_time))
start_time = time.time() start_time = time.time()
print('Amazon', get_point_information(-59.78, -5.5)) df = pd.DataFrame(data_map)
print("--- %s seconds ---" % (time.time() - start_time)) print("--- Generating DataFrame: %s seconds ---" % (time.time() - start_time))
# print(eco.geometry) print(df.head())
# print(tehran.distance(world.boundary)) start_time = time.time()
# world.boundary.plot() df.to_pickle('data.p')
pyplot.show() print("--- Pickling DataFrame: %s seconds ---" % (time.time() - start_time))