fix(data.py): optimize performance and generate data

This commit is contained in:
Mahdi Dibaiee 2019-02-08 18:14:57 +03:30
parent 902be97332
commit caa1b0443c
2 changed files with 73 additions and 30 deletions

1
.gitignore vendored
View File

@ -1,4 +1,5 @@
geodata
*.p
#### joe made this: http://goel.io/joe
#### python ####
# Byte-compiled / optimized / DLL files

102
data.py
View File

@ -2,16 +2,19 @@ import geopandas
import os
import rasterio
import pandas as pd
import numpy as np
import time
from matplotlib import pyplot
from shapely.geometry import Point
# Directory containing this script; every dataset path below is built from it.
directory = os.path.dirname(os.path.abspath(__file__))
GEODATA = os.path.join(directory, 'geodata')
ECOREGIONS = os.path.join(GEODATA, 'ecoregions', 'Ecoregions2017.shp')
# NOTE(review): this second assignment overrides the one above, so only
# single-parts.shp is ever read.
ECOREGIONS = os.path.join(GEODATA, 'ecoregions', 'single-parts.shp')
ELEVATION = os.path.join(GEODATA, 'srtm', 'topo30-180.tif')
TEMP = os.path.join(GEODATA, 'air_temp')
PRECIP = os.path.join(GEODATA, 'precipitation')
# Year passed to read_temp_data / read_precip_data when loading the tables.
YEAR = 2014
def read_temp_data(year):
return pd.read_csv(os.path.join(TEMP, 'air_temp.{}'.format(year)), sep='\s+', header=None,
@ -30,45 +33,84 @@ def read_precip_data(year):
'december', 'yearly_avg'])
# Load the ecoregion shapes and open the elevation raster. Band 1 is read into
# an array up front so get_point_information can index it per point.
eco = geopandas.read_file(ECOREGIONS)
elevation = rasterio.open(ELEVATION)
elevation_data = elevation.read(1)
# world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
# world['geometry'] = world['geometry'].unary_union
# Temperature table for YEAR.
temp = read_temp_data(YEAR)
# world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
# print(world.head())
# world = world[['continent', 'geometry']]
# continents = world.dissolve(level=1)
# continents.plot();
# Precipitation table for YEAR.
precip = read_precip_data(YEAR)
# NOTE(review): mean(axis=1) averages every numeric column of each row; if
# longitude/latitude are ordinary columns here they are included in the
# average — confirm this is intended.
precip['yearly_avg'] = precip.mean(axis=1)
# print(eco.head())
# print(elevation)
# Print summary statistics for each input dataset.
print('# Elevation')
print('bounds: left={} bottom={} top={} right={}'.format(elevation.bounds.left, elevation.bounds.bottom, elevation.bounds.top, elevation.bounds.right))
print('min: {}, max: {}\n'.format(elevation_data.min(), elevation_data.max()))
# eco['geometry'].boundary.plot()
# eco.dissolve()
print('# Temperature ({})'.format(YEAR))
print('Yearly average min: {}, max: {}\n'.format(temp.yearly_avg.min(), temp.yearly_avg.max()))
print('# Precipitation ({})'.format(YEAR))
print('Yearly average min: {}, max: {}\n'.format(precip.yearly_avg.min(), precip.yearly_avg.max()))
# Names of the fields produced for each sampled point; these match the keys
# returned by get_point_information.
columns = ['biome_num', 'biome_name', 'elevation', 'temp_yearly_avg', 'precip_yearly_avg']
indices = ['longitude', 'latitude']
# NOTE(review): final_data is not referenced again in the visible code, and
# index=indices creates two rows labelled 'longitude'/'latitude' rather than a
# coordinate index — confirm whether it is still needed.
final_data = pd.DataFrame(index=indices, columns=columns)
# eco.plot()
# # rasterio.plot.show(src)
# # pyplot.imshow(elevation.read(1))
# 51.42
# 35.69
# tehran = eco.geometry.contains()
def _nearest_row(frame, longitude, latitude):
    """Return the positional index of the row of *frame* closest to the point.

    Closeness is the squared difference of the raw longitude/latitude values
    (plain degree space, no geodesic correction).
    """
    squared_distance = (frame.longitude - longitude)**2 + (frame.latitude - latitude)**2
    return np.argmin(np.array(squared_distance))


def get_point_information(longitude, latitude):
    """Collect biome, elevation, temperature and precipitation for one point.

    Args:
        longitude: X coordinate of the point, in the units used by the
            ecoregion geometries and the elevation raster.
        latitude: Y coordinate of the point, in the same units.

    Returns:
        ``False`` when no ecoregion geometry contains the point. Otherwise a
        dict with the keys ``biome_num``, ``biome_name``, ``elevation``,
        ``temp_yearly_avg`` and ``precip_yearly_avg``. The biome fields come
        from the first matching ecoregion row; the climate fields come from
        the nearest row of the temperature / precipitation tables.
    """
    start_time = time.time()
    point = Point(longitude, latitude)
    ecoregion = eco.loc[lambda c: c.geometry.contains(point)]
    # Timing of the containment query, kept from the original for diagnostics.
    print("er%ss" % (time.time() - start_time))

    if ecoregion.empty:
        return False

    # elevation.index maps the coordinate to a (row, col) pair that is used
    # directly as an index into the pre-read band array.
    elev = elevation_data[elevation.index(longitude, latitude)]

    temp_row = _nearest_row(temp, longitude, latitude)
    precip_row = _nearest_row(precip, longitude, latitude)

    # BIOME_NUM / BIOME_NAME are columns, so they are read by attribute access;
    # a row-label lookup such as ecoregion.loc['BIOME_NUM'] would raise.
    return {
        'biome_num': ecoregion.BIOME_NUM.iloc[0],
        'biome_name': ecoregion.BIOME_NAME.iloc[0],
        'elevation': elev,
        'temp_yearly_avg': temp.iloc[temp_row, 2:].yearly_avg,
        'precip_yearly_avg': precip.iloc[precip_row, 2:].yearly_avg,
    }
import time

data_indices = []
# One dict per output column; the grid scan fills each with values keyed by
# (longitude, latitude).
data_map = {col: {} for col in columns}

i = 0

# Time a single lookup before running the full grid scan.
start_time = time.time()
print('Before call')
print('Tehran', get_point_information(51.42, 35.69))
print("--- %s seconds ---" % (time.time() - start_time))
# Sample whole-degree grid points and collect each field into data_map, keyed
# by (longitude, latitude). Progress output: '-' starts a longitude row,
# '.' marks a point with no ecoregion, '+' marks a stored point.
# NOTE(review): range() excludes its end, so longitude 179 and latitude 89 are
# never sampled — confirm this is intended.
for longitude in range(-179, 179):
    print('-', end='')
    for latitude in range(-89, 89):
        d = get_point_information(longitude, latitude)
        # get_point_information returns the literal False when the point is
        # outside every ecoregion.
        if d is False:
            print('.', end='')
            continue
        for key, value in d.items():
            data_map[key][(longitude, latitude)] = value
        print('+', end='')
    # NOTE(review): indentation was lost in this view; the newline is assumed
    # to end each longitude row — confirm against the real file.
    print('')
# NOTE(review): start_time was last reset before the 'Tehran' lookup, so this
# figure covers that lookup as well as the grid scan.
print("--- Calculations: %s seconds ---" % (time.time() - start_time))
# Time a second single-point lookup.
start_time = time.time()
print('Amazon', get_point_information(-59.78, -5.5))
print("--- %s seconds ---" % (time.time() - start_time))
# print(eco.geometry)
# print(tehran.distance(world.boundary))
# world.boundary.plot()
pyplot.show()
# Build the result table from data_map: one column per field, with rows
# labelled by the (longitude, latitude) keys.
df = pd.DataFrame(data_map)
# NOTE(review): start_time was last reset before the 'Amazon' lookup, so this
# figure also includes that lookup and pyplot.show().
print("--- Generating DataFrame: %s seconds ---" % (time.time() - start_time))
print(df.head())
start_time = time.time()
# Written to a relative path, i.e. the current working directory; the *.p
# pattern in .gitignore keeps it out of version control.
df.to_pickle('data.p')
print("--- Pickling DataFrame: %s seconds ---" % (time.time() - start_time))