Source code for poets.io.source_base

# Copyright (c) 2014, Vienna University of Technology (TU Wien), Department
# of Geodesy and Geoinformation (GEO).
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the Vienna University of Technology - Department of
#   Geodesy and Geoinformation nor the names of its contributors may be used to
#   endorse or promote products derived from this software without specific
#   prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL VIENNA UNIVERSITY OF TECHNOLOGY,
# DEPARTMENT OF GEODESY AND GEOINFORMATION BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Author: Thomas Mistelbauer Thomas.Mistelbauer@geo.tuwien.ac.at
# Creation date: 2014-06-30

import os
import sys
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from netCDF4 import Dataset, num2date, date2num

import poets.image.netcdf as nt
import poets.timedate.dateindex as dt
from poets.grid.grids import ShapeGrid, RegularGrid
from poets.image.netcdf import get_properties
from poets.image.resampling import resample_to_shape, average_layers
from poets.io.download import download_http, download_ftp, download_sftp, \
    get_file_date
from poets.timedate.dekad import check_dekad


class BasicSource(object):
    """Base class for data sources.

    Parameters
    ----------
    name : str
        Name of the data source.
    filename : str
        Structure/convention of the file name.
    filedate : dict
        Position of date fields in filename, given as tuple.
    temp_res : str
        Temporal resolution of the source.
    rootpath : str
        Root path where all data will be stored.
    host : str
        Link to data host.
    protocol : str
        Protocol for data transfer ('HTTP', 'FTP' or 'SFTP', matched
        case-insensitively by `download`).
    username : str, optional
        Username for data access.
    password : str, optional
        Password for data access.
    port : int, optional
        Port to data host, defaults to 22.
    directory : str, optional
        Path to data on host.
    dirstruct : list of strings, optional
        Structure of source directory, each list item represents a
        subdirectory.
    begin_date : datetime, optional
        Date from which on data is available, defaults to 2000-01-01.
    variables : string or list of strings, optional
        Variables used from data source. A single string is wrapped into a
        one-item list. Defaults to None, in which case `read_ts` and
        `read_img` fall back to the variables found in the NetCDF file.
    nan_value : int, float, optional
        NaN value of the original data as given by the data provider.
    dest_nan_value : int, float, optional
        NaN value in the final NetCDF file, defaults to -99.
    dest_regions : list of str, optional
        Regions of interest where data should be resampled to.
    dest_sp_res : int, float, optional
        Spatial resolution of the destination NetCDF file, defaults to 0.25
        degree.
    dest_temp_res : string, optional
        Temporal resolution of the destination NetCDF file, possible values:
        ('month', 'dekad'), defaults to dekad.
    dest_start_date : datetime, optional
        Start date of the destination NetCDF file, defaults to 2000-01-01.

    Attributes
    ----------
    name : str
        Name of the data source.
    filename : str
        Structure/convention of the file name.
    filedate : dict
        Position of date fields in filename, given as tuple.
    temp_res : str
        Temporal resolution of the source.
    host : str
        Link to data host, always ending with '/'.
    protocol : str
        Protocol for data transfer.
    username : str
        Username for data access.
    password : str
        Password for data access.
    port : int
        Port to data host.
    directory : str
        Path to data on host; a non-empty value always ends with '/'.
    dirstruct : list of strings
        Structure of source directory, each list item represents a
        subdirectory.
    begin_date : datetime
        Date from which on data is available.
    variables : list of strings
        Variables used from data source.
    nan_value : int, float
        NaN value of the original data as given by the data provider.
    dest_nan_value : int, float
        NaN value in the final NetCDF file.
    tmp_path : str
        Path where temporary files are stored (``<rootpath>/TMP``).
    rawdata_path : str
        Path where original files are stored
        (``<rootpath>/RAWDATA/<name>``).
    data_path : str
        Path where resampled NetCDF file is stored (``<rootpath>/DATA``).
    dest_regions : list of str
        Regions of interest where data is resampled to.
    dest_sp_res : int, float
        Spatial resolution of the destination NetCDF file.
    dest_temp_res : string
        Temporal resolution of the destination NetCDF file.
    dest_start_date : datetime
        Start date of the destination NetCDF file.
    """

    def __init__(self, name, filename, filedate, temp_res, rootpath,
                 host, protocol, username=None, password=None, port=22,
                 directory=None, dirstruct=None,
                 begin_date=datetime(2000, 1, 1), variables=None,
                 nan_value=None, dest_nan_value=-99, dest_regions=None,
                 dest_sp_res=0.25, dest_temp_res='dekad',
                 dest_start_date=datetime(2000, 1, 1)):

        self.name = name
        self.filename = filename
        self.filedate = filedate
        self.temp_res = temp_res
        self.host = host
        self.protocol = protocol
        self.username = username
        self.password = password
        self.port = port
        self.directory = directory
        self.dirstruct = dirstruct
        self.begin_date = begin_date

        # A single variable name is shorthand for a one-item list.
        if isinstance(variables, str):
            self.variables = [variables]
        else:
            self.variables = variables

        self.nan_value = nan_value
        self.dest_nan_value = dest_nan_value
        self.dest_regions = dest_regions
        self.dest_sp_res = dest_sp_res
        self.dest_temp_res = dest_temp_res
        self.dest_start_date = dest_start_date

        self.rawdata_path = os.path.join(rootpath, 'RAWDATA', name)
        self.tmp_path = os.path.join(rootpath, 'TMP')
        self.data_path = os.path.join(rootpath, 'DATA')

        # Normalise host and directory so they can be concatenated later.
        if not self.host.endswith('/'):
            self.host += '/'

        # An empty directory string is left alone instead of being indexed.
        if self.directory and not self.directory.endswith('/'):
            self.directory += '/'

    @staticmethod
    def _print_progress(text):
        """Writes `text` to stdout without a trailing newline and flushes.

        Stands in for the Python 2 only ``print text,`` statement so the
        progress output works on Python 2 and 3 alike.
        """
        sys.stdout.write(text)
        sys.stdout.flush()

    def _get_nc_filepath(self, region):
        """Creates the path of the destination NetCDF file of a region.

        Parameters
        ----------
        region : str
            Region identifier used as prefix of the file name.

        Returns
        -------
        str
            ``<data_path>/<region>_<dest_sp_res>_<dest_temp_res>.nc``
        """
        filename = (region + '_' + str(self.dest_sp_res) + '_'
                    + str(self.dest_temp_res) + '.nc')
        return os.path.join(self.data_path, filename)

    def _get_default_begin(self):
        """Gets the later one of `begin_date` and `dest_start_date`.

        Returns
        -------
        datetime
            `begin_date` if the destination file starts before the source
            provides data, `dest_start_date` otherwise.
        """
        if self.dest_start_date < self.begin_date:
            return self.begin_date
        return self.dest_start_date

    @staticmethod
    def _first_unmasked_date(nc, var, indices):
        """Finds the first time step in `indices` that holds data.

        Parameters
        ----------
        nc : netCDF4.Dataset
            Opened NetCDF file providing `var` and a 'time' variable.
        var : str
            Name of the variable to inspect.
        indices : iterable of int
            Time indices in the order in which they should be inspected.

        Returns
        -------
        datetime or None
            Date of the first layer that is not completely masked, None if
            every inspected layer is completely masked.
        """
        times = nc.variables['time']
        for i in indices:
            # getmaskarray also copes with layers that come back as plain
            # (unmasked) arrays; a layer is empty only if all cells are masked.
            if np.ma.getmaskarray(nc.variables[var][i]).all():
                continue
            return num2date(times[i], units=times.units,
                            calendar=times.calendar)
        return None

    def _check_current_date(self, begin=True, end=True):
        """Helper method that checks the current date of individual variables
        in the netCDF data file.

        Parameters
        ----------
        begin : bool, optional
            If set False, the begin date is not searched and returned as
            None.
        end : bool, optional
            If set False, the end date is not searched and returned as None.

        Returns
        -------
        dates : dict of dicts
            ``dates[region][variable]`` is a list ``[begin, end]`` holding
            the dates of the first and last time step with data (None where
            not requested or not available). None is returned instead of the
            dict as soon as the NetCDF file of one region does not exist.
        """
        dates = {}

        for region in self.dest_regions:
            nc_name = self._get_nc_filepath(region)

            if not os.path.exists(nc_name):
                dates = None
                break

            dates[region] = {}
            variables = self.get_variables()

            with Dataset(nc_name, 'r', format='NETCDF4') as nc:
                n_steps = nc.variables['time'].size
                for var in variables:
                    first = None
                    last = None
                    if begin:
                        # Scan every time step, including the last one.
                        first = self._first_unmasked_date(
                            nc, var, range(n_steps))
                    if end:
                        last = self._first_unmasked_date(
                            nc, var, range(n_steps - 1, -1, -1))
                    dates[region][var] = [first, last]

        return dates

    def _get_download_date(self):
        """Gets the date from which to start the data download.

        Returns
        -------
        begin : datetime
            Date from which to start the data download. This is `begin_date`
            if not every region has a NetCDF file yet. Otherwise it is
            derived from the last date with data of each variable (plus one
            day), falling back to the later one of `begin_date` and
            `dest_start_date` for variables without data and to
            `dest_start_date` if the file holds no variables of this source.
        """
        dates = self._check_current_date(begin=False)

        if dates is None:
            return self.begin_date

        begin = datetime.now()

        for region in self.dest_regions:
            variables = self.get_variables()
            if not variables:
                begin = self.dest_start_date
                continue
            for var in variables:
                last = dates[region][var][1]
                if last is None:
                    begin = self._get_default_begin()
                elif last < begin:
                    begin = last + timedelta(days=1)

        return begin

    def _get_tmp_filepath(self, prefix, region):
        """Creates path to a temporary file.

        Parameters
        ----------
        prefix : str
            Tag placed in front of the region in the file name.
        region : str
            Region identifier.

        Returns
        -------
        str
            ``<tmp_path>/_<prefix>_<region>_<dest_sp_res>_<dest_temp_res>.nc``
        """
        filename = ('_' + prefix + '_' + region + '_' + str(self.dest_sp_res)
                    + '_' + str(self.dest_temp_res) + '.nc')
        return os.path.join(self.tmp_path, filename)

    def _resample_spatial(self, region, begin, end, delete_rawdata,
                          shapefile=None):
        """Helper method that calls spatial resampling routines.

        Every file in `rawdata_path` whose file date lies within
        [begin, end] is resampled. If source and destination temporal
        resolution are equal the image is saved directly into the
        destination NetCDF file, otherwise it is written to a temporary
        file that `_resample_temporal` picks up.

        Parameters
        ----------
        region : str
            FIPS country code
            (https://en.wikipedia.org/wiki/FIPS_country_code)
        begin : datetime
            Start date of resampling, no lower limit if None.
        end : datetime
            End date of resampling, no upper limit if None.
        delete_rawdata : bool
            True if original downloaded files should be deleted after
            resampling. Not evaluated here; `resample` does the cleanup.
        shapefile : str, optional
            Path to shape file, passed on to the resampling and writing
            routines.
        """
        # Only used if an intermediate file is needed.
        dest_file = self._get_tmp_filepath('spatial', region)
        same_resolution = self.temp_res == self.dest_temp_res

        for item in sorted(os.listdir(self.rawdata_path)):
            src_file = os.path.join(self.rawdata_path, item)
            fdate = get_file_date(item, self.filedate)

            if begin is not None and fdate < begin:
                continue
            if end is not None and fdate > end:
                continue

            self._print_progress('. ')

            image, _, _, _, timestamp, metadata = \
                resample_to_shape(src_file, region, self.dest_sp_res,
                                  self.name, self.nan_value,
                                  self.dest_nan_value, self.variables,
                                  shapefile)

            # Fall back to the date encoded in the file name.
            if timestamp is None:
                timestamp = fdate

            if same_resolution:
                nt.save_image(image, timestamp, region, metadata,
                              self._get_nc_filepath(region),
                              self.dest_start_date, self.dest_sp_res,
                              self.dest_nan_value, shapefile,
                              self.dest_temp_res)
            else:
                nt.write_tmp_file(image, timestamp, region, metadata,
                                  dest_file, self.dest_start_date,
                                  self.dest_sp_res, self.dest_nan_value,
                                  shapefile)

        print('')

    def _resample_temporal(self, region, shapefile=None):
        """Helper method that calls temporal resampling routines.

        Reads the temporary file written by `_resample_spatial`, averages
        the layers of each destination period and saves the result into the
        destination NetCDF file. The temporary file is deleted afterwards.

        Parameters
        ----------
        region : str
            Identifier of the region in the shapefile. If the default
            shapefile is used, this would be the FIPS country code.
        shapefile : str, optional
            Path to shape file, uses "world country admin boundary shapefile"
            by default.

        Returns
        -------
        bool or None
            False if no temporary file exists for the region, None otherwise.
        """
        src_file = self._get_tmp_filepath('spatial', region)

        if not os.path.exists(src_file):
            print('[Info] No data available for this period')
            return False

        variables, _, period = nt.get_properties(src_file)
        dtindex = dt.get_dtindex(self.dest_temp_res, period[0], period[1])
        dest_file = self._get_nc_filepath(region)

        for date in dtindex:
            # Skip periods that end after the last available source date.
            if date > period[1]:
                continue
            print(date)

            if self.dest_temp_res == 'dekad':
                if date.day < 21:
                    # Dekads ending on day 10/20 start on day 1/11.
                    begin = datetime(date.year, date.month,
                                     date.day - 10 + 1)
                else:
                    begin = datetime(date.year, date.month, 21)
            else:
                begin = period[0]
            end = date

            data = {}
            metadata = {}

            for var in variables:
                img, _, _, meta = nt.read_image(src_file, var, begin, end)
                metadata[var] = meta
                data[var] = average_layers(img, self.dest_nan_value)

            nt.save_image(data, date, region, metadata, dest_file,
                          self.dest_start_date, self.dest_sp_res,
                          self.dest_nan_value, shapefile,
                          self.dest_temp_res)

        # delete intermediate netCDF file
        print('')
        os.unlink(src_file)

    def download(self, download_path=None, begin=None, end=None):
        """Download data.

        Parameters
        ----------
        download_path : str, optional
            Currently not evaluated; files are stored in `rawdata_path`.
        begin : datetime, optional
            Start date of download, defaults to the later one of
            `begin_date` and `dest_start_date`.
        end : datetime, optional
            End date of download, defaults to None.

        Returns
        -------
        check
            Return value of the protocol specific download function.

        Raises
        ------
        ValueError
            If `protocol` is none of HTTP, FTP or SFTP.
        """
        if begin is None:
            begin = self._get_default_begin()

        protocol = str(self.protocol).lower()

        if protocol == 'http':
            check = download_http(self.rawdata_path, self.host,
                                  self.directory, self.filename,
                                  self.filedate, self.dirstruct, begin,
                                  end=end)
        elif protocol == 'ftp':
            check = download_ftp(self.rawdata_path, self.host,
                                 self.directory, self.port, self.username,
                                 self.password, self.filedate,
                                 self.dirstruct, begin, end=end)
        elif protocol == 'sftp':
            check = download_sftp(self.rawdata_path, self.host,
                                  self.directory, self.port, self.username,
                                  self.password, self.filedate,
                                  self.dirstruct, begin, end=end)
        else:
            # Previously this fell through to an unbound local variable.
            raise ValueError('Unsupported protocol %r, expected one of '
                             'HTTP, FTP, SFTP' % (self.protocol,))

        return check

    def resample(self, begin=None, end=None, delete_rawdata=False,
                 shapefile=None):
        """Resamples source data to given spatial and temporal resolution.

        Writes resampled images into a netCDF data file. Deletes original
        files if flag delete_rawdata is set True.

        Parameters
        ----------
        begin : datetime
            Start date of resampling.
        end : datetime
            End date of resampling.
        delete_rawdata : bool
            Original files will be deleted from rawdata_path if set 'True'.
        shapefile : str, optional
            Path to shape file, uses "world country admin boundary shapefile"
            by default.
        """
        for region in self.dest_regions:
            print('[INFO] resampling to region ' + region)
            self._print_progress('[INFO] performing spatial resampling ')

            self._resample_spatial(region, begin, end, delete_rawdata,
                                   shapefile)

            if self.temp_res == self.dest_temp_res:
                print('[INFO] skipping temporal resampling')
            else:
                self._print_progress(
                    '[INFO] performing temporal resampling ')
                self._resample_temporal(region, shapefile)

        # Raw files are shared by all regions, so clean up only at the end.
        if delete_rawdata:
            print('[INFO] Cleaning up rawdata')
            for item in sorted(os.listdir(self.rawdata_path)):
                os.unlink(os.path.join(self.rawdata_path, item))

    def download_and_resample(self, download_path=None, begin=None, end=None,
                              delete_rawdata=False, shapefile=None):
        """Downloads and resamples data.

        The time span is processed period by period (as given by
        `dest_temp_res`); each period is downloaded and, if the download
        reports success, resampled.

        Parameters
        ----------
        download_path : str
            Path where to save the downloaded files.
        begin : datetime.date, optional
            set either to first date of remote repository or date of
            last file in local repository
        end : datetime.date, optional
            set to today if none given
        delete_rawdata : bool, optional
            Original files will be deleted from rawdata_path if set True
        shapefile : str, optional
            Path to shape file, uses "world country admin boundary shapefile"
            by default.

        Returns
        -------
        str or None
            '[INFO] everything up to date' if there is nothing to do, None
            otherwise.
        """
        if begin is None:
            begin = self._get_default_begin()

        # Query once: the lookup opens the NetCDF files of all regions.
        download_date = self._get_download_date()
        if begin < download_date:
            begin = download_date

        if end is None:
            end = datetime.now()

        if begin > end:
            print('[INFO] everything up to date')
            return '[INFO] everything up to date'

        drange = dt.get_dtindex(self.dest_temp_res, begin, end)

        for i, date in enumerate(drange):
            if date > end:
                continue

            if i == 0:
                start = begin
            elif self.dest_temp_res == 'dekad':
                # A dekad starts the day after the previous one ended.
                start = drange[i - 1] + timedelta(days=1)
            else:
                start = date
            stop = date

            filecheck = self.download(download_path, start, stop)
            if filecheck is True:
                self.resample(start, stop, delete_rawdata, shapefile)
            else:
                print('[WARNING] no data available for this date')

    def read_ts(self, location, region=None, variable=None):
        """Gets timeseries from netCDF file for a gridpoint.

        Parameters
        ----------
        location : int or tuple of floats
            Either Grid point index as integer value or Longitude/Latitude
            given as tuple (a list is accepted as well).
        region : str, optional
            Region of interest, set to first defined region if not set.
        variable : str, optional
            Variable to display, selects all available variables if None.

        Returns
        -------
        df : pd.DataFrame
            Timeseries for selected variables, indexed by the dates of the
            NetCDF file. Time steps outside the range in which a variable
            holds data, as well as masked values, are NaN.

        Raises
        ------
        TypeError
            If `location` is neither an integer nor a tuple/list.
        """
        if region is None:
            region = self.dest_regions[0]

        if isinstance(location, (int, np.integer)):
            gp = location
        elif isinstance(location, (tuple, list)):
            if region == 'global':
                grid = RegularGrid(self.dest_sp_res)
            else:
                grid = ShapeGrid(region, self.dest_sp_res)
            gp, _ = grid.find_nearest_gpi(location[0], location[1])
        else:
            raise TypeError('location must be a grid point index (int) or '
                            'a (lon, lat) tuple, got %r' % (location,))

        if variable is None:
            if self.variables is None:
                variable = self.get_variables()
            else:
                variable = self.variables
        else:
            variable = [variable]

        source_file = self._get_nc_filepath(region)
        var_dates = self._check_current_date()

        with Dataset(source_file, 'r', format='NETCDF4') as nc:
            time = nc.variables['time']
            dates = num2date(time[:], units=time.units,
                             calendar=time.calendar)
            position = np.where(nc.variables['gpi'][:] == gp)
            lat_pos = position[0][0]
            lon_pos = position[1][0]
            df = pd.DataFrame(index=pd.DatetimeIndex(dates))

            for var in variable:
                # Variables are stored as SOURCE_variable in the file.
                if self.name not in var:
                    ncvar = self.name + '_' + var
                else:
                    ncvar = var

                first, last = var_dates[region][ncvar]
                column = np.full(len(dates), np.nan)

                # A variable without any data keeps an all-NaN column.
                if first is not None and last is not None:
                    begin = int(np.where(dates == first)[0][0])
                    end = int(np.where(dates == last)[0][0])
                    # One slice read replaces the per-element chained
                    # assignment ``df[ncvar][i] = ...``.
                    values = nc.variables[ncvar][begin:end + 1,
                                                 lat_pos, lon_pos]
                    column[begin:end + 1] = np.ma.filled(
                        np.ma.asarray(values).astype(float), np.nan)

                df[ncvar] = column

        return df

    def read_img(self, date, region=None, variable=None):
        """Gets images from netCDF file for certain date.

        Parameters
        ----------
        date : datetime
            Date of the image; it is passed through `check_dekad` before
            the lookup.
        region : str, optional
            Region of interest, set to first defined region if not set.
        variable : str, optional
            Variable to display, selects first available variables if None.

        Returns
        -------
        img : numpy.ndarray
            Image of selected date.
        lon : numpy.array
            Array with longitudes.
        lat : numpy.array
            Array with latitudes.
        """
        if region is None:
            region = self.dest_regions[0]

        if variable is None:
            if self.variables is None:
                variable = self.get_variables()[0]
            else:
                variable = self.variables[0]

        # Renames variable name to SOURCE_variable; names that already carry
        # the source name are used unchanged, as in read_ts.
        if self.name not in variable:
            variable = self.name + '_' + variable

        source_file = self._get_nc_filepath(region)

        # get dekad of date:
        date = check_dekad(date)

        with Dataset(source_file, 'r', format='NETCDF4') as nc:
            time = nc.variables['time']
            datenum = date2num(date, units=time.units,
                               calendar=time.calendar)
            position = np.where(time[:] == datenum)[0][0]
            img = nc.variables[variable][position]
            lon = nc.variables['lon'][:]
            lat = nc.variables['lat'][:]

        return img, lon, lat

    def get_variables(self):
        """Gets all variables from source given in the NetCDF file.

        The NetCDF file of the first destination region is inspected.

        Returns
        -------
        variables : list of str
            Names of the variables in the NetCDF file that contain the name
            of this source.
        """
        nc_name = self._get_nc_filepath(self.dest_regions[0])
        nc_vars, _, _ = get_properties(nc_name)

        return [var for var in nc_vars if self.name in var]
# Running this module as a script does nothing; it is meant to be imported.
if __name__ == "__main__": pass