# --- # # # # Introductory Notes # Previously, OPSD chose to supply only a **documented methodological script** for weather data, given that hosting global meteorological datasets like MERRA-2 would not be feasible due to their size (variety of variables, very long timespan, huge geographical coverage etc.). # # We now also make country-aggregated weather data available directly obtained from the [Renewables.ninja project](https://www.renewables.ninja). However, we still provide the previous download script for reference if you want to download arbitrary MERRA-2 weather data directly. # # This script contains code that allows the download, subset and processing of [MERRA-2](http://gmao.gsfc.nasa.gov/reanalysis/MERRA-2/) datasets (provided by NASA Goddard Space Flight Center) and export them as CSV. The method describes one way to automatically obtain the desired weather data from the MERRA-2 database and simplifies resp. unifies alternative manual data obtaining methods in a single script. # ## How to use the script: # To download MERRA-2 data, you have to **register at NASA earth data portal:** # 1. Register an account at [https://urs.earthdata.nasa.gov/](https://urs.earthdata.nasa.gov/) # 2. Go to "My Applications" -> "Approve More Applications" and add _NASA GESDISC DATA ARCHIVE_ (scroll down list) # 3. Input your username and password when requested by the script # # _Hints:_ # * _Be aware that by registering you are "consenting to complete monitoring with no expectation of privacy"..._ # * _It seems that the routine sometimes has problems with usernames which include upper case letters - avoid them if you can._ # --- # # Script Setup # In[ ]: import pandas as pd import xarray as xr import numpy as np import requests import logging import yaml import json import os import hashlib import sqlalchemy from datetime import datetime from calendar import monthrange from opendap_download.multi_processing_download import DownloadManager import math from functools import partial import re import getpass from datetime import datetime, timedelta import dateutil.parser # Set up a log logging.basicConfig(level=logging.INFO) log = logging.getLogger('notebook') # --- # # Download raw data # ## Input # This part defines the input parameters according to the user and creates an URL that can download the desired MERRA-2 data via the OPeNDAP interface (see documentation notebook for information on OPeNDAP). # ### Timeframe # Definition of desired timespan the data is needed for. (only complete years) # In[ ]: # User input of timespan download_year = 2016 # Create the start date 2016-01-01 download_start_date = str(download_year) + '-01-01' # ### Geography coordinates # Definition of desired coordinates. The user has to input two corner coordinates # of a rectangular area (Format WGS84, decimal system). # * Northeast coordinate: lat_1, lon_1 # * Southwest coordinate: lat_2, lon_2 # # The area/coordinates will be converted from lat/lon to the MERRA-2 grid coordinates. # Since the resolution of the MERRA-2 grid is 0.5 x 0.625°, the given exact coordinates will # matched as close as possible. # In[ ]: # User input of coordinates # ------ # Example: Germany (lat/lon) # Northeastern point: 55.05917°N, 15.04361°E # Southwestern point: 47.27083°N, 5.86694°E # It is important to make the southwestern coordinate lat_1 and lon_1 since # the MERRA-2 portal requires it! # Southwestern coordinate # lat_1, lon_1 = 47.27083, 5.86694 Germany # Northeastern coordinate # lat_2, lon_2 = 55.05917, 15.04361 Germany # Southwestern coordinate lat_1, lon_1 = 47.27083, 5.86694 # Northeastern coordinate lat_2, lon_2 = 55.05917, 15.04361 def translate_lat_to_geos5_native(latitude): """ The source for this formula is in the MERRA2 Variable Details - File specifications for GEOS pdf file. The Grid in the documentation has points from 1 to 361 and 1 to 576. The MERRA-2 Portal uses 0 to 360 and 0 to 575. latitude: float Needs +/- instead of N/S """ return ((latitude + 90) / 0.5) def translate_lon_to_geos5_native(longitude): """See function above""" return ((longitude + 180) / 0.625) def find_closest_coordinate(calc_coord, coord_array): """ Since the resolution of the grid is 0.5 x 0.625, the 'real world' coordinates will not be matched 100% correctly. This function matches the coordinates as close as possible. """ # np.argmin() finds the smallest value in an array and returns its # index. np.abs() returns the absolute value of each item of an array. # To summarize, the function finds the difference closest to 0 and returns # its index. index = np.abs(coord_array-calc_coord).argmin() return coord_array[index] # The arrays contain the coordinates of the grid used by the API. # The values are from 0 to 360 and 0 to 575 lat_coords = np.arange(0, 361, dtype=int) lon_coords = np.arange(0, 576, dtype=int) # Translate the coordinates that define your area to grid coordinates. lat_coord_1 = translate_lat_to_geos5_native(lat_1) lon_coord_1 = translate_lon_to_geos5_native(lon_1) lat_coord_2 = translate_lat_to_geos5_native(lat_2) lon_coord_2 = translate_lon_to_geos5_native(lon_2) # Find the closest coordinate in the grid. lat_co_1_closest = find_closest_coordinate(lat_coord_1, lat_coords) lon_co_1_closest = find_closest_coordinate(lon_coord_1, lon_coords) lat_co_2_closest = find_closest_coordinate(lat_coord_2, lat_coords) lon_co_2_closest = find_closest_coordinate(lon_coord_2, lon_coords) # Check the precision of the grid coordinates. These coordinates are not lat/lon. # They are coordinates on the MERRA-2 grid. log.info('Calculated coordinates for point 1: ' + str((lat_coord_1, lon_coord_1))) log.info('Closest coordinates for point 1: ' + str((lat_co_1_closest, lon_co_1_closest))) log.info('Calculated coordinates for point 2: ' + str((lat_coord_2, lon_coord_2))) log.info('Closest coordinates for point 2: ' + str((lat_co_2_closest, lon_co_2_closest))) # ## Subsetting data # Combining parameter choices above/translation according to OPenDAP guidelines into URL-appendix # In[ ]: def translate_year_to_file_number(year): """ The file names consist of a number and a meta data string. The number changes over the years. 1980 until 1991 it is 100, 1992 until 2000 it is 200, 2001 until 2010 it is 300 and from 2011 until now it is 400. """ file_number = '' if year >= 1980 and year < 1992: file_number = '100' elif year >= 1992 and year < 2001: file_number = '200' elif year >= 2001 and year < 2011: file_number = '300' elif year >= 2011: file_number = '400' else: raise Exception('The specified year is out of range.') return file_number def generate_url_params(parameter, time_para, lat_para, lon_para): """Creates a string containing all the parameters in query form""" parameter = map(lambda x: x + time_para, parameter) parameter = map(lambda x: x + lat_para, parameter) parameter = map(lambda x: x + lon_para, parameter) return ','.join(parameter) def generate_download_links(download_years, base_url, dataset_name, url_params): """ Generates the links for the download. download_years: The years you want to download as array. dataset_name: The name of the data set. For example tavg1_2d_slv_Nx """ urls = [] for y in download_years: # build the file_number y_str = str(y) file_num = translate_year_to_file_number(download_year) for m in range(1,13): # build the month string: for the month 1 - 9 it starts with a leading 0. # zfill solves that problem m_str = str(m).zfill(2) # monthrange returns the first weekday and the number of days in a # month. Also works for leap years. _, nr_of_days = monthrange(y, m) for d in range(1,nr_of_days+1): d_str = str(d).zfill(2) # Create the file name string file_name = 'MERRA2_{num}.{name}.{y}{m}{d}.nc4'.format( num=file_num, name=dataset_name, y=y_str, m=m_str, d=d_str) # Create the query query = '{base}{y}/{m}/{name}.nc4?{params}'.format( base=base_url, y=y_str, m=m_str, name=file_name, params=url_params) urls.append(query) return urls requested_params = ['U2M', 'U10M', 'U50M', 'V2M', 'V10M', 'V50M', 'DISPH'] requested_time = '[0:1:23]' # Creates a string that looks like [start:1:end]. start and end are the lat or # lon coordinates define your area. requested_lat = '[{lat_1}:1:{lat_2}]'.format( lat_1=lat_co_1_closest, lat_2=lat_co_2_closest) requested_lon = '[{lon_1}:1:{lon_2}]'.format( lon_1=lon_co_1_closest, lon_2=lon_co_2_closest) parameter = generate_url_params(requested_params, requested_time, requested_lat, requested_lon) BASE_URL = 'https://goldsmr4.gesdisc.eosdis.nasa.gov/opendap/MERRA2/M2T1NXSLV.5.12.4/' generated_URL = generate_download_links([download_year], BASE_URL, 'tavg1_2d_slv_Nx', parameter) # See what a query to the MERRA-2 portal looks like. log.info(generated_URL[0]) # --- # # Downloading data # This part subsequently downloads the subsetted raw data from the MERRA-2-datasets. # The download process is outsourced from the notebook, because it is a standard and repetitive process. If you are interested in the the code, see the [opendap_download module](opendap_download/). # # _Note: Each of the following steps to download the data will take a few minutes, depending on the size of geographical area and amount of data (the total download routine e.G. for Germany takes roughly 70 minutes)._ # ## Get wind data # Parameters from the dataset [tavg1_2d_slv_Nx (M2T1NXSLV)](http://goldsmr4.sci.gsfc.nasa.gov/opendap/MERRA2/M2T1NXSLV.5.12.4/contents.html) # In[ ]: # Download data (one file per day and dataset) with links to local directory. # Username and password for MERRA-2 (NASA earthdata portal) username = input('Username: ') password = getpass.getpass('Password:') # The DownloadManager is able to download files. If you have a fast internet # connection, setting this to 2 is enough. If you have slow wifi, try setting # it to 4 or 5. If you download too fast, the data portal might ban you for a # day. NUMBER_OF_CONNECTIONS = 5 # The DownloadManager class is defined in the opendap_download module. download_manager = DownloadManager() download_manager.set_username_and_password(username, password) download_manager.download_path = 'download_wind' download_manager.download_urls = generated_URL # If you want to see the download progress, check the download folder you # specified get_ipython().run_line_magic('time', 'download_manager.start_download(NUMBER_OF_CONNECTIONS)') # Download time approx. 20+ min. # ## Get roughness data # Parameters from the dataset [tavg1_2d_rad_Nx (M2T1NXRAD)](https://goldsmr4.sci.gsfc.nasa.gov/opendap/MERRA2/M2T1NXRAD.5.12.4/contents.html) # In[ ]: # Roughness data is in a different data set. The parameter is called Z0M. roughness_para = generate_url_params(['Z0M'], requested_time, requested_lat, requested_lon) ROUGHNESS_BASE_URL = 'https://goldsmr4.gesdisc.eosdis.nasa.gov/opendap/MERRA2/M2T1NXFLX.5.12.4/' roughness_links = generate_download_links([download_year], ROUGHNESS_BASE_URL, 'tavg1_2d_flx_Nx', roughness_para) download_manager.download_path = 'download_roughness' download_manager.download_urls = roughness_links # If you want to see the download progress, check the download folder you # specified. get_ipython().run_line_magic('time', 'download_manager.start_download(NUMBER_OF_CONNECTIONS)') # Download time approx. 12+ min. # ## Get radiation data # Parameters from the dataset [tavg1_2d_flx_Nx (M2T1NXFLX)](http://goldsmr4.sci.gsfc.nasa.gov/opendap/MERRA2/M2T1NXFLX.5.12.4/contents.html) # In[ ]: # Parameters SWGDN and SWTDN radiation_para = generate_url_params(['SWGDN', 'SWTDN'], requested_time, requested_lat, requested_lon) RADIATION_BASE_URL = 'https://goldsmr4.gesdisc.eosdis.nasa.gov/opendap/MERRA2/M2T1NXRAD.5.12.4/' radiation_links = generate_download_links([download_year], RADIATION_BASE_URL, 'tavg1_2d_rad_Nx', radiation_para) download_manager.download_path = 'download_radiation' download_manager.download_urls = radiation_links get_ipython().run_line_magic('time', 'download_manager.start_download(NUMBER_OF_CONNECTIONS)') # Download time approx. 8+ min. # ## Get temperature data # Parameters from the dataset [tavg1_2d_slv_Nx (M2T1NXSLV)](http://goldsmr4.sci.gsfc.nasa.gov/opendap/MERRA2/M2T1NXSLV.5.12.4/contents.html) # In[ ]: # Parameter T2M (i.e. the temperature 2 meters above displacement height) temperature_para = generate_url_params(['T2M'], requested_time, requested_lat, requested_lon) TEMPERATURE_BASE_URL = 'https://goldsmr4.gesdisc.eosdis.nasa.gov/opendap/MERRA2/M2T1NXSLV.5.12.4/' temperature_links = generate_download_links([download_year], TEMPERATURE_BASE_URL, 'tavg1_2d_slv_Nx', temperature_para) download_manager.download_path = 'download_temperature' download_manager.download_urls = temperature_links get_ipython().run_line_magic('time', 'download_manager.start_download(NUMBER_OF_CONNECTIONS)') # Download time approx. 13+ min. # ## Get air density data # Parameters from the dataset [tavg1_2d_flx_Nx (M2T1NXFLX)](http://goldsmr4.sci.gsfc.nasa.gov/opendap/MERRA2/M2T1NXFLX.5.12.4/contents.html) # In[ ]: # Parameter RHOA density_para = generate_url_params(['RHOA'], requested_time, requested_lat, requested_lon) DENSITY_BASE_URL = 'https://goldsmr4.gesdisc.eosdis.nasa.gov/opendap/MERRA2/M2T1NXFLX.5.12.4/' density_links = generate_download_links([download_year], DENSITY_BASE_URL, 'tavg1_2d_flx_Nx', density_para) download_manager.download_path = 'download_density' download_manager.download_urls = density_links get_ipython().run_line_magic('time', 'download_manager.start_download(NUMBER_OF_CONNECTIONS)') # Download time approx. 13+ min. # ## Get air pressure data # Parameters from the dataset [tavg1_2d_slv_Nx (M2T1NXSLV)](http://goldsmr4.sci.gsfc.nasa.gov/opendap/MERRA2/M2T1NXSLV.5.12.4/contents.html) # In[ ]: # Parameters PS pressure_para = generate_url_params(['PS'], requested_time, requested_lat, requested_lon) PRESSURE_BASE_URL = 'https://goldsmr4.gesdisc.eosdis.nasa.gov/opendap/MERRA2/M2T1NXSLV.5.12.4/' pressure_links = generate_download_links([download_year], PRESSURE_BASE_URL, 'tavg1_2d_slv_Nx', pressure_para) download_manager.download_path = 'download_pressure' download_manager.download_urls = pressure_links get_ipython().run_line_magic('time', 'download_manager.start_download(NUMBER_OF_CONNECTIONS)') # Download time approx. 15+ min. # ## Get lat and lon dimensions # For now, the dataset only has MERRA-2 grid coordinates. To translate the points # back to "real world" coordinates, the data portal offers a dimension scale file. # In[ ]: # The dimensions map the MERRA2 grid coordinates to lat/lon. The coordinates # to request are 0:360 wheare as the other coordinates are 1:361 requested_lat_dim = '[{lat_1}:1:{lat_2}]'.format( lat_1=lat_co_1_closest, lat_2=lat_co_2_closest) requested_lon_dim = '[{lon_1}:1:{lon_2}]'.format( lon_1=lon_co_1_closest , lon_2=lon_co_2_closest ) lat_lon_dimension_para = 'lat' + requested_lat_dim + ',lon' + requested_lon_dim # Creating the download url. dimension_url = 'https://goldsmr4.gesdisc.eosdis.nasa.gov/opendap/MERRA2/M2T1NXSLV.5.12.4/2014/01/MERRA2_400.tavg1_2d_slv_Nx.20140101.nc4.nc4?' dimension_url = dimension_url + lat_lon_dimension_para download_manager.download_path = 'dimension_scale' download_manager.download_urls = [dimension_url] # Since the dimension is only one file, we only need one connection. get_ipython().run_line_magic('time', 'download_manager.start_download(1)') # ## Check the precision of the downloaded data # Due to the back and forth conversion from "real world" coordinates to MERRA-2 grid points, # this part helps you to check if the conversion was precise enough. # In[ ]: file_path = os.path.join('dimension_scale', DownloadManager.get_filename( dimension_url)) with xr.open_dataset(file_path) as ds_dim: df_dim = ds_dim.to_dataframe() lat_array = ds_dim['lat'].data.tolist() lon_array = ds_dim['lon'].data.tolist() # The log output helps evaluating the precision of the received data. log.info('Requested lat: ' + str((lat_1, lat_2))) log.info('Received lat: ' + str(lat_array)) log.info('Requested lon: ' + str((lon_1, lon_2))) log.info('Received lon: ' + str(lon_array)) # --- # # Setting up the DataFrame # This part sets up a DataFrame and reads the raw data into it. First the wind data and adding the remaining data afterwards. # In[ ]: def extract_date(data_set): """ Extracts the date from the filename before merging the datasets. """ try: # The attribute name changed during the development of this script # from HDF5_Global.Filename to Filename. if 'HDF5_GLOBAL.Filename' in data_set.attrs: f_name = data_set.attrs['HDF5_GLOBAL.Filename'] elif 'Filename' in data_set.attrs: f_name = data_set.attrs['Filename'] else: raise AttributeError('The attribute name has changed again!') # find a match between "." and ".nc4" that does not have "." . exp = r'(?<=\.)[^\.]*(?=\.nc4)' res = re.search(exp, f_name).group(0) # Extract the date. y, m, d = res[0:4], res[4:6], res[6:8] date_str = ('%s-%s-%s' % (y, m, d)) data_set = data_set.assign(date=date_str) return data_set except KeyError: # The last dataset is the one all the other sets will be merged into. # Therefore, no date can be extracted. return data_set file_path = os.path.join('download_wind', '*.nc4') try: with xr.open_mfdataset(file_path, concat_dim='date', preprocess=extract_date) as ds_wind: print(ds_wind) df_wind = ds_wind.to_dataframe() except xr.MergeError as e: print(e) # In[ ]: df_wind.reset_index(inplace=True) # In[ ]: start_date = datetime.strptime(download_start_date, '%Y-%m-%d') def calculate_datetime(d_frame): """ Calculates the accumulated hour based on the date. """ cur_date = datetime.strptime(d_frame['date'], '%Y-%m-%d') hour = int(d_frame['time']) delta = abs(cur_date - start_date).days date_time_value = (delta * 24) + (hour) return date_time_value df_wind['date_time_hours'] = df_wind.apply(calculate_datetime, axis=1) df_wind # ## Converting the timeformat to ISO 8601 # In[ ]: def converting_timeformat_to_ISO8601(row): """Generates datetime according to ISO 8601 (UTC)""" date = dateutil.parser.parse(row['date']) hour = int(row['time']) # timedelta from the datetime module enables the programmer # to add time to a date. date_time = date + timedelta(hours = hour) return str(date_time.isoformat()) + 'Z' # MERRA2 datasets have UTC time zone. df_wind['date_utc'] = df_wind.apply(converting_timeformat_to_ISO8601, axis=1) df_wind['date_utc'] # execution time approx. 3+ min # ## Converting wind vectors to wind speed # This part uses the given wind vectors in the MERRA-2 original data to calculate a wind speed (vector addition). # In[ ]: def calculate_windspeed(d_frame, idx_u, idx_v): """ Calculates the windspeed. The returned unit is m/s """ um = float(d_frame[idx_u]) vm = float(d_frame[idx_v]) speed = math.sqrt((um ** 2) + (vm ** 2)) return round(speed, 2) # partial is used to create a function with pre-set arguments. calc_windspeed_2m = partial(calculate_windspeed, idx_u='U2M', idx_v='V2M') calc_windspeed_10m = partial(calculate_windspeed, idx_u='U10M', idx_v='V10M') calc_windspeed_50m = partial(calculate_windspeed, idx_u='U50M', idx_v='V50M') df_wind['v_2m'] = df_wind.apply(calc_windspeed_2m, axis=1) df_wind['v_10m']= df_wind.apply(calc_windspeed_10m, axis=1) df_wind['v_50m'] = df_wind.apply(calc_windspeed_50m, axis=1) df_wind # execution time approx. 3 min # ## Setting up data Frame for roughness, radiation, temperature and air parameters # In[ ]: file_path = os.path.join('download_roughness', '*.nc4') with xr.open_mfdataset(file_path, concat_dim='date', preprocess=extract_date) as ds_rough: df_rough = ds_rough.to_dataframe() df_rough.reset_index(inplace=True) # In[ ]: file_path = os.path.join('download_radiation', '*.nc4') try: with xr.open_mfdataset(file_path, concat_dim='date', preprocess=extract_date) as ds_rad: print(ds_rad) df_rad = ds_rad.to_dataframe() except xr.MergeError as e: print(e) df_rad.reset_index(inplace=True) # In[ ]: file_path = os.path.join('download_temperature', '*.nc4') try: with xr.open_mfdataset(file_path, concat_dim='date', preprocess=extract_date) as ds_temp: print(ds_temp) df_temp = ds_temp.to_dataframe() except xr.MergeError as e: print(e) df_temp.reset_index(inplace=True) # In[ ]: file_path = os.path.join('download_density', '*.nc4') try: with xr.open_mfdataset(file_path, concat_dim='date', preprocess=extract_date) as ds_dens: print(ds_dens) df_dens = ds_dens.to_dataframe() except xr.MergeError as e: print(e) df_dens.reset_index(inplace=True) # In[ ]: file_path = os.path.join('download_pressure', '*.nc4') try: with xr.open_mfdataset(file_path, concat_dim='date', preprocess=extract_date) as ds_pres: print(ds_pres) df_pres = ds_pres.to_dataframe() except xr.MergeError as e: print(e) df_pres.reset_index(inplace=True) # ## Combining the data Frames # In[ ]: df = pd.merge(df_wind, df_rough, on=['date', 'lat', 'lon', 'time']) df = pd.merge(df, df_rad, on=['date', 'lat', 'lon', 'time']) df = pd.merge(df, df_temp, on=['date', 'lat', 'lon', 'time']) df = pd.merge(df, df_dens, on=['date', 'lat', 'lon', 'time']) df = pd.merge(df, df_pres, on=['date', 'lat', 'lon', 'time']) df.info() # --- # # Structure the dataframe, add and remove columns # ## Calculating the displacement height # The so-called "displacement height" is the height # > _"[...] at which zero wind speed is achieved as a result of flow obstacles such as trees or buildings. It is generally approximated as 2/3 of the average height of the obstacles. For example, if estimating winds over a forest canopy of height h = 30 m, the zero-plane displacement would be d = 20 m."_ ([Source](https://en.wikipedia.org/wiki/Log_wind_profile#Definition)) # In[ ]: # Calculate height for h1 (displacement height +2m) and h2 (displacement height # +10m). df['h1'] = df.apply((lambda x:int(x['DISPH']) + 2), axis=1) df['h2'] = df.apply((lambda x:int(x['DISPH']) + 10), axis=1) # ## Adding needed and removing not needed columns # In[ ]: df.drop('DISPH', axis=1, inplace=True) df.drop(['time', 'date'], axis=1, inplace=True) df.drop(['U2M', 'U10M', 'U50M', 'V2M', 'V10M', 'V50M'], axis=1, inplace=True) df['lat'] = df['lat'].apply(lambda x: lat_array[int(x)]) df['lon'] = df['lon'].apply(lambda x: lon_array[int(x)]) # ## Renaming and sorting columns # In[ ]: rename_map = {'date_time_hours': 'cumulated hours', 'date_utc': 'timestamp', 'v_2m': 'v1', 'v_10m': 'v2', 'Z0M': 'z0', 'T2M': 'T', 'RHOA': 'rho', 'PS': 'p' } df.rename(columns=rename_map, inplace=True) # In[ ]: # Change order of the columns columns = ['timestamp', 'cumulated hours', 'lat', 'lon', 'v1', 'v2', 'v_50m', 'h1', 'h2', 'z0', 'SWTDN', 'SWGDN', 'T', 'rho', 'p'] df = df[columns] # ## First look at the final data Frame # In[ ]: df.info() # In[ ]: df # ## Save as CSV # In[ ]: df.to_csv('weather_data_GER_2016.csv', index=False)